Compare commits

..

58 Commits
0.7 ... master

Author SHA1 Message Date
Jan Bader
7c95ec57d7 Ignore tmp 2024-10-04 14:33:20 +02:00
Jan Bader
c72fbdd086 Improve display for progress 2024-10-04 14:32:59 +02:00
Jan Bader
ce574a0b4a Ignore test_data 2024-10-04 14:32:45 +02:00
Jan Bader
ceab693f34 Fix condition for adding a cluster 2023-12-10 21:51:02 +01:00
Jan Bader
eb6e492251 Improve alignment 2023-12-10 19:01:20 +01:00
Jan Bader
43d2ced820 Add untracked files 2023-12-09 15:10:35 +01:00
Jan Bader
c667707704 Remove empty lines from struct 2023-12-09 15:06:47 +01:00
Jan Bader
e314e89657 Extract clustering.go 2023-12-09 15:05:55 +01:00
Jan Bader
66da8393a4 Fix check being off 2023-12-09 15:02:41 +01:00
Jan Bader
9ce2e8a927 Remove unique files from list before handling dupes 2023-12-09 15:01:54 +01:00
Jan Bader
c6f4c8bd89 Extract promptForDeletion 2023-12-09 14:57:32 +01:00
Jan Bader
1a9e17de10 Extract getImageClusters 2023-12-09 14:50:29 +01:00
Jan Bader
a6c978eaee Improve output of errors 2023-12-09 14:39:02 +01:00
Jan Bader
c618bc88fc Update units for mpb v8 2023-12-06 22:53:50 +01:00
Jan Bader
c57d6cea68 Update go mod 2023-12-06 22:45:06 +01:00
Jan Bader
303833d06f Update go mod 2023-12-06 22:42:44 +01:00
Jan Bader
587f904ebc Reenable commented logic 2023-12-06 16:51:05 +01:00
Jan Bader
f8564f20ac Also list binary identical files 2023-12-06 16:50:31 +01:00
Jan Bader
65ce046585 Remove handled images from slice 2023-12-06 16:43:27 +01:00
Jan Bader
6f5eb6a1ca Also show image hashing bar 2023-12-06 16:37:25 +01:00
Jan Bader
a2b5d2e224 Fix file hashing bar being used for images 2023-12-06 16:33:17 +01:00
Jan Bader
ecaddb7f73 Always increase countFiles 2023-12-06 16:28:32 +01:00
Jan Bader
bfbd6de40b Remove handled images from slice 2023-12-06 16:21:13 +01:00
Jan Bader
ccd2ea8fcd Also close images hashed channel 2023-12-06 16:16:09 +01:00
Jan Bader
f4872c95d1 Also append Images to list 2023-12-06 16:15:30 +01:00
Jan Bader
a66b84a545 Change countFiles to int64 2023-12-06 16:15:19 +01:00
Jan Bader
f6c33a3b5d Also close images hashing channel 2023-12-06 16:14:53 +01:00
Jan Bader
c5186d6ae2 Remove logging of hash 2023-12-06 16:14:35 +01:00
Jan Bader
fc2d1c0cb5 Try to improve hashing and progress 2023-12-06 16:07:24 +01:00
Jan Bader
f4f827b3e4 Reintroduce regular file handling 2023-12-06 15:47:49 +01:00
Jan Bader
6059baeeeb Print distance with duplicates 2023-12-06 15:08:07 +01:00
Jan Bader
eb25a625cb Group by hamming distance 2023-12-06 15:02:29 +01:00
Jan Bader
bbdc296cbd Calculate dhash for jpgs 2023-12-04 22:59:55 +01:00
Jan Bader
b58151efb7 Update to go 1.21 2023-12-04 22:25:50 +01:00
Jan Bader
c535c3d050
Update README.md 2023-07-03 09:17:45 +02:00
c090b6645e Initialize bars in main 2021-08-06 00:18:59 +02:00
1144e97045 Remove IncomingWorker 2021-08-05 23:58:23 +02:00
c885c03130 Fix filemode not being octal 2021-08-05 23:51:38 +02:00
4ea8dfd7ee Remove unneeded check in map 2021-08-05 23:51:29 +02:00
425a87071d Fix format string 2021-08-05 23:51:12 +02:00
dda06924f1 Check minSize earlier 2021-08-05 23:50:57 +02:00
fda00ec0b8 Use mpb v7 2021-08-05 23:50:45 +02:00
31383ad118 Use other progressbar 2021-08-05 23:21:46 +02:00
275b63cfe8 Initialize go.mod 2021-08-05 23:21:22 +02:00
8f0f32d5ee Add progress bar for initial walk 2020-11-26 14:52:33 +01:00
3c3f1d747b Handle all files of set being insider or outside the deleteDupesIn-Folder 2020-11-22 16:35:00 +01:00
66a9ae73e5 Display statistics 2020-11-22 01:57:24 +01:00
594a88c3ec Add Flag to enable cpu profiling 2020-11-22 01:23:07 +01:00
29fa093184 Use SHA1 for better performance 2020-11-22 01:22:52 +01:00
ff2d4daeda Only remember last file 2020-11-22 01:09:02 +01:00
e33d7e2ca0 Add WaitGroup for HashingWorker 2020-11-22 01:02:07 +01:00
a3fa3d4e7c Parallel 2020-11-22 00:55:12 +01:00
8007b5686d Rename variable 2020-11-21 23:13:44 +01:00
87c8a6e817 Extract filesmap to own file 2020-11-21 22:14:42 +01:00
8a9bcbf62e Move funcs to other file 2020-11-21 22:13:34 +01:00
f16a143125 Do not export delete-methods 2020-11-21 22:11:59 +01:00
Jan Bader
903909de77
Add note about releases 2020-11-21 22:05:06 +01:00
Jan Bader
09a4dc8660
Fix code blocks in README 2020-11-21 22:03:14 +01:00
9 changed files with 538 additions and 188 deletions

2
.gitignore vendored
View File

@ -1 +1,3 @@
*.exe
test_data/
tmp/

View File

@ -3,12 +3,14 @@ Because I couldn't find a good program for my usecase, I wrote this simple Go pr
## Installation
If you have go installed, the easiest way to install is `go get`:
If you have go installed, the easiest way to install and update is `go get`:
```
go get "github.com/JaCoB1123/dupe-finder"
go get -u "github.com/JaCoB1123/dupe-finder"
```
Otherwise you can download the latest binary from the [releases](https://github.com/JaCoB1123/dupe-finder/releases) page.
## Usage
dupe-finder supports the following options:
@ -34,15 +36,15 @@ dupe-finder supports the following options:
Find all duplicate files in `~/` and save the results to `dupes.json`
```
> dupe-finder --to-file dupes.json ~/
``̀`
```
Load previous results from `dupes.json` and delete all duplicates located in ~/.cache
```
> dupe-finder --from-file dupes.json --delete-dupes-in ~/.cache
``̀`
```
Find all duplicate files in `~/' and `/mnt/EXT`. Prompt which file to keep for each set of duplicates and move the others to /dupes/.
Find all duplicate files in `~/` and `/mnt/EXT`. Prompt which file to keep for each set of duplicates and move the others to /dupes/.
```
> dupe-finder --delete-prompt --move-files /dupes/ ~/ /mnt/EXT
``̀`
```

46
clustering.go Normal file
View File

@ -0,0 +1,46 @@
package main
import (
"slices"
"github.com/steakknife/hamming"
)
// imageCluster is one group of images that were found to be
// perceptually similar to each other.
type imageCluster struct {
	images []similarImage
}

// similarImage is a single member of an imageCluster: the file path
// plus its hamming distance to the cluster's reference (first) image.
// The reference image itself is stored with the zero distance.
type similarImage struct {
	path string
	distance int
}
// getImageClusters partitions fm.Images into clusters of perceptually
// similar images. The first remaining image acts as the reference of a
// new cluster; every other image within hamming distance 5 of that
// reference is pulled into the cluster. Clusters with a single member
// are discarded. fm.Images is consumed (empty afterwards).
func (fm *FilesMap) getImageClusters() []imageCluster {
	var clusters []imageCluster

	for len(fm.Images) > 0 {
		// Take the first image as the cluster reference.
		reference := fm.Images[0]
		fm.Images = slices.Delete(fm.Images, 0, 1)

		cluster := []similarImage{{path: reference.path}}

		// Walk backwards so removing an element never skips the next one.
		for i := len(fm.Images) - 1; i >= 0; i-- {
			candidate := fm.Images[i]
			dist := hamming.Uint64(reference.imageHash, candidate.imageHash)
			if dist > 5 {
				continue
			}

			fm.Images = slices.Delete(fm.Images, i, i+1)
			cluster = append(cluster, similarImage{path: candidate.path, distance: dist})
		}

		// A lone reference with no neighbours is not a duplicate set.
		if len(cluster) > 1 {
			clusters = append(clusters, imageCluster{images: cluster})
		}
	}

	return clusters
}

82
file.go Normal file
View File

@ -0,0 +1,82 @@
package main
import (
"crypto/sha1"
"encoding/base64"
"image"
_ "image/jpeg"
_ "image/png"
"io"
"os"
"path/filepath"
"strconv"
"github.com/corona10/goimagehash"
)
// remove deletes path, honoring the global flags: it is a no-op unless
// -force is set, and when -move-files is given the file is relocated
// into that folder instead of being deleted.
func remove(path string) {
	if !*force {
		// Dry run: without -force nothing is touched.
		return
	}

	if target := *moveToFolder; target != "" {
		moveButDontOvewrite(path, target)
		return
	}

	os.Remove(path)
}
func moveButDontOvewrite(path string, targetPath string) {
num := 0
filename := filepath.Base(path)
target := filepath.Join(targetPath, filename)
for {
_, err := os.Stat(target)
if os.IsNotExist(err) {
os.Rename(path, target)
return
}
target = filepath.Join(targetPath, filename+"."+strconv.Itoa(num))
num++
}
}
// calculateImageHash decodes the image at path and returns its 64-bit
// perceptual difference hash. Decode failures (including non-image
// files, which fail with image.ErrFormat) are returned to the caller.
func calculateImageHash(path string) (uint64, error) {
	file, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	// Registered decoders (jpeg, png via blank imports) handle the format.
	decoded, _, err := image.Decode(file)
	if err != nil {
		return 0, err
	}

	dhash, err := goimagehash.DifferenceHash(decoded)
	if err != nil {
		return 0, err
	}

	return dhash.GetHash(), nil
}
func calculateFileHash(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
h := sha1.New()
if _, err := io.Copy(h, f); err != nil {
return "", err
}
stringHash := base64.RawStdEncoding.EncodeToString(h.Sum(nil))
return stringHash, nil
}

178
filesmap.go Normal file
View File

@ -0,0 +1,178 @@
package main
import (
"errors"
"flag"
"fmt"
"image"
"os"
"path/filepath"
"sync"
"github.com/vbauerster/mpb/v8"
)
// FilesMap is a struct for listing files by Size and Hash to search for duplicates
type FilesMap struct {
	// Images collects every successfully perceptual-hashed image;
	// consumed later by getImageClusters.
	Images []imageEntry

	// FilesBySize maps a size to the first file seen with that size;
	// "" marks sizes whose files were already queued for hashing.
	FilesBySize map[int64]string

	// FilesByHash groups paths by content hash; entries with more than
	// one path are sets of binary-identical duplicates.
	FilesByHash map[string][]string

	// Work queues: *Hashing feed the workers, *Hashed feed HashedWorker.
	FilesHashing chan fileEntry
	FilesHashed chan fileEntry
	ImagesHashing chan imageEntry
	ImagesHashed chan imageEntry

	// Progress bars for the directory walk and the two hashing stages.
	progress *mpb.Progress
	incomingBar *mpb.Bar
	fileHashingBar *mpb.Bar
	imageHashingBar *mpb.Bar

	// lock guards FilesByHash and Images, written by HashedWorker.
	lock sync.Mutex
}
// newFilesMap builds a FilesMap with all maps, channels and the
// progress container initialized, ready for the hashing workers.
func newFilesMap() *FilesMap {
	fm := &FilesMap{
		FilesBySize: make(map[int64]string),
		FilesByHash: make(map[string][]string),
		// The *Hashing channels are unbuffered so the directory walk
		// applies backpressure to the workers; the *Hashed channels are
		// buffered generously so workers rarely block on the collector.
		FilesHashing:  make(chan fileEntry),
		FilesHashed:   make(chan fileEntry, 100000),
		ImagesHashing: make(chan imageEntry),
		ImagesHashed:  make(chan imageEntry, 100000),
	}
	fm.progress = mpb.New(mpb.WithWidth(64))
	return fm
}
// FileHashingWorker consumes paths from FilesHashing, computes the
// SHA-1 content hash of each file and forwards the completed entry to
// FilesHashed. Entries that fail to hash are reported and dropped.
// Calls wg.Done once FilesHashing is closed and drained.
func (fm *FilesMap) FileHashingWorker(wg *sync.WaitGroup) {
	for file := range fm.FilesHashing {
		if *verbose {
			fmt.Fprintf(fm.progress, "Hashing file %s\n", file.path)
		}

		hash, err := calculateFileHash(file.path)
		fm.fileHashingBar.IncrInt64(file.size)
		if err != nil {
			fmt.Fprintf(fm.progress, "Error calculating Hash for file %s: %v\n", file.path, err)
			continue
		}

		// Set the hash BEFORE sending: file is a value, so sending it
		// first (as the old code did) delivered an empty hash and made
		// every file collapse into the "" bucket of FilesByHash. This
		// also matches ImageHashingWorker, which only forwards entries
		// that hashed successfully.
		file.hash = hash
		fm.FilesHashed <- file
	}
	wg.Done()
}
// ImageHashingWorker consumes entries from ImagesHashing, computes a
// perceptual difference hash for each decodable image and forwards the
// result to ImagesHashed. Files that are not images (decode fails with
// image.ErrFormat) are skipped silently; other errors are reported.
// Calls wg.Done once ImagesHashing is closed and drained.
func (fm *FilesMap) ImageHashingWorker(wg *sync.WaitGroup) {
	defer wg.Done()

	for entry := range fm.ImagesHashing {
		if *verbose {
			fmt.Fprintf(fm.progress, "Hashing image %s\n", entry.path)
		}

		hash, err := calculateImageHash(entry.path)
		fm.imageHashingBar.IncrInt64(entry.size)
		switch {
		case errors.Is(err, image.ErrFormat):
			// Not an image at all; nothing worth reporting.
			continue
		case err != nil:
			fmt.Fprintf(fm.progress, "Error calculating Hash for image %s: %v\n", entry.path, err)
			continue
		}

		entry.imageHash = hash
		fm.ImagesHashed <- entry
	}
}
// HashedWorker collects results from both FilesHashed and ImagesHashed
// until both channels are closed, then signals done.
//
// The channels must be drained concurrently: the previous version
// ranged over FilesHashed to completion before reading ImagesHashed at
// all, so once the ImagesHashed buffer filled up the image workers
// blocked on their send, wg.Wait() in main never returned, the channels
// were never closed, and the whole program deadlocked.
func (fm *FilesMap) HashedWorker(done chan bool) {
	files, images := fm.FilesHashed, fm.ImagesHashed

	// A receive on a nil channel blocks forever, so nil-ing a finished
	// channel removes it from the select.
	for files != nil || images != nil {
		select {
		case file, ok := <-files:
			if !ok {
				files = nil
				continue
			}
			if *verbose {
				fmt.Println("Finishing", file.path)
			}
			fm.lock.Lock()
			fm.FilesByHash[file.hash] = append(fm.FilesByHash[file.hash], file.path)
			fm.lock.Unlock()
		case img, ok := <-images:
			if !ok {
				images = nil
				continue
			}
			if *verbose {
				fmt.Println("Finishing", img.path)
			}
			fm.lock.Lock()
			fm.Images = append(fm.Images, img)
			fm.lock.Unlock()
		}
	}

	done <- true
}
// WalkDirectories walks every directory given on the command line,
// feeding each file that passes the -min-size filter into both the
// image and the file hashing pipelines. It closes both *Hashing
// channels when the walk is finished and returns the number of files
// queued.
func (fm *FilesMap) WalkDirectories() int64 {
	var countFiles int64
	var sumSize int64

	for _, dir := range flag.Args() {
		filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
			// Skip unreadable entries: when err is non-nil the info
			// argument may be nil, and the old code dereferenced it
			// unconditionally, panicking on permission errors.
			if err != nil || info == nil || info.IsDir() {
				return nil
			}

			size := info.Size()
			if *minSize > size {
				return nil
			}

			countFiles++
			fm.incomingBar.SetTotal(countFiles, false)
			fm.hashImage(path, size)

			count := fm.hashFile(path, size)
			if count > 0 {
				sumSize += size * count
				fm.fileHashingBar.SetTotal(sumSize, false)
			}
			return nil
		})
	}

	// Finalize the spinner and tell the workers no more input is coming.
	fm.incomingBar.SetTotal(countFiles, true)
	close(fm.FilesHashing)
	close(fm.ImagesHashing)

	return countFiles
}
// hashFile decides whether path needs a content hash. Hashing is
// deferred until a second file of the same size appears: the first
// file of a size is only remembered; when a second one shows up both
// are queued and the size is marked with "". Returns how many files
// were queued this call (0, 1 or 2).
func (fm *FilesMap) hashFile(path string, size int64) int64 {
	prevFile, seen := fm.FilesBySize[size]
	if !seen {
		// First file with this size: remember it, hash nothing yet.
		fm.FilesBySize[size] = path
		return 0
	}

	// Mark the size as "already queued" so later files of the same
	// size only enqueue themselves.
	fm.FilesBySize[size] = ""
	fm.incomingBar.Increment()
	if *verbose {
		fmt.Println("Incoming", path)
	}

	fm.FilesHashing <- fileEntry{path, size, ""}
	if prevFile == "" {
		return 1
	}

	// Second file of this size: the deferred first file is queued too.
	fm.FilesHashing <- fileEntry{prevFile, size, ""}
	return 2
}
// hashImage queues every walked file for perceptual hashing; files
// that turn out not to be decodable images are dropped later by
// ImageHashingWorker, so no extension filtering happens here.
func (fm *FilesMap) hashImage(path string, size int64) {
	fm.ImagesHashing <- imageEntry{path, size, 0}
}
// imageEntry is one file travelling through the image-hashing
// pipeline; imageHash is filled in by ImageHashingWorker.
type imageEntry struct {
	path string
	size int64
	imageHash uint64
}

// fileEntry is one file travelling through the content-hashing
// pipeline; hash is filled in by FileHashingWorker.
type fileEntry struct {
	path string
	size int64
	hash string
}

18
go.mod Normal file
View File

@ -0,0 +1,18 @@
module github.com/JaCoB1123/dupe-finder
go 1.21
require (
github.com/corona10/goimagehash v1.1.0
github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3
github.com/vbauerster/mpb/v8 v8.7.0
)
require (
github.com/VividCortex/ewma v1.2.0 // indirect
github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect
github.com/rivo/uniseg v0.4.4 // indirect
golang.org/x/sys v0.15.0 // indirect
)

19
go.sum Normal file
View File

@ -0,0 +1,19 @@
github.com/VividCortex/ewma v1.2.0 h1:f58SaIzcDXrSy3kWaHNvuJgJ3Nmz59Zji6XoJR/q1ow=
github.com/VividCortex/ewma v1.2.0/go.mod h1:nz4BbCtbLyFDeC9SUHbtcT5644juEuWfUAUnGx7j5l4=
github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d h1:licZJFw2RwpHMqeKTCYkitsPqHNxTmd4SNR5r94FGM8=
github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d/go.mod h1:asat636LX7Bqt5lYEZ27JNDcqxfjdBQuJ/MM4CN/Lzo=
github.com/corona10/goimagehash v1.1.0 h1:teNMX/1e+Wn/AYSbLHX8mj+mF9r60R1kBeqE9MkoYwI=
github.com/corona10/goimagehash v1.1.0/go.mod h1:VkvE0mLn84L4aF8vCb6mafVajEb6QYMHl2ZJLn0mOGI=
github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U=
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6Oo2LfFZAehjjQMERAvZLEDnQ=
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3 h1:njlZPzLwU639dk2kqnCPPv+wNjq7Xb6EfUxe/oX0/NM=
github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3/go.mod h1:hpGUWaI9xL8pRQCTXQgocU38Qw1g0Us7n5PxxTwTCYU=
github.com/vbauerster/mpb/v8 v8.7.0 h1:n2LTGyol7qqNBcLQn8FL5Bga2O8CGF75OOYsJVFsfMg=
github.com/vbauerster/mpb/v8 v8.7.0/go.mod h1:0RgdqeTpu6cDbdWeSaDvEvfgm9O598rBnRZ09HKaV0k=
golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc=
golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=

318
main.go
View File

@ -2,17 +2,20 @@ package main
import (
"bufio"
"crypto/sha256"
"encoding/base64"
"encoding/json"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"os"
"path/filepath"
"strconv"
"runtime"
"runtime/pprof"
"strings"
"sync"
"github.com/vbauerster/mpb/v8"
"github.com/vbauerster/mpb/v8/decor"
)
var fromFile = flag.String("from-file", "", "Load results file from <path>")
@ -20,144 +23,174 @@ var toFile = flag.String("to-file", "", "Save results to <path>")
var deleteDupesIn = flag.String("delete-dupes-in", "", "Delete duplicates if they are contained in <path>")
var promptForDelete = flag.Bool("delete-prompt", false, "Ask which file to keep for each dupe-set")
var moveToFolder = flag.String("move-files", "", "Move files to <path> instead of deleting them")
var minSize = flag.Int64("min-size", -1, "Ignore all files smaller than <size> in Bytes")
var force = flag.Bool("force", false, "Actually delete files. Without this options, the files to be deleted are only printed")
var verbose = flag.Bool("verbose", false, "Output additional information")
func Delete(path string) {
if !*force {
return
}
if *moveToFolder == "" {
os.Remove(path)
return
}
MoveButDontOvewrite(path, *moveToFolder)
}
func MoveButDontOvewrite(path string, targetPath string) {
num := 0
filename := filepath.Base(path)
target := filepath.Join(targetPath, filename)
for {
_, err := os.Stat(target)
if os.IsNotExist(err) {
os.Rename(path, target)
return
}
target = filepath.Join(targetPath, filename+"."+strconv.Itoa(num))
num++
}
}
var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
func main() {
flag.Parse()
if *cpuprofile != "" {
f, err := os.Create(*cpuprofile)
if err != nil {
log.Fatal(err)
}
pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile()
}
if *verbose {
printConfiguration()
}
var countFiles int64 = 0
filesMap := newFilesMap()
if *fromFile != "" {
fmt.Println("Loading file", *fromFile)
byteValue, _ := ioutil.ReadFile(*fromFile)
err := json.Unmarshal(byteValue, &filesMap.FilesBySize)
err := json.Unmarshal(byteValue, &filesMap.FilesByHash)
if err != nil {
panic(err)
}
} else {
for _, path := range flag.Args() {
filepath.Walk(path, func(path string, info os.FileInfo, err error) error {
filesMap.Add(path, info)
return nil
})
filesMap.incomingBar = filesMap.progress.AddSpinner(0,
mpb.PrependDecorators(
decor.Name("Finding files "),
decor.Elapsed(decor.ET_STYLE_HHMMSS),
),
mpb.AppendDecorators(
decor.AverageSpeed(0, "%8.2f"),
decor.Name(" "),
decor.CurrentNoUnit("%5d"),
),
)
filesMap.fileHashingBar = filesMap.progress.AddBar(0,
mpb.PrependDecorators(
decor.Name("Hashing files "),
decor.Elapsed(decor.ET_STYLE_HHMMSS),
),
mpb.AppendDecorators(
decor.AverageSpeed(decor.SizeB1024(0), "%23.2f"),
decor.Name(" "),
decor.CurrentKibiByte("%5d"),
),
)
filesMap.imageHashingBar = filesMap.progress.AddBar(0,
mpb.PrependDecorators(
decor.Name("Hashing images "),
decor.Elapsed(decor.ET_STYLE_HHMMSS),
),
mpb.AppendDecorators(
decor.AverageSpeed(decor.SizeB1024(0), "%23.2f"),
decor.Name(" "),
decor.CurrentKibiByte("%5d"),
),
)
done := make(chan bool)
wg := sync.WaitGroup{}
for i := 0; i < runtime.GOMAXPROCS(0); i++ {
wg.Add(2)
go filesMap.ImageHashingWorker(&wg)
go filesMap.FileHashingWorker(&wg)
}
go filesMap.HashedWorker(done)
countFiles = filesMap.WalkDirectories()
wg.Wait()
close(filesMap.FilesHashed)
close(filesMap.ImagesHashed)
<-done
}
if *toFile != "" && *fromFile == "" {
json, _ := json.MarshalIndent(filesMap.FilesBySize, "", " ")
ioutil.WriteFile(*toFile, json, 644)
json, _ := json.MarshalIndent(filesMap.FilesByHash, "", " ")
ioutil.WriteFile(*toFile, json, 0644)
}
for hash, duplicateFiles := range filesMap.FilesByHash {
if len(duplicateFiles) > 1 {
continue
}
delete(filesMap.FilesByHash, hash)
}
if *deleteDupesIn != "" {
deleteIn := filepath.Clean(*deleteDupesIn)
for size := range filesMap.FilesBySize {
for hash := range filesMap.FilesBySize[size] {
duplicateFiles := filesMap.FilesBySize[size][hash]
if len(duplicateFiles) <= 1 {
continue
for hash := range filesMap.FilesByHash {
duplicateFiles := filesMap.FilesByHash[hash]
hasDupesInFolder := false
hasDupesOutsideFolder := false
for _, file := range duplicateFiles {
fileIsInFolder := strings.HasPrefix(filepath.Clean(file), deleteIn)
hasDupesOutsideFolder = hasDupesOutsideFolder || !fileIsInFolder
hasDupesInFolder = hasDupesInFolder || fileIsInFolder
}
if !hasDupesInFolder || !hasDupesOutsideFolder {
if !hasDupesOutsideFolder {
fmt.Println("Not deleting one of the following files, since all would be deleted")
}
if !hasDupesInFolder {
fmt.Println("Not deleting one of the following files, since none are in the selected directory")
}
for _, file := range duplicateFiles {
if strings.HasPrefix(filepath.Clean(file), deleteIn) {
fmt.Println("Would delete ", file)
if *force {
Delete(file)
}
fmt.Println("-", file)
}
fmt.Println()
continue
}
for _, file := range duplicateFiles {
if strings.HasPrefix(filepath.Clean(file), deleteIn) {
fmt.Println("Would delete ", file)
if *force {
remove(file)
}
}
}
}
} else if *promptForDelete {
reader := bufio.NewReader(os.Stdin)
for size := range filesMap.FilesBySize {
for hash := range filesMap.FilesBySize[size] {
duplicateFiles := filesMap.FilesBySize[size][hash]
if len(duplicateFiles) <= 1 {
continue
}
fmt.Print("\033[H\033[2J")
for i, file := range duplicateFiles {
fmt.Println(i+1, file)
}
fmt.Printf("Which file to keep? ")
input, err := reader.ReadString('\n')
if err != nil {
fmt.Println("Invalid input")
continue
}
input = strings.TrimRight(input, "\n\r")
intInput, err := strconv.Atoi(input)
if err != nil || intInput > len(duplicateFiles) || intInput < 1 {
fmt.Println("Invalid input")
continue
}
for i, file := range duplicateFiles {
if i+1 == intInput {
continue
}
if *force {
Delete(file)
}
}
}
for hash := range filesMap.FilesByHash {
duplicateFiles := filesMap.FilesByHash[hash]
promptForDeletion(reader, duplicateFiles)
}
} else {
for size := range filesMap.FilesBySize {
for hash := range filesMap.FilesBySize[size] {
duplicateFiles := filesMap.FilesBySize[size][hash]
if len(duplicateFiles) <= 1 {
continue
}
countInstances := 0
countDupeSets := 0
for _, file := range duplicateFiles {
fmt.Println(file)
}
fmt.Println()
fmt.Println("Files that are binary identical:")
for _, duplicateFiles := range filesMap.FilesByHash {
countDupeSets++
for _, file := range duplicateFiles {
countInstances++
fmt.Println(file)
}
fmt.Println()
}
fmt.Println("Images that are similar:")
imageClusters := filesMap.getImageClusters()
for _, cluster := range imageClusters {
countDupeSets++
for _, image := range cluster.images {
countInstances++
fmt.Println(image.path, image.distance)
}
fmt.Println()
}
fmt.Println("Statistics:")
fmt.Println(countFiles, "Files")
fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
fmt.Println(countInstances, "Duplicate Files")
fmt.Println(countDupeSets, "Duplicate Sets")
}
}
@ -174,82 +207,3 @@ func printConfiguration() {
fmt.Println()
fmt.Println()
}
// FilesMap is a struct for listing files by Size and Hash to search for duplicates
type FilesMap struct {
FilesBySize map[int64]map[string][]string
}
// Add a file to the Map and calculate hash on demand
func (fm *FilesMap) Add(path string, info os.FileInfo) error {
	// Directories carry no content to compare.
	if info.IsDir() {
		return nil
	}

	fileInfo := path
	filesByHash := fm.FilesBySize[info.Size()]

	// first file with same size
	// => create new map for size
	if filesByHash == nil {
		filesByHash = map[string][]string{}
		fm.FilesBySize[info.Size()] = filesByHash
		// Stored under the empty hash key: hashing is deferred until a
		// second file of the same size shows up.
		filesByHash[""] = []string{fileInfo}
		return nil
	}

	// second file with same size
	// => calculate hashes for all entries
	if _, hasEmptyHash := filesByHash[""]; hasEmptyHash {
		err := appendByFileHash(filesByHash, fileInfo)
		err2 := appendByFileHash(filesByHash, filesByHash[""][0])
		delete(filesByHash, "")
		if err != nil {
			return err
		}
		return err2
	}

	// for later files always append by hash
	return appendByFileHash(filesByHash, fileInfo)
}
// appendByFileHash hashes fileInfo's content and appends the path to
// the bucket for that hash, creating the bucket on first use.
func appendByFileHash(filesByHash map[string][]string, fileInfo string) error {
	hash, err := calculateHash(fileInfo)
	if err != nil {
		return err
	}

	if _, ok := filesByHash[hash]; ok {
		filesByHash[hash] = append(filesByHash[hash], fileInfo)
	} else {
		filesByHash[hash] = []string{fileInfo}
	}
	return nil
}
func newFilesMap() *FilesMap {
return &FilesMap{
FilesBySize: map[int64]map[string][]string{},
}
}
// calculateHash streams the file at path through SHA-256 and returns
// the digest encoded as raw (unpadded) standard base64.
func calculateHash(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	h := sha256.New()
	if _, err := io.Copy(h, f); err != nil {
		return "", err
	}

	return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil
}

49
ui.go Normal file
View File

@ -0,0 +1,49 @@
package main
import (
"bufio"
"fmt"
"strconv"
"strings"
)
// promptForDeletion lists one set of duplicate files, asks the user
// which one to keep (0 keeps the whole set), and removes the rest via
// remove(), which itself honors the -force and -move-files flags.
// Invalid input keeps every file.
func promptForDeletion(reader *bufio.Reader, files []string) {
	// Clear the terminal before presenting the set.
	fmt.Print("\033[H\033[2J")
	for index, file := range files {
		fmt.Println(index+1, file)
	}
	fmt.Println(0, "Keep all")
	fmt.Printf("Which file to keep? ")

	line, err := reader.ReadString('\n')
	if err != nil {
		fmt.Println("Invalid input")
		return
	}

	choice, err := strconv.Atoi(strings.TrimRight(line, "\n\r"))
	if err != nil {
		fmt.Println("Invalid input")
		return
	}
	if choice == 0 {
		// Keep every file in this set.
		return
	}
	if choice < 1 || choice > len(files) {
		fmt.Println("Invalid input")
		return
	}

	for index, file := range files {
		if index+1 == choice {
			continue
		}
		if *force {
			remove(file)
		}
	}
}