diff --git a/file.go b/file.go index 6df9a36..8aa2a08 100644 --- a/file.go +++ b/file.go @@ -1,10 +1,7 @@ package main import ( - "crypto/sha1" - "encoding/base64" "image/jpeg" - "io" "os" "path/filepath" "strconv" @@ -45,30 +42,31 @@ func moveButDontOvewrite(path string, targetPath string) { } } -func calculateHash(path string) (string, error) { +func calculateHash(path string) (uint64, error) { f, err := os.Open(path) if err != nil { - return "", err + return 0, err } defer f.Close() if strings.HasSuffix(path, ".jpg") { img, err := jpeg.Decode(f) if err != nil { - return "", err + return 0, err } hash, err := goimagehash.DifferenceHash(img) if err != nil { - return "", err + return 0, err } - return hash.ToString(), nil + return hash.GetHash(), nil } - h := sha1.New() - if _, err := io.Copy(h, f); err != nil { - return "", err - } + /* h := sha1.New() + if _, err := io.Copy(h, f); err != nil { + return 0, err + } - return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil + return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil*/ + return 0, nil } diff --git a/filesmap.go b/filesmap.go index eed8b27..8642a15 100644 --- a/filesmap.go +++ b/filesmap.go @@ -14,9 +14,7 @@ import ( // FilesMap is a struct for listing files by Size and Hash to search for duplicates type FilesMap struct { - FilesBySize map[int64]string - - FilesByHash map[string][]string + Files []fileEntry FilesHashing chan fileEntry @@ -33,8 +31,6 @@ type FilesMap struct { func newFilesMap() *FilesMap { return &FilesMap{ - FilesBySize: map[int64]string{}, - FilesByHash: map[string][]string{}, FilesHashed: make(chan fileEntry, 100000), FilesHashing: make(chan fileEntry), progress: mpb.New(mpb.WithWidth(64)), @@ -68,7 +64,7 @@ func (fm *FilesMap) HashedWorker(done chan bool) { } fm.lock.Lock() - fm.FilesByHash[file.hash] = append(fm.FilesByHash[file.hash], file.path) + fm.Files = append(fm.Files, file) fm.lock.Unlock() } @@ -100,22 +96,9 @@ func (fm *FilesMap) WalkDirectories() int { fmt.Println("Incoming", path) } - prevFile, ok := fm.FilesBySize[size] - if !ok { - fm.FilesBySize[size] = path - return nil - } - - if prevFile != "" { - sumSize += size - fm.FilesHashing <- fileEntry{prevFile, size, ""} - } - - fm.FilesBySize[size] = "" - sumSize += size fm.hashingBar.SetTotal(int64(sumSize), false) - fm.FilesHashing <- fileEntry{path, info.Size(), ""} + fm.FilesHashing <- fileEntry{path, info.Size(), 0} return nil }) } @@ -128,5 +111,5 @@ func (fm *FilesMap) WalkDirectories() int { type fileEntry struct { path string size int64 - hash string + hash uint64 } diff --git a/go.mod b/go.mod index 574656c..652f4ca 100644 --- a/go.mod +++ b/go.mod @@ -11,5 +11,6 @@ require ( github.com/mattn/go-runewidth v0.0.13 // indirect github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect github.com/rivo/uniseg v0.2.0 // indirect + github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3 // indirect golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c // indirect ) diff --git a/go.sum b/go.sum index a4f592a..d14baba 100644 --- a/go.sum +++ b/go.sum @@ -10,6 +10,8 @@ github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6 github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8= github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3 h1:njlZPzLwU639dk2kqnCPPv+wNjq7Xb6EfUxe/oX0/NM= +github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3/go.mod h1:hpGUWaI9xL8pRQCTXQgocU38Qw1g0Us7n5PxxTwTCYU= github.com/vbauerster/mpb/v7 v7.0.5 h1:/CQxyoPjdlON6kqqq3Uq3UUw5tFjuBCjOmLQYaYvBmM= github.com/vbauerster/mpb/v7 v7.0.5/go.mod h1:emzg+wTChQAdJgyrDatWRHxji2AnmCrAemByOURuvZs= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c h1:F1jZWGFhYfh0Ci55sIpILtKKK8p3i2/krTr0H1rg74I= diff --git a/main.go b/main.go index 539d569..0e9584c 100644 --- a/main.go +++ b/main.go @@ -1,20 +1,15 @@ package main import ( - "bufio" - "encoding/json" "flag" "fmt" - "io/ioutil" "log" "os" - "path/filepath" "runtime" "runtime/pprof" - "strconv" - "strings" "sync" + "github.com/steakknife/hamming" "github.com/vbauerster/mpb/v7" "github.com/vbauerster/mpb/v7/decor" ) @@ -48,11 +43,11 @@ func main() { countFiles := 0 filesMap := newFilesMap() if *fromFile != "" { - byteValue, _ := ioutil.ReadFile(*fromFile) - err := json.Unmarshal(byteValue, &filesMap.FilesByHash) - if err != nil { - panic(err) - } + // byteValue, _ := ioutil.ReadFile(*fromFile) + // err := json.Unmarshal(byteValue, &filesMap.FilesByHash) + // if err != nil { + // panic(err) + // } } else { filesMap.incomingBar = filesMap.progress.AddSpinner(0, mpb.PrependDecorators( @@ -92,108 +87,124 @@ func main() { } if *toFile != "" && *fromFile == "" { - json, _ := json.MarshalIndent(filesMap.FilesByHash, "", " ") - ioutil.WriteFile(*toFile, json, 0644) + // json, _ := json.MarshalIndent(filesMap.FilesByHash, "", " ") + // ioutil.WriteFile(*toFile, json, 0644) } if *deleteDupesIn != "" { - deleteIn := filepath.Clean(*deleteDupesIn) - for hash := range filesMap.FilesByHash { - duplicateFiles := filesMap.FilesByHash[hash] - if len(duplicateFiles) <= 1 { - continue - } - - hasDupesInFolder := false - hasDupesOutsideFolder := false - for _, file := range duplicateFiles { - fileIsInFolder := strings.HasPrefix(filepath.Clean(file), deleteIn) - hasDupesOutsideFolder = hasDupesOutsideFolder || !fileIsInFolder - hasDupesInFolder = hasDupesInFolder || fileIsInFolder - } - - if !hasDupesInFolder || !hasDupesOutsideFolder { - if !hasDupesOutsideFolder { - fmt.Println("Not deleting one of the following files, since all would be deleted") - } - if !hasDupesInFolder { - fmt.Println("Not deleting one of the following files, since none are in the selected directory") - } - - for _, file := range duplicateFiles { - fmt.Println("-", file) - } - fmt.Println() - continue - } - - for _, file := range duplicateFiles { - if strings.HasPrefix(filepath.Clean(file), deleteIn) { - fmt.Println("Would delete ", file) - if *force { - remove(file) + /* deleteIn := filepath.Clean(*deleteDupesIn) + for hash := range filesMap.FilesByHash { + duplicateFiles := filesMap.FilesByHash[hash] + if len(duplicateFiles) <= 1 { + continue } - } - } - } + + hasDupesInFolder := false + hasDupesOutsideFolder := false + for _, file := range duplicateFiles { + fileIsInFolder := strings.HasPrefix(filepath.Clean(file), deleteIn) + hasDupesOutsideFolder = hasDupesOutsideFolder || !fileIsInFolder + hasDupesInFolder = hasDupesInFolder || fileIsInFolder + } + + if !hasDupesInFolder || !hasDupesOutsideFolder { + if !hasDupesOutsideFolder { + fmt.Println("Not deleting one of the following files, since all would be deleted") + } + if !hasDupesInFolder { + fmt.Println("Not deleting one of the following files, since none are in the selected directory") + } + + for _, file := range duplicateFiles { + fmt.Println("-", file) + } + fmt.Println() + continue + } + + for _, file := range duplicateFiles { + if strings.HasPrefix(filepath.Clean(file), deleteIn) { + fmt.Println("Would delete ", file) + if *force { + remove(file) + } + } + } + }*/ } else if *promptForDelete { - reader := bufio.NewReader(os.Stdin) - for hash := range filesMap.FilesByHash { - duplicateFiles := filesMap.FilesByHash[hash] - if len(duplicateFiles) <= 1 { - continue - } + /* reader := bufio.NewReader(os.Stdin) + for hash := range filesMap.FilesByHash { + duplicateFiles := filesMap.FilesByHash[hash] + if len(duplicateFiles) <= 1 { + continue + } - fmt.Print("\033[H\033[2J") - for i, file := range duplicateFiles { - fmt.Println(i+1, file) - } + fmt.Print("\033[H\033[2J") + for i, file := range duplicateFiles { + fmt.Println(i+1, file) + } - fmt.Printf("Which file to keep? ") - input, err := reader.ReadString('\n') - if err != nil { - fmt.Println("Invalid input") - continue - } + fmt.Printf("Which file to keep? ") + input, err := reader.ReadString('\n') + if err != nil { + fmt.Println("Invalid input") + continue + } - input = strings.TrimRight(input, "\n\r") - intInput, err := strconv.Atoi(input) - if err != nil || intInput > len(duplicateFiles) || intInput < 1 { - fmt.Println("Invalid input") - continue - } + input = strings.TrimRight(input, "\n\r") + intInput, err := strconv.Atoi(input) + if err != nil || intInput > len(duplicateFiles) || intInput < 1 { + fmt.Println("Invalid input") + continue + } - for i, file := range duplicateFiles { - if i+1 == intInput { - continue - } + for i, file := range duplicateFiles { + if i+1 == intInput { + continue + } - if *force { - remove(file) - } - } - } + if *force { + remove(file) + } + } + }*/ } else { countInstances := 0 countDupeSets := 0 - for hash := range filesMap.FilesByHash { - duplicateFiles := filesMap.FilesByHash[hash] - if len(duplicateFiles) <= 1 { + + for fileIndex := range filesMap.Files { + var currentCluster []fileEntry + file := filesMap.Files[fileIndex] + currentCluster = append(currentCluster, filesMap.Files[fileIndex]) + for otherIndex := range filesMap.Files { + if fileIndex == otherIndex { + continue + } + otherFile := filesMap.Files[otherIndex] + var distance = hamming.Uint64(file.hash, otherFile.hash) + if distance > 5 { + continue + } + + currentCluster = append(currentCluster, otherFile) + } + + if len(currentCluster) <= 1 { continue } countDupeSets++ - for _, file := range duplicateFiles { + for _, file := range currentCluster { countInstances++ - fmt.Println(file) + fmt.Println(file.path) } fmt.Println() } fmt.Println("Statistics:") fmt.Println(countFiles, "Files") - fmt.Println(len(filesMap.FilesBySize), "Unique Sizes") - fmt.Println(len(filesMap.FilesByHash), "Unique Hashes") + // fmt.Println(len(filesMap.FilesBySize), "Unique Sizes") + // fmt.Println(len(filesMap.FilesByHash), "Unique Hashes") fmt.Println(countInstances, "Duplicate Files") fmt.Println(countDupeSets, "Duplicate Sets") }