From f4f827b3e4d22c47a6d0f3b9e2343e9891f2f4fa Mon Sep 17 00:00:00 2001
From: Jan Bader
Date: Wed, 6 Dec 2023 15:47:49 +0100
Subject: [PATCH] Reintroduce regular file handling

---
 file.go     | 54 +++++++++++++++++++++------------
 filesmap.go | 99 ++++++++++++++++++++++++++++++++++++++++-------------
 main.go     | 23 +++++++-------
 3 files changed, 122 insertions(+), 54 deletions(-)

diff --git a/file.go b/file.go
index 8aa2a08..b5d0ce6 100644
--- a/file.go
+++ b/file.go
@@ -1,11 +1,15 @@
 package main
 
 import (
-	"image/jpeg"
+	"crypto/sha1"
+	"encoding/base64"
+	"image"
+	_ "image/jpeg"
+	_ "image/png"
+	"io"
 	"os"
 	"path/filepath"
 	"strconv"
-	"strings"
 
 	"github.com/corona10/goimagehash"
 )
@@ -42,31 +46,39 @@ func moveButDontOvewrite(path string, targetPath string) {
 	}
 }
 
-func calculateHash(path string) (uint64, error) {
+// calculateImageHash returns the 64-bit perceptual difference hash of the image at path.
+func calculateImageHash(path string) (uint64, error) {
 	f, err := os.Open(path)
 	if err != nil {
 		return 0, err
 	}
 	defer f.Close()
 
-	if strings.HasSuffix(path, ".jpg") {
-		img, err := jpeg.Decode(f)
-		if err != nil {
-			return 0, err
-		}
-		hash, err := goimagehash.DifferenceHash(img)
-		if err != nil {
-			return 0, err
-		}
-
-		return hash.GetHash(), nil
+	img, _, err := image.Decode(f)
+	if err != nil {
+		return 0, err
+	}
+
+	hash, err := goimagehash.DifferenceHash(img)
+	if err != nil {
+		return 0, err
 	}
 
-	/* h := sha1.New()
-	if _, err := io.Copy(h, f); err != nil {
-		return 0, err
-	}
-
-	return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil*/
-	return 0, nil
+	return hash.GetHash(), nil
+}
+
+// calculateFileHash returns the base64-encoded SHA-1 digest of the file at path.
+func calculateFileHash(path string) (string, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+
+	h := sha1.New()
+	if _, err := io.Copy(h, f); err != nil {
+		return "", err
+	}
+
+	return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil
 }
diff --git a/filesmap.go b/filesmap.go
index 8642a15..ba986fe 100644
--- a/filesmap.go
+++ b/filesmap.go
@@ -6,7 +6,6 @@ import (
 	"log"
 	"os"
 	"path/filepath"
-	"strings"
 	"sync"
 
 	"github.com/vbauerster/mpb/v7"
 )
 
 // FilesMap is a struct for listing files by Size and Hash to search for duplicates
 type FilesMap struct {
-	Files []fileEntry
+	Images []imageEntry
+
+	FilesBySize map[int64]string
+
+	FilesByHash map[string][]string
 
 	FilesHashing chan fileEntry
 
 	FilesHashed chan fileEntry
 
+	ImagesHashing chan imageEntry
+
+	ImagesHashed chan imageEntry
+
 	progress *mpb.Progress
 
 	incomingBar *mpb.Bar
@@ -31,22 +38,26 @@ func newFilesMap() *FilesMap {
 	return &FilesMap{
-		FilesHashed:  make(chan fileEntry, 100000),
-		FilesHashing: make(chan fileEntry),
-		progress:     mpb.New(mpb.WithWidth(64)),
+		FilesBySize:   map[int64]string{},
+		FilesByHash:   map[string][]string{},
+		FilesHashed:   make(chan fileEntry, 100000),
+		FilesHashing:  make(chan fileEntry),
+		ImagesHashed:  make(chan imageEntry, 100000),
+		ImagesHashing: make(chan imageEntry),
+		progress:      mpb.New(mpb.WithWidth(64)),
 	}
 }
 
-func (fm *FilesMap) HashingWorker(wg *sync.WaitGroup) {
+func (fm *FilesMap) FileHashingWorker(wg *sync.WaitGroup) {
 	for file := range fm.FilesHashing {
 		if *verbose {
-			fmt.Println("Hashing", file.path)
+			fmt.Println("Hashing file", file.path)
 		}
 
-		hash, err := calculateHash(file.path)
+		hash, err := calculateFileHash(file.path)
 		if err != nil {
-			log.Printf("Error calculating Hash for %s: %v\n", file.path, err)
+			log.Printf("Error calculating hash for file %s: %v\n", file.path, err)
 			continue
 		}
 
@@ -57,6 +68,25 @@ func (fm *FilesMap) HashedWorker(done chan bool) {
 	wg.Done()
 }
 
+func (fm *FilesMap) ImageHashingWorker(wg *sync.WaitGroup) {
+	for file := range fm.ImagesHashing {
+		if *verbose {
+			fmt.Println("Hashing image", file.path)
+		}
+
+		hash, err := calculateImageHash(file.path)
+		if err != nil {
+			log.Printf("Error calculating hash for image %s: %v\n", file.path, err)
+			continue
+		}
+
+		file.imageHash = hash
+		fm.hashingBar.IncrInt64(file.size)
+		fm.ImagesHashed <- file
+	}
+	wg.Done()
+}
+
 func (fm *FilesMap) HashedWorker(done chan bool) {
 	for file := range fm.FilesHashed {
 		if *verbose {
@@ -64,7 +94,7 @@
 		}
 
 		fm.lock.Lock()
-		fm.Files = append(fm.Files, file)
+		fm.FilesByHash[file.hash] = append(fm.FilesByHash[file.hash], file.path)
 		fm.lock.Unlock()
 	}
 
@@ -85,20 +115,13 @@ func (fm *FilesMap) WalkDirectories() int {
 			return nil
 		}
 
-		if !strings.HasSuffix(path, ".jpg") {
-			size = 123456789123456
-		}
-
-		fm.incomingBar.Increment()
-		countFiles++
-		fm.incomingBar.SetTotal(int64(countFiles), false)
-		if *verbose {
-			fmt.Println("Incoming", path)
-		}
 
 		sumSize += size
+		countFiles++
+		fm.incomingBar.SetTotal(int64(countFiles), false)
 		fm.hashingBar.SetTotal(int64(sumSize), false)
-		fm.FilesHashing <- fileEntry{path, info.Size(), 0}
+		fm.hashFile(path, size)
+		fm.hashImage(path, size)
 		return nil
 	})
 }
@@ -108,8 +131,40 @@
+// hashFile queues a file for content hashing only once a second file of the
+// same size has been seen; an empty entry marks a size that was already queued.
+func (fm *FilesMap) hashFile(path string, size int64) {
+	prevFile, ok := fm.FilesBySize[size]
+	if !ok {
+		fm.FilesBySize[size] = path
+		return
+	}
+
+	if prevFile != "" {
+		fm.FilesHashing <- fileEntry{prevFile, size, ""}
+	}
+
+	fm.FilesBySize[size] = ""
+	fm.incomingBar.Increment()
+	if *verbose {
+		fmt.Println("Incoming", path)
+	}
+
+	fm.FilesHashing <- fileEntry{path, size, ""}
+}
+
+func (fm *FilesMap) hashImage(path string, size int64) {
+	fm.ImagesHashing <- imageEntry{path, size, 0}
+}
+
+type imageEntry struct {
+	path      string
+	size      int64
+	imageHash uint64
+}
+
 type fileEntry struct {
 	path string
 	size int64
-	hash uint64
+	hash string
 }
diff --git a/main.go b/main.go
index 4bb4fd9..05c0678 100644
--- a/main.go
+++ b/main.go
@@ -73,8 +73,9 @@ func main() {
 	done := make(chan bool)
 	wg := sync.WaitGroup{}
 	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
-		wg.Add(1)
-		go filesMap.HashingWorker(&wg)
+		wg.Add(2)
+		go filesMap.ImageHashingWorker(&wg)
+		go filesMap.FileHashingWorker(&wg)
 	}
 
 	go filesMap.HashedWorker(done)
@@ -172,17 +173,17 @@ func main() {
 	countInstances := 0
 	countDupeSets := 0
-	for fileIndex := range filesMap.Files {
-		var currentCluster []fileEntry
-		file := filesMap.Files[fileIndex]
-		currentCluster = append(currentCluster, filesMap.Files[fileIndex])
-		for otherIndex := range filesMap.Files {
+	for fileIndex := range filesMap.Images {
+		var currentCluster []imageEntry
+		file := filesMap.Images[fileIndex]
+		currentCluster = append(currentCluster, filesMap.Images[fileIndex])
+		for otherIndex := range filesMap.Images {
 			if fileIndex == otherIndex {
 				continue
 			}
 
-			otherFile := filesMap.Files[otherIndex]
-			var distance = hamming.Uint64(file.hash, otherFile.hash)
+			otherFile := filesMap.Images[otherIndex]
+			var distance = hamming.Uint64(file.imageHash, otherFile.imageHash)
 			if distance > 5 {
 				continue
 			}
@@ -206,8 +207,8 @@ func main() {
 
 	fmt.Println("Statistics:")
 	fmt.Println(countFiles, "Files")
-	// fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
-	// fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
+	fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
+	fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
 	fmt.Println(countInstances, "Duplicate Files")
 	fmt.Println(countDupeSets, "Duplicate Sets")
 }
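
Note on the matching strategy: exact duplicates are grouped by the base64-encoded SHA-1 strings collected in FilesByHash, while images are compared pairwise by the Hamming distance of their 64-bit difference hashes, with a distance of 5 or less treated as a match. The standalone sketch below illustrates that comparison using only the standard library (math/bits in place of the hamming package used by main.go); the helper names and sample hash values are illustrative and not part of this patch.

package main

import (
	"fmt"
	"math/bits"
)

// hammingDistance counts the bits that differ between two 64-bit perceptual hashes.
func hammingDistance(a, b uint64) int {
	return bits.OnesCount64(a ^ b)
}

// clusterImages groups paths whose hashes lie within maxDistance of a cluster's
// first member, mirroring the pairwise comparison in main.go (each path is
// reported at most once here).
func clusterImages(hashes map[string]uint64, maxDistance int) [][]string {
	var clusters [][]string
	assigned := map[string]bool{}
	for path, hash := range hashes {
		if assigned[path] {
			continue
		}
		cluster := []string{path}
		assigned[path] = true
		for other, otherHash := range hashes {
			if assigned[other] {
				continue
			}
			if hammingDistance(hash, otherHash) <= maxDistance {
				cluster = append(cluster, other)
				assigned[other] = true
			}
		}
		if len(cluster) > 1 {
			clusters = append(clusters, cluster)
		}
	}
	return clusters
}

func main() {
	hashes := map[string]uint64{
		"a.jpg": 0xF0F0F0F0F0F0F0F0,
		"b.jpg": 0xF0F0F0F0F0F0F0F2, // differs from a.jpg in a single bit
		"c.jpg": 0x0123456789ABCDEF,
	}
	fmt.Println(clusterImages(hashes, 5)) // a.jpg and b.jpg form one cluster
}

With a 64-bit difference hash, unrelated images typically disagree in roughly half of the bits, so a cutoff of 5 keeps only near-identical pictures (resized or re-encoded copies) in the same cluster.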