diff --git a/filesmap.go b/filesmap.go
index da273da..0780528 100644
--- a/filesmap.go
+++ b/filesmap.go
@@ -7,8 +7,10 @@ import (
 	"image"
 	"os"
 	"path/filepath"
+	"slices"
 	"sync"
 
+	"github.com/steakknife/hamming"
 	"github.com/vbauerster/mpb/v8"
 )
 
@@ -172,6 +174,58 @@ func (fm *FilesMap) hashFile(path string, size int64) int64 {
 	return 1
 }
 
+// imageCluster is a set of images considered similar to one another.
+type imageCluster struct {
+	images []similarImage
+}
+
+// similarImage is one member of an imageCluster. distance is the Hamming
+// distance between its hash and the hash of the cluster's first image
+// (0 for the first image itself).
+type similarImage struct {
+	path     string
+	distance int
+}
+
+// getImageClusters groups fm.Images into clusters of similar images.
+//
+// The first remaining image seeds a cluster and every other remaining image
+// whose hash is within a Hamming distance of 5 of the seed's hash joins it.
+// Images without any similar counterpart are not reported. The call consumes
+// fm.Images: the slice is empty when it returns.
+func (fm *FilesMap) getImageClusters() []imageCluster {
+	var clusters []imageCluster
+
+	for len(fm.Images) > 0 {
+		file := fm.Images[0]
+		fm.Images = slices.Delete(fm.Images, 0, 1)
+
+		currentCluster := []similarImage{{path: file.path}}
+		// Walk backwards so deleting an entry does not shift the indices that
+		// are still to be visited.
+		for otherIndex := len(fm.Images) - 1; otherIndex >= 0; otherIndex-- {
+			otherFile := fm.Images[otherIndex]
+			distance := hamming.Uint64(file.imageHash, otherFile.imageHash)
+			if distance > 5 {
+				continue
+			}
+
+			fm.Images = slices.Delete(fm.Images, otherIndex, otherIndex+1)
+			currentCluster = append(currentCluster, similarImage{path: otherFile.path, distance: distance})
+		}
+
+		// The seed is always present, so a cluster of one means nothing similar
+		// was found; skip it so only real groups are reported and counted.
+		if len(currentCluster) <= 1 {
+			continue
+		}
+
+		clusters = append(clusters, imageCluster{images: currentCluster})
+	}
+
+	return clusters
+}
+
 func (fm *FilesMap) hashImage(path string, size int64) {
 	fm.ImagesHashing <- imageEntry{path, size, 0}
 }
diff --git a/main.go b/main.go
index 3182a1f..b256494 100644
--- a/main.go
+++ b/main.go
@@ -15,9 +15,6 @@ import (
 	"strings"
 	"sync"
 
-	"slices"
-
-	"github.com/steakknife/hamming"
 	"github.com/vbauerster/mpb/v8"
 	"github.com/vbauerster/mpb/v8/decor"
 )
@@ -195,8 +192,7 @@ func main() {
 	countDupeSets := 0
 
 	fmt.Println("Files that are binary identical:")
-	for hash := range filesMap.FilesByHash {
-		duplicateFiles := filesMap.FilesByHash[hash]
+	for _, duplicateFiles := range filesMap.FilesByHash {
 		if len(duplicateFiles) <= 1 {
 			continue
 		}
@@ -210,32 +206,12 @@ func main() {
 	}
 
 	fmt.Println("Images that are similar:")
-	for len(filesMap.Images) > 0 {
-		file := filesMap.Images[0]
-		filesMap.Images = slices.Delete(filesMap.Images, 0, 1)
-		var currentCluster []imageEntry
-
-		currentCluster = append(currentCluster, file)
-		for otherIndex := len(filesMap.Images) - 1; otherIndex >= 0; otherIndex-- {
-			otherFile := filesMap.Images[otherIndex]
-			var distance = hamming.Uint64(file.imageHash, otherFile.imageHash)
-			if distance > 5 {
-				continue
-			}
-
-			filesMap.Images = slices.Delete(filesMap.Images, otherIndex, otherIndex+1)
-			if len(currentCluster) == 1 {
-				fmt.Println(currentCluster[0].path)
-				countDupeSets++
-				countInstances++
-			}
-			currentCluster = append(currentCluster, otherFile)
-			fmt.Println(otherFile.path, distance)
+	imageClusters := filesMap.getImageClusters()
+	for _, cluster := range imageClusters {
+		countDupeSets++
+		for _, image := range cluster.images {
 			countInstances++
-		}
-
-		if len(currentCluster) <= 1 {
-			continue
+			fmt.Println(image.path, image.distance)
 		}
 
 		fmt.Println()