Extract getImageClusters

This commit is contained in:
Jan Bader
2023-12-09 14:50:29 +01:00
parent a6c978eaee
commit 1a9e17de10
2 changed files with 47 additions and 30 deletions

View File

@ -7,8 +7,10 @@ import (
"image"
"os"
"path/filepath"
"slices"
"sync"
"github.com/steakknife/hamming"
"github.com/vbauerster/mpb/v8"
)
@ -172,6 +174,45 @@ func (fm *FilesMap) hashFile(path string, size int64) int64 {
return 1
}
// imageCluster is one group of images that getImageClusters considers similar
// to each other.
type imageCluster struct {
// images lists the members of the cluster. getImageClusters stores the
// reference image first, followed by every image that matched it.
images []similarImage
}
// similarImage is a single member of an imageCluster.
type similarImage struct {
// path is copied from the path field of the image's entry in FilesMap.Images.
path string
// distance is the hamming.Uint64 result between this image's hash and the
// hash of the cluster's reference image. It is left at its zero value for
// the reference image itself.
distance int
}
// getImageClusters groups the entries of fm.Images into clusters of similar
// images and returns those clusters.
//
// Clustering is greedy: the first remaining image becomes the reference of a
// new cluster, and every other remaining image whose imageHash is at most
// maxDistance away from the reference's imageHash (as reported by
// hamming.Uint64) joins that cluster. The reference is stored first with a
// distance of 0; the matches follow in the order they are found, which is
// from the end of fm.Images towards the front.
//
// Images without any match are dropped instead of being returned as
// one-element clusters, so every returned cluster has at least two images.
//
// fm.Images is consumed by this call: it is empty when the function returns.
func (fm *FilesMap) getImageClusters() []imageCluster {
	// Largest distance between two image hashes that still counts as similar.
	const maxDistance = 5

	var clusters []imageCluster
	for len(fm.Images) > 0 {
		file := fm.Images[0]
		fm.Images = slices.Delete(fm.Images, 0, 1)

		currentCluster := []similarImage{{path: file.path}}

		// Walk backwards so that deleting an element never shifts the indices
		// that are still waiting to be visited.
		for otherIndex := len(fm.Images) - 1; otherIndex >= 0; otherIndex-- {
			otherFile := fm.Images[otherIndex]
			distance := hamming.Uint64(file.imageHash, otherFile.imageHash)
			if distance > maxDistance {
				continue
			}

			fm.Images = slices.Delete(fm.Images, otherIndex, otherIndex+1)
			currentCluster = append(currentCluster, similarImage{path: otherFile.path, distance: distance})
		}

		// The reference image is always present, so a real cluster needs at
		// least two entries; a lone reference has nothing similar to it.
		if len(currentCluster) <= 1 {
			continue
		}

		clusters = append(clusters, imageCluster{images: currentCluster})
	}

	return clusters
}
// hashImage builds an imageEntry from path and size, with its third field set
// to 0, and sends it on fm.ImagesHashing. The send blocks until the channel
// accepts the value.
// NOTE(review): the third field is presumably imageHash, to be filled in by
// whatever receives from the channel — confirm against the imageEntry definition.
func (fm *FilesMap) hashImage(path string, size int64) {
	entry := imageEntry{path, size, 0}
	fm.ImagesHashing <- entry
}

36
main.go
View File

@ -15,9 +15,6 @@ import (
"strings"
"sync"
"slices"
"github.com/steakknife/hamming"
"github.com/vbauerster/mpb/v8"
"github.com/vbauerster/mpb/v8/decor"
)
@ -195,8 +192,7 @@ func main() {
countDupeSets := 0
fmt.Println("Files that are binary identical:")
for hash := range filesMap.FilesByHash {
duplicateFiles := filesMap.FilesByHash[hash]
for _, duplicateFiles := range filesMap.FilesByHash {
if len(duplicateFiles) <= 1 {
continue
}
@ -210,32 +206,12 @@ func main() {
}
fmt.Println("Images that are similar:")
for len(filesMap.Images) > 0 {
file := filesMap.Images[0]
filesMap.Images = slices.Delete(filesMap.Images, 0, 1)
var currentCluster []imageEntry
currentCluster = append(currentCluster, file)
for otherIndex := len(filesMap.Images) - 1; otherIndex >= 0; otherIndex-- {
otherFile := filesMap.Images[otherIndex]
var distance = hamming.Uint64(file.imageHash, otherFile.imageHash)
if distance > 5 {
continue
}
filesMap.Images = slices.Delete(filesMap.Images, otherIndex, otherIndex+1)
if len(currentCluster) == 1 {
fmt.Println(currentCluster[0].path)
countDupeSets++
countInstances++
}
currentCluster = append(currentCluster, otherFile)
fmt.Println(otherFile.path, distance)
imageClusters := filesMap.getImageClusters()
for _, cluster := range imageClusters {
countDupeSets++
for _, image := range cluster.images {
countInstances++
}
if len(currentCluster) <= 1 {
continue
fmt.Println(image.path, image.distance)
}
fmt.Println()