Group by hamming distance

This commit is contained in:
Jan Bader
2023-12-06 15:02:29 +01:00
parent bbdc296cbd
commit eb25a625cb
5 changed files with 118 additions and 123 deletions

24
file.go
View File

@ -1,10 +1,7 @@
package main package main
import ( import (
"crypto/sha1"
"encoding/base64"
"image/jpeg" "image/jpeg"
"io"
"os" "os"
"path/filepath" "path/filepath"
"strconv" "strconv"
@ -45,30 +42,31 @@ func moveButDontOvewrite(path string, targetPath string) {
} }
} }
func calculateHash(path string) (string, error) { func calculateHash(path string) (uint64, error) {
f, err := os.Open(path) f, err := os.Open(path)
if err != nil { if err != nil {
return "", err return 0, err
} }
defer f.Close() defer f.Close()
if strings.HasSuffix(path, ".jpg") { if strings.HasSuffix(path, ".jpg") {
img, err := jpeg.Decode(f) img, err := jpeg.Decode(f)
if err != nil { if err != nil {
return "", err return 0, err
} }
hash, err := goimagehash.DifferenceHash(img) hash, err := goimagehash.DifferenceHash(img)
if err != nil { if err != nil {
return "", err return 0, err
} }
return hash.ToString(), nil return hash.GetHash(), nil
} }
h := sha1.New() /* h := sha1.New()
if _, err := io.Copy(h, f); err != nil { if _, err := io.Copy(h, f); err != nil {
return "", err return 0, err
} }
return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil*/
return 0, nil
} }

View File

@ -14,9 +14,7 @@ import (
// FilesMap is a struct for listing files by Size and Hash to search for duplicates // FilesMap is a struct for listing files by Size and Hash to search for duplicates
type FilesMap struct { type FilesMap struct {
FilesBySize map[int64]string Files []fileEntry
FilesByHash map[string][]string
FilesHashing chan fileEntry FilesHashing chan fileEntry
@ -33,8 +31,6 @@ type FilesMap struct {
func newFilesMap() *FilesMap { func newFilesMap() *FilesMap {
return &FilesMap{ return &FilesMap{
FilesBySize: map[int64]string{},
FilesByHash: map[string][]string{},
FilesHashed: make(chan fileEntry, 100000), FilesHashed: make(chan fileEntry, 100000),
FilesHashing: make(chan fileEntry), FilesHashing: make(chan fileEntry),
progress: mpb.New(mpb.WithWidth(64)), progress: mpb.New(mpb.WithWidth(64)),
@ -68,7 +64,7 @@ func (fm *FilesMap) HashedWorker(done chan bool) {
} }
fm.lock.Lock() fm.lock.Lock()
fm.FilesByHash[file.hash] = append(fm.FilesByHash[file.hash], file.path) fm.Files = append(fm.Files, file)
fm.lock.Unlock() fm.lock.Unlock()
} }
@ -100,22 +96,9 @@ func (fm *FilesMap) WalkDirectories() int {
fmt.Println("Incoming", path) fmt.Println("Incoming", path)
} }
prevFile, ok := fm.FilesBySize[size]
if !ok {
fm.FilesBySize[size] = path
return nil
}
if prevFile != "" {
sumSize += size
fm.FilesHashing <- fileEntry{prevFile, size, ""}
}
fm.FilesBySize[size] = ""
sumSize += size sumSize += size
fm.hashingBar.SetTotal(int64(sumSize), false) fm.hashingBar.SetTotal(int64(sumSize), false)
fm.FilesHashing <- fileEntry{path, info.Size(), ""} fm.FilesHashing <- fileEntry{path, info.Size(), 0}
return nil return nil
}) })
} }
@ -128,5 +111,5 @@ func (fm *FilesMap) WalkDirectories() int {
type fileEntry struct { type fileEntry struct {
path string path string
size int64 size int64
hash string hash uint64
} }

1
go.mod
View File

@ -11,5 +11,6 @@ require (
github.com/mattn/go-runewidth v0.0.13 // indirect github.com/mattn/go-runewidth v0.0.13 // indirect
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect
github.com/rivo/uniseg v0.2.0 // indirect github.com/rivo/uniseg v0.2.0 // indirect
github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3 // indirect
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c // indirect golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c // indirect
) )

2
go.sum
View File

@ -10,6 +10,8 @@ github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8= github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3 h1:njlZPzLwU639dk2kqnCPPv+wNjq7Xb6EfUxe/oX0/NM=
github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3/go.mod h1:hpGUWaI9xL8pRQCTXQgocU38Qw1g0Us7n5PxxTwTCYU=
github.com/vbauerster/mpb/v7 v7.0.5 h1:/CQxyoPjdlON6kqqq3Uq3UUw5tFjuBCjOmLQYaYvBmM= github.com/vbauerster/mpb/v7 v7.0.5 h1:/CQxyoPjdlON6kqqq3Uq3UUw5tFjuBCjOmLQYaYvBmM=
github.com/vbauerster/mpb/v7 v7.0.5/go.mod h1:emzg+wTChQAdJgyrDatWRHxji2AnmCrAemByOURuvZs= github.com/vbauerster/mpb/v7 v7.0.5/go.mod h1:emzg+wTChQAdJgyrDatWRHxji2AnmCrAemByOURuvZs=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c h1:F1jZWGFhYfh0Ci55sIpILtKKK8p3i2/krTr0H1rg74I= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c h1:F1jZWGFhYfh0Ci55sIpILtKKK8p3i2/krTr0H1rg74I=

189
main.go
View File

@ -1,20 +1,15 @@
package main package main
import ( import (
"bufio"
"encoding/json"
"flag" "flag"
"fmt" "fmt"
"io/ioutil"
"log" "log"
"os" "os"
"path/filepath"
"runtime" "runtime"
"runtime/pprof" "runtime/pprof"
"strconv"
"strings"
"sync" "sync"
"github.com/steakknife/hamming"
"github.com/vbauerster/mpb/v7" "github.com/vbauerster/mpb/v7"
"github.com/vbauerster/mpb/v7/decor" "github.com/vbauerster/mpb/v7/decor"
) )
@ -48,11 +43,11 @@ func main() {
countFiles := 0 countFiles := 0
filesMap := newFilesMap() filesMap := newFilesMap()
if *fromFile != "" { if *fromFile != "" {
byteValue, _ := ioutil.ReadFile(*fromFile) // byteValue, _ := ioutil.ReadFile(*fromFile)
err := json.Unmarshal(byteValue, &filesMap.FilesByHash) // err := json.Unmarshal(byteValue, &filesMap.FilesByHash)
if err != nil { // if err != nil {
panic(err) // panic(err)
} // }
} else { } else {
filesMap.incomingBar = filesMap.progress.AddSpinner(0, filesMap.incomingBar = filesMap.progress.AddSpinner(0,
mpb.PrependDecorators( mpb.PrependDecorators(
@ -92,108 +87,124 @@ func main() {
} }
if *toFile != "" && *fromFile == "" { if *toFile != "" && *fromFile == "" {
json, _ := json.MarshalIndent(filesMap.FilesByHash, "", " ") // json, _ := json.MarshalIndent(filesMap.FilesByHash, "", " ")
ioutil.WriteFile(*toFile, json, 0644) // ioutil.WriteFile(*toFile, json, 0644)
} }
if *deleteDupesIn != "" { if *deleteDupesIn != "" {
deleteIn := filepath.Clean(*deleteDupesIn) /* deleteIn := filepath.Clean(*deleteDupesIn)
for hash := range filesMap.FilesByHash { for hash := range filesMap.FilesByHash {
duplicateFiles := filesMap.FilesByHash[hash] duplicateFiles := filesMap.FilesByHash[hash]
if len(duplicateFiles) <= 1 { if len(duplicateFiles) <= 1 {
continue continue
}
hasDupesInFolder := false
hasDupesOutsideFolder := false
for _, file := range duplicateFiles {
fileIsInFolder := strings.HasPrefix(filepath.Clean(file), deleteIn)
hasDupesOutsideFolder = hasDupesOutsideFolder || !fileIsInFolder
hasDupesInFolder = hasDupesInFolder || fileIsInFolder
}
if !hasDupesInFolder || !hasDupesOutsideFolder {
if !hasDupesOutsideFolder {
fmt.Println("Not deleting one of the following files, since all would be deleted")
}
if !hasDupesInFolder {
fmt.Println("Not deleting one of the following files, since none are in the selected directory")
}
for _, file := range duplicateFiles {
fmt.Println("-", file)
}
fmt.Println()
continue
}
for _, file := range duplicateFiles {
if strings.HasPrefix(filepath.Clean(file), deleteIn) {
fmt.Println("Would delete ", file)
if *force {
remove(file)
} }
}
} hasDupesInFolder := false
} hasDupesOutsideFolder := false
for _, file := range duplicateFiles {
fileIsInFolder := strings.HasPrefix(filepath.Clean(file), deleteIn)
hasDupesOutsideFolder = hasDupesOutsideFolder || !fileIsInFolder
hasDupesInFolder = hasDupesInFolder || fileIsInFolder
}
if !hasDupesInFolder || !hasDupesOutsideFolder {
if !hasDupesOutsideFolder {
fmt.Println("Not deleting one of the following files, since all would be deleted")
}
if !hasDupesInFolder {
fmt.Println("Not deleting one of the following files, since none are in the selected directory")
}
for _, file := range duplicateFiles {
fmt.Println("-", file)
}
fmt.Println()
continue
}
for _, file := range duplicateFiles {
if strings.HasPrefix(filepath.Clean(file), deleteIn) {
fmt.Println("Would delete ", file)
if *force {
remove(file)
}
}
}
}*/
} else if *promptForDelete { } else if *promptForDelete {
reader := bufio.NewReader(os.Stdin) /* reader := bufio.NewReader(os.Stdin)
for hash := range filesMap.FilesByHash { for hash := range filesMap.FilesByHash {
duplicateFiles := filesMap.FilesByHash[hash] duplicateFiles := filesMap.FilesByHash[hash]
if len(duplicateFiles) <= 1 { if len(duplicateFiles) <= 1 {
continue continue
} }
fmt.Print("\033[H\033[2J") fmt.Print("\033[H\033[2J")
for i, file := range duplicateFiles { for i, file := range duplicateFiles {
fmt.Println(i+1, file) fmt.Println(i+1, file)
} }
fmt.Printf("Which file to keep? ") fmt.Printf("Which file to keep? ")
input, err := reader.ReadString('\n') input, err := reader.ReadString('\n')
if err != nil { if err != nil {
fmt.Println("Invalid input") fmt.Println("Invalid input")
continue continue
} }
input = strings.TrimRight(input, "\n\r") input = strings.TrimRight(input, "\n\r")
intInput, err := strconv.Atoi(input) intInput, err := strconv.Atoi(input)
if err != nil || intInput > len(duplicateFiles) || intInput < 1 { if err != nil || intInput > len(duplicateFiles) || intInput < 1 {
fmt.Println("Invalid input") fmt.Println("Invalid input")
continue continue
} }
for i, file := range duplicateFiles { for i, file := range duplicateFiles {
if i+1 == intInput { if i+1 == intInput {
continue continue
} }
if *force { if *force {
remove(file) remove(file)
} }
} }
} }*/
} else { } else {
countInstances := 0 countInstances := 0
countDupeSets := 0 countDupeSets := 0
for hash := range filesMap.FilesByHash {
duplicateFiles := filesMap.FilesByHash[hash] for fileIndex := range filesMap.Files {
if len(duplicateFiles) <= 1 { var currentCluster []fileEntry
file := filesMap.Files[fileIndex]
currentCluster = append(currentCluster, filesMap.Files[fileIndex])
for otherIndex := range filesMap.Files {
if fileIndex == otherIndex {
continue
}
otherFile := filesMap.Files[otherIndex]
var distance = hamming.Uint64(file.hash, otherFile.hash)
if distance > 5 {
continue
}
currentCluster = append(currentCluster, otherFile)
}
if len(currentCluster) <= 1 {
continue continue
} }
countDupeSets++ countDupeSets++
for _, file := range duplicateFiles { for _, file := range currentCluster {
countInstances++ countInstances++
fmt.Println(file) fmt.Println(file.path)
} }
fmt.Println() fmt.Println()
} }
fmt.Println("Statistics:") fmt.Println("Statistics:")
fmt.Println(countFiles, "Files") fmt.Println(countFiles, "Files")
fmt.Println(len(filesMap.FilesBySize), "Unique Sizes") // fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
fmt.Println(len(filesMap.FilesByHash), "Unique Hashes") // fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
fmt.Println(countInstances, "Duplicate Files") fmt.Println(countInstances, "Duplicate Files")
fmt.Println(countDupeSets, "Duplicate Sets") fmt.Println(countDupeSets, "Duplicate Sets")
} }