Group by hamming distance

Jan Bader
2023-12-06 15:02:29 +01:00
parent bbdc296cbd
commit eb25a625cb
5 changed files with 118 additions and 123 deletions
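
The change in one sentence: instead of identifying duplicates by exact content hash (SHA-1, compared as base64 strings), every JPEG is reduced to a 64-bit perceptual difference hash, and two files are treated as duplicates when their hashes differ in at most 5 bits, i.e. by Hamming distance. A minimal illustration of that distance check, independent of this repository and using only the Go standard library (the commit itself uses github.com/steakknife/hamming for this):

    package main

    import (
        "fmt"
        "math/bits"
    )

    // hammingDistance counts the bits in which two 64-bit hashes differ.
    func hammingDistance(a, b uint64) int {
        return bits.OnesCount64(a ^ b)
    }

    func main() {
        // Two hypothetical dHash values that differ only in the low byte.
        a := uint64(0xF0F0F0F0F0F0F0F0)
        b := uint64(0xF0F0F0F0F0F0F0F7)
        fmt.Println(hammingDistance(a, b)) // 3 -> within the threshold of 5
    }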

file.go

@@ -1,10 +1,7 @@
 package main

 import (
-	"crypto/sha1"
-	"encoding/base64"
 	"image/jpeg"
-	"io"
 	"os"
 	"path/filepath"
 	"strconv"
@@ -45,30 +42,31 @@ func moveButDontOvewrite(path string, targetPath string) {
 	}
 }

-func calculateHash(path string) (string, error) {
+func calculateHash(path string) (uint64, error) {
 	f, err := os.Open(path)
 	if err != nil {
-		return "", err
+		return 0, err
 	}
 	defer f.Close()

 	if strings.HasSuffix(path, ".jpg") {
 		img, err := jpeg.Decode(f)
 		if err != nil {
-			return "", err
+			return 0, err
 		}

 		hash, err := goimagehash.DifferenceHash(img)
 		if err != nil {
-			return "", err
+			return 0, err
 		}
-		return hash.ToString(), nil
+		return hash.GetHash(), nil
 	}

-	h := sha1.New()
+	/* h := sha1.New()
 	if _, err := io.Copy(h, f); err != nil {
-		return "", err
+		return 0, err
 	}

-	return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil
+	return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil*/
+	return 0, nil
 }
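
calculateHash now returns a uint64: for .jpg files it is the difference hash from goimagehash (GetHash() unpacks the underlying 64-bit value where ToString() produced a string), the old SHA-1/base64 path is commented out, and everything else gets hash 0. A self-contained sketch of the new happy path, assuming the goimagehash import is github.com/corona10/goimagehash (the import line is not shown in this hunk):

    package main

    import (
        "fmt"
        "image/jpeg"
        "os"

        "github.com/corona10/goimagehash" // assumed import path
    )

    // dHash mirrors the JPEG branch of calculateHash above.
    func dHash(path string) (uint64, error) {
        f, err := os.Open(path)
        if err != nil {
            return 0, err
        }
        defer f.Close()

        img, err := jpeg.Decode(f)
        if err != nil {
            return 0, err
        }

        hash, err := goimagehash.DifferenceHash(img)
        if err != nil {
            return 0, err
        }
        return hash.GetHash(), nil
    }

    func main() {
        h, err := dHash("example.jpg") // hypothetical input file
        if err != nil {
            panic(err)
        }
        fmt.Printf("%016x\n", h)
    }

One consequence worth noting: as the code now stands, every non-JPEG file hashes to 0, so all of them sit within distance 0 of each other and will land in one giant "duplicate" cluster in the grouping pass below.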


@@ -14,9 +14,7 @@ import (
 // FilesMap is a struct for listing files by Size and Hash to search for duplicates
 type FilesMap struct {
 	FilesBySize  map[int64]string
-	FilesByHash  map[string][]string
+	Files        []fileEntry
 	FilesHashing chan fileEntry
@@ -33,8 +31,6 @@ type FilesMap struct {
 func newFilesMap() *FilesMap {
 	return &FilesMap{
 		FilesBySize:  map[int64]string{},
-		FilesByHash:  map[string][]string{},
 		FilesHashed:  make(chan fileEntry, 100000),
 		FilesHashing: make(chan fileEntry),
 		progress:     mpb.New(mpb.WithWidth(64)),
@@ -68,7 +64,7 @@ func (fm *FilesMap) HashedWorker(done chan bool) {
 		}

 		fm.lock.Lock()
-		fm.FilesByHash[file.hash] = append(fm.FilesByHash[file.hash], file.path)
+		fm.Files = append(fm.Files, file)
 		fm.lock.Unlock()
 	}
@@ -100,22 +96,9 @@ func (fm *FilesMap) WalkDirectories() int {
 			fmt.Println("Incoming", path)
 		}

-		prevFile, ok := fm.FilesBySize[size]
-		if !ok {
-			fm.FilesBySize[size] = path
-			return nil
-		}
-
-		if prevFile != "" {
-			sumSize += size
-			fm.FilesHashing <- fileEntry{prevFile, size, ""}
-		}
-
-		fm.FilesBySize[size] = ""
 		sumSize += size
 		fm.hashingBar.SetTotal(int64(sumSize), false)
-		fm.FilesHashing <- fileEntry{path, info.Size(), ""}
+		fm.FilesHashing <- fileEntry{path, info.Size(), 0}

 		return nil
 	})
@@ -128,5 +111,5 @@ func (fm *FilesMap) WalkDirectories() int {
 type fileEntry struct {
 	path string
 	size int64
-	hash string
+	hash uint64
 }
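
A side effect of the WalkDirectories change: the size-based prefilter is gone. Previously a file was only queued for hashing once a second file with the same byte size showed up; since perceptual near-duplicates generally do not share an exact size, every walked file is now sent straight to the hashing workers, and results are collected in the flat Files slice instead of the FilesByHash map.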

go.mod

@@ -11,5 +11,6 @@ require (
 	github.com/mattn/go-runewidth v0.0.13 // indirect
 	github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect
 	github.com/rivo/uniseg v0.2.0 // indirect
+	github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3 // indirect
 	golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c // indirect
 )

go.sum

@@ -10,6 +10,8 @@ github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6
 github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
 github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
 github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
+github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3 h1:njlZPzLwU639dk2kqnCPPv+wNjq7Xb6EfUxe/oX0/NM=
+github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3/go.mod h1:hpGUWaI9xL8pRQCTXQgocU38Qw1g0Us7n5PxxTwTCYU=
 github.com/vbauerster/mpb/v7 v7.0.5 h1:/CQxyoPjdlON6kqqq3Uq3UUw5tFjuBCjOmLQYaYvBmM=
 github.com/vbauerster/mpb/v7 v7.0.5/go.mod h1:emzg+wTChQAdJgyrDatWRHxji2AnmCrAemByOURuvZs=
 golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c h1:F1jZWGFhYfh0Ci55sIpILtKKK8p3i2/krTr0H1rg74I=

main.go

@@ -1,20 +1,15 @@
 package main

 import (
-	"bufio"
-	"encoding/json"
 	"flag"
 	"fmt"
-	"io/ioutil"
 	"log"
 	"os"
 	"path/filepath"
 	"runtime"
 	"runtime/pprof"
-	"strconv"
-	"strings"
-	"sync"

+	"github.com/steakknife/hamming"
 	"github.com/vbauerster/mpb/v7"
 	"github.com/vbauerster/mpb/v7/decor"
 )
@@ -48,11 +43,11 @@ func main() {
 	countFiles := 0
 	filesMap := newFilesMap()
 	if *fromFile != "" {
-		byteValue, _ := ioutil.ReadFile(*fromFile)
-		err := json.Unmarshal(byteValue, &filesMap.FilesByHash)
-		if err != nil {
-			panic(err)
-		}
+		// byteValue, _ := ioutil.ReadFile(*fromFile)
+		// err := json.Unmarshal(byteValue, &filesMap.FilesByHash)
+		// if err != nil {
+		// 	panic(err)
+		// }
 	} else {
 		filesMap.incomingBar = filesMap.progress.AddSpinner(0,
 			mpb.PrependDecorators(
@@ -92,12 +87,12 @@ func main() {
 	}

 	if *toFile != "" && *fromFile == "" {
-		json, _ := json.MarshalIndent(filesMap.FilesByHash, "", " ")
-		ioutil.WriteFile(*toFile, json, 0644)
+		// json, _ := json.MarshalIndent(filesMap.FilesByHash, "", " ")
+		// ioutil.WriteFile(*toFile, json, 0644)
 	}

 	if *deleteDupesIn != "" {
-		deleteIn := filepath.Clean(*deleteDupesIn)
+		/* deleteIn := filepath.Clean(*deleteDupesIn)
 		for hash := range filesMap.FilesByHash {
 			duplicateFiles := filesMap.FilesByHash[hash]
 			if len(duplicateFiles) <= 1 {
@@ -135,9 +130,9 @@ func main() {
 			}
 		}
 	}
-	}
+	}*/
 	} else if *promptForDelete {
-		reader := bufio.NewReader(os.Stdin)
+		/* reader := bufio.NewReader(os.Stdin)
 		for hash := range filesMap.FilesByHash {
 			duplicateFiles := filesMap.FilesByHash[hash]
 			if len(duplicateFiles) <= 1 {
@@ -172,28 +167,44 @@ func main() {
 				remove(file)
 			}
 		}
-	}
+	}*/
 	} else {
 		countInstances := 0
 		countDupeSets := 0
-		for hash := range filesMap.FilesByHash {
-			duplicateFiles := filesMap.FilesByHash[hash]
-			if len(duplicateFiles) <= 1 {
+		for fileIndex := range filesMap.Files {
+			var currentCluster []fileEntry
+			file := filesMap.Files[fileIndex]
+			currentCluster = append(currentCluster, filesMap.Files[fileIndex])
+			for otherIndex := range filesMap.Files {
+				if fileIndex == otherIndex {
+					continue
+				}
+
+				otherFile := filesMap.Files[otherIndex]
+				var distance = hamming.Uint64(file.hash, otherFile.hash)
+				if distance > 5 {
+					continue
+				}
+
+				currentCluster = append(currentCluster, otherFile)
+			}
+
+			if len(currentCluster) <= 1 {
 				continue
 			}

 			countDupeSets++
-			for _, file := range duplicateFiles {
+			for _, file := range currentCluster {
 				countInstances++
-				fmt.Println(file)
+				fmt.Println(file.path)
 			}

 			fmt.Println()
 		}

 		fmt.Println("Statistics:")
 		fmt.Println(countFiles, "Files")
-		fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
-		fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
+		// fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
+		// fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
 		fmt.Println(countInstances, "Duplicate Files")
 		fmt.Println(countDupeSets, "Duplicate Sets")
 	}
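
For reference, the new grouping pass extracted into a standalone program (a sketch, not code from this commit: fileEntry is copied from above, math/bits stands in for the hamming package, and the file list is made up). It also shows a quirk of the per-seed clustering: because "distance <= 5" is not transitive and a cluster is built independently for every file, near-duplicates are reported once per member, so a.jpg/b.jpg below print as two sets.

    package main

    import (
        "fmt"
        "math/bits"
    )

    type fileEntry struct {
        path string
        size int64
        hash uint64
    }

    // clusters mirrors the loop in main.go: each file seeds a cluster
    // holding every other file within maxDist bits. O(n^2) comparisons.
    func clusters(files []fileEntry, maxDist int) [][]fileEntry {
        var out [][]fileEntry
        for i, file := range files {
            cluster := []fileEntry{file}
            for j, other := range files {
                if i != j && bits.OnesCount64(file.hash^other.hash) <= maxDist {
                    cluster = append(cluster, other)
                }
            }
            if len(cluster) > 1 {
                out = append(out, cluster)
            }
        }
        return out
    }

    func main() {
        files := []fileEntry{ // hypothetical entries
            {"a.jpg", 100, 0xF0F0F0F0F0F0F0F0},
            {"b.jpg", 101, 0xF0F0F0F0F0F0F0F3}, // 2 bits from a.jpg
            {"c.jpg", 102, 0x0F0F0F0F0F0F0F0F}, // far from both
        }
        for _, c := range clusters(files, 5) {
            for _, f := range c {
                fmt.Println(f.path)
            }
            fmt.Println() // a.jpg/b.jpg print twice, once per seed
        }
    }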