mirror of
https://github.com/JaCoB1123/dupe-finder.git
synced 2025-07-03 09:08:54 +02:00
Group by hamming distance
This commit is contained in:
24
file.go
24
file.go
@ -1,10 +1,7 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"crypto/sha1"
|
|
||||||
"encoding/base64"
|
|
||||||
"image/jpeg"
|
"image/jpeg"
|
||||||
"io"
|
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
@ -45,30 +42,31 @@ func moveButDontOvewrite(path string, targetPath string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func calculateHash(path string) (string, error) {
|
func calculateHash(path string) (uint64, error) {
|
||||||
f, err := os.Open(path)
|
f, err := os.Open(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return 0, err
|
||||||
}
|
}
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
|
|
||||||
if strings.HasSuffix(path, ".jpg") {
|
if strings.HasSuffix(path, ".jpg") {
|
||||||
img, err := jpeg.Decode(f)
|
img, err := jpeg.Decode(f)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return 0, err
|
||||||
}
|
}
|
||||||
hash, err := goimagehash.DifferenceHash(img)
|
hash, err := goimagehash.DifferenceHash(img)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
return hash.ToString(), nil
|
return hash.GetHash(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
h := sha1.New()
|
/* h := sha1.New()
|
||||||
if _, err := io.Copy(h, f); err != nil {
|
if _, err := io.Copy(h, f); err != nil {
|
||||||
return "", err
|
return 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil
|
return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil*/
|
||||||
|
return 0, nil
|
||||||
}
|
}
|
||||||
|
25
filesmap.go
25
filesmap.go
@ -14,9 +14,7 @@ import (
|
|||||||
|
|
||||||
// FilesMap is a struct for listing files by Size and Hash to search for duplicates
|
// FilesMap is a struct for listing files by Size and Hash to search for duplicates
|
||||||
type FilesMap struct {
|
type FilesMap struct {
|
||||||
FilesBySize map[int64]string
|
Files []fileEntry
|
||||||
|
|
||||||
FilesByHash map[string][]string
|
|
||||||
|
|
||||||
FilesHashing chan fileEntry
|
FilesHashing chan fileEntry
|
||||||
|
|
||||||
@ -33,8 +31,6 @@ type FilesMap struct {
|
|||||||
|
|
||||||
func newFilesMap() *FilesMap {
|
func newFilesMap() *FilesMap {
|
||||||
return &FilesMap{
|
return &FilesMap{
|
||||||
FilesBySize: map[int64]string{},
|
|
||||||
FilesByHash: map[string][]string{},
|
|
||||||
FilesHashed: make(chan fileEntry, 100000),
|
FilesHashed: make(chan fileEntry, 100000),
|
||||||
FilesHashing: make(chan fileEntry),
|
FilesHashing: make(chan fileEntry),
|
||||||
progress: mpb.New(mpb.WithWidth(64)),
|
progress: mpb.New(mpb.WithWidth(64)),
|
||||||
@ -68,7 +64,7 @@ func (fm *FilesMap) HashedWorker(done chan bool) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fm.lock.Lock()
|
fm.lock.Lock()
|
||||||
fm.FilesByHash[file.hash] = append(fm.FilesByHash[file.hash], file.path)
|
fm.Files = append(fm.Files, file)
|
||||||
fm.lock.Unlock()
|
fm.lock.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -100,22 +96,9 @@ func (fm *FilesMap) WalkDirectories() int {
|
|||||||
fmt.Println("Incoming", path)
|
fmt.Println("Incoming", path)
|
||||||
}
|
}
|
||||||
|
|
||||||
prevFile, ok := fm.FilesBySize[size]
|
|
||||||
if !ok {
|
|
||||||
fm.FilesBySize[size] = path
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if prevFile != "" {
|
|
||||||
sumSize += size
|
|
||||||
fm.FilesHashing <- fileEntry{prevFile, size, ""}
|
|
||||||
}
|
|
||||||
|
|
||||||
fm.FilesBySize[size] = ""
|
|
||||||
|
|
||||||
sumSize += size
|
sumSize += size
|
||||||
fm.hashingBar.SetTotal(int64(sumSize), false)
|
fm.hashingBar.SetTotal(int64(sumSize), false)
|
||||||
fm.FilesHashing <- fileEntry{path, info.Size(), ""}
|
fm.FilesHashing <- fileEntry{path, info.Size(), 0}
|
||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -128,5 +111,5 @@ func (fm *FilesMap) WalkDirectories() int {
|
|||||||
type fileEntry struct {
|
type fileEntry struct {
|
||||||
path string
|
path string
|
||||||
size int64
|
size int64
|
||||||
hash string
|
hash uint64
|
||||||
}
|
}
|
||||||
|
1
go.mod
1
go.mod
@ -11,5 +11,6 @@ require (
|
|||||||
github.com/mattn/go-runewidth v0.0.13 // indirect
|
github.com/mattn/go-runewidth v0.0.13 // indirect
|
||||||
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect
|
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect
|
||||||
github.com/rivo/uniseg v0.2.0 // indirect
|
github.com/rivo/uniseg v0.2.0 // indirect
|
||||||
|
github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3 // indirect
|
||||||
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c // indirect
|
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c // indirect
|
||||||
)
|
)
|
||||||
|
2
go.sum
2
go.sum
@ -10,6 +10,8 @@ github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6
|
|||||||
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
|
github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
|
||||||
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
|
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
|
||||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||||
|
github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3 h1:njlZPzLwU639dk2kqnCPPv+wNjq7Xb6EfUxe/oX0/NM=
|
||||||
|
github.com/steakknife/hamming v0.0.0-20180906055917-c99c65617cd3/go.mod h1:hpGUWaI9xL8pRQCTXQgocU38Qw1g0Us7n5PxxTwTCYU=
|
||||||
github.com/vbauerster/mpb/v7 v7.0.5 h1:/CQxyoPjdlON6kqqq3Uq3UUw5tFjuBCjOmLQYaYvBmM=
|
github.com/vbauerster/mpb/v7 v7.0.5 h1:/CQxyoPjdlON6kqqq3Uq3UUw5tFjuBCjOmLQYaYvBmM=
|
||||||
github.com/vbauerster/mpb/v7 v7.0.5/go.mod h1:emzg+wTChQAdJgyrDatWRHxji2AnmCrAemByOURuvZs=
|
github.com/vbauerster/mpb/v7 v7.0.5/go.mod h1:emzg+wTChQAdJgyrDatWRHxji2AnmCrAemByOURuvZs=
|
||||||
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c h1:F1jZWGFhYfh0Ci55sIpILtKKK8p3i2/krTr0H1rg74I=
|
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c h1:F1jZWGFhYfh0Ci55sIpILtKKK8p3i2/krTr0H1rg74I=
|
||||||
|
189
main.go
189
main.go
@ -1,20 +1,15 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
|
||||||
"encoding/json"
|
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io/ioutil"
|
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
|
||||||
"runtime"
|
"runtime"
|
||||||
"runtime/pprof"
|
"runtime/pprof"
|
||||||
"strconv"
|
|
||||||
"strings"
|
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
|
"github.com/steakknife/hamming"
|
||||||
"github.com/vbauerster/mpb/v7"
|
"github.com/vbauerster/mpb/v7"
|
||||||
"github.com/vbauerster/mpb/v7/decor"
|
"github.com/vbauerster/mpb/v7/decor"
|
||||||
)
|
)
|
||||||
@ -48,11 +43,11 @@ func main() {
|
|||||||
countFiles := 0
|
countFiles := 0
|
||||||
filesMap := newFilesMap()
|
filesMap := newFilesMap()
|
||||||
if *fromFile != "" {
|
if *fromFile != "" {
|
||||||
byteValue, _ := ioutil.ReadFile(*fromFile)
|
// byteValue, _ := ioutil.ReadFile(*fromFile)
|
||||||
err := json.Unmarshal(byteValue, &filesMap.FilesByHash)
|
// err := json.Unmarshal(byteValue, &filesMap.FilesByHash)
|
||||||
if err != nil {
|
// if err != nil {
|
||||||
panic(err)
|
// panic(err)
|
||||||
}
|
// }
|
||||||
} else {
|
} else {
|
||||||
filesMap.incomingBar = filesMap.progress.AddSpinner(0,
|
filesMap.incomingBar = filesMap.progress.AddSpinner(0,
|
||||||
mpb.PrependDecorators(
|
mpb.PrependDecorators(
|
||||||
@ -92,108 +87,124 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if *toFile != "" && *fromFile == "" {
|
if *toFile != "" && *fromFile == "" {
|
||||||
json, _ := json.MarshalIndent(filesMap.FilesByHash, "", " ")
|
// json, _ := json.MarshalIndent(filesMap.FilesByHash, "", " ")
|
||||||
ioutil.WriteFile(*toFile, json, 0644)
|
// ioutil.WriteFile(*toFile, json, 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
if *deleteDupesIn != "" {
|
if *deleteDupesIn != "" {
|
||||||
deleteIn := filepath.Clean(*deleteDupesIn)
|
/* deleteIn := filepath.Clean(*deleteDupesIn)
|
||||||
for hash := range filesMap.FilesByHash {
|
for hash := range filesMap.FilesByHash {
|
||||||
duplicateFiles := filesMap.FilesByHash[hash]
|
duplicateFiles := filesMap.FilesByHash[hash]
|
||||||
if len(duplicateFiles) <= 1 {
|
if len(duplicateFiles) <= 1 {
|
||||||
continue
|
continue
|
||||||
}
|
|
||||||
|
|
||||||
hasDupesInFolder := false
|
|
||||||
hasDupesOutsideFolder := false
|
|
||||||
for _, file := range duplicateFiles {
|
|
||||||
fileIsInFolder := strings.HasPrefix(filepath.Clean(file), deleteIn)
|
|
||||||
hasDupesOutsideFolder = hasDupesOutsideFolder || !fileIsInFolder
|
|
||||||
hasDupesInFolder = hasDupesInFolder || fileIsInFolder
|
|
||||||
}
|
|
||||||
|
|
||||||
if !hasDupesInFolder || !hasDupesOutsideFolder {
|
|
||||||
if !hasDupesOutsideFolder {
|
|
||||||
fmt.Println("Not deleting one of the following files, since all would be deleted")
|
|
||||||
}
|
|
||||||
if !hasDupesInFolder {
|
|
||||||
fmt.Println("Not deleting one of the following files, since none are in the selected directory")
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, file := range duplicateFiles {
|
|
||||||
fmt.Println("-", file)
|
|
||||||
}
|
|
||||||
fmt.Println()
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, file := range duplicateFiles {
|
|
||||||
if strings.HasPrefix(filepath.Clean(file), deleteIn) {
|
|
||||||
fmt.Println("Would delete ", file)
|
|
||||||
if *force {
|
|
||||||
remove(file)
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
hasDupesInFolder := false
|
||||||
}
|
hasDupesOutsideFolder := false
|
||||||
|
for _, file := range duplicateFiles {
|
||||||
|
fileIsInFolder := strings.HasPrefix(filepath.Clean(file), deleteIn)
|
||||||
|
hasDupesOutsideFolder = hasDupesOutsideFolder || !fileIsInFolder
|
||||||
|
hasDupesInFolder = hasDupesInFolder || fileIsInFolder
|
||||||
|
}
|
||||||
|
|
||||||
|
if !hasDupesInFolder || !hasDupesOutsideFolder {
|
||||||
|
if !hasDupesOutsideFolder {
|
||||||
|
fmt.Println("Not deleting one of the following files, since all would be deleted")
|
||||||
|
}
|
||||||
|
if !hasDupesInFolder {
|
||||||
|
fmt.Println("Not deleting one of the following files, since none are in the selected directory")
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, file := range duplicateFiles {
|
||||||
|
fmt.Println("-", file)
|
||||||
|
}
|
||||||
|
fmt.Println()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, file := range duplicateFiles {
|
||||||
|
if strings.HasPrefix(filepath.Clean(file), deleteIn) {
|
||||||
|
fmt.Println("Would delete ", file)
|
||||||
|
if *force {
|
||||||
|
remove(file)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}*/
|
||||||
} else if *promptForDelete {
|
} else if *promptForDelete {
|
||||||
reader := bufio.NewReader(os.Stdin)
|
/* reader := bufio.NewReader(os.Stdin)
|
||||||
for hash := range filesMap.FilesByHash {
|
for hash := range filesMap.FilesByHash {
|
||||||
duplicateFiles := filesMap.FilesByHash[hash]
|
duplicateFiles := filesMap.FilesByHash[hash]
|
||||||
if len(duplicateFiles) <= 1 {
|
if len(duplicateFiles) <= 1 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Print("\033[H\033[2J")
|
fmt.Print("\033[H\033[2J")
|
||||||
for i, file := range duplicateFiles {
|
for i, file := range duplicateFiles {
|
||||||
fmt.Println(i+1, file)
|
fmt.Println(i+1, file)
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Printf("Which file to keep? ")
|
fmt.Printf("Which file to keep? ")
|
||||||
input, err := reader.ReadString('\n')
|
input, err := reader.ReadString('\n')
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Println("Invalid input")
|
fmt.Println("Invalid input")
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
input = strings.TrimRight(input, "\n\r")
|
input = strings.TrimRight(input, "\n\r")
|
||||||
intInput, err := strconv.Atoi(input)
|
intInput, err := strconv.Atoi(input)
|
||||||
if err != nil || intInput > len(duplicateFiles) || intInput < 1 {
|
if err != nil || intInput > len(duplicateFiles) || intInput < 1 {
|
||||||
fmt.Println("Invalid input")
|
fmt.Println("Invalid input")
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
for i, file := range duplicateFiles {
|
for i, file := range duplicateFiles {
|
||||||
if i+1 == intInput {
|
if i+1 == intInput {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if *force {
|
if *force {
|
||||||
remove(file)
|
remove(file)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}*/
|
||||||
} else {
|
} else {
|
||||||
countInstances := 0
|
countInstances := 0
|
||||||
countDupeSets := 0
|
countDupeSets := 0
|
||||||
for hash := range filesMap.FilesByHash {
|
|
||||||
duplicateFiles := filesMap.FilesByHash[hash]
|
for fileIndex := range filesMap.Files {
|
||||||
if len(duplicateFiles) <= 1 {
|
var currentCluster []fileEntry
|
||||||
|
file := filesMap.Files[fileIndex]
|
||||||
|
currentCluster = append(currentCluster, filesMap.Files[fileIndex])
|
||||||
|
for otherIndex := range filesMap.Files {
|
||||||
|
if fileIndex == otherIndex {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
otherFile := filesMap.Files[otherIndex]
|
||||||
|
var distance = hamming.Uint64(file.hash, otherFile.hash)
|
||||||
|
if distance > 5 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
currentCluster = append(currentCluster, otherFile)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(currentCluster) <= 1 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
countDupeSets++
|
countDupeSets++
|
||||||
for _, file := range duplicateFiles {
|
for _, file := range currentCluster {
|
||||||
countInstances++
|
countInstances++
|
||||||
fmt.Println(file)
|
fmt.Println(file.path)
|
||||||
}
|
}
|
||||||
fmt.Println()
|
fmt.Println()
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Println("Statistics:")
|
fmt.Println("Statistics:")
|
||||||
fmt.Println(countFiles, "Files")
|
fmt.Println(countFiles, "Files")
|
||||||
fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
|
// fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
|
||||||
fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
|
// fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
|
||||||
fmt.Println(countInstances, "Duplicate Files")
|
fmt.Println(countInstances, "Duplicate Files")
|
||||||
fmt.Println(countDupeSets, "Duplicate Sets")
|
fmt.Println(countDupeSets, "Duplicate Sets")
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user