Reintroduce regular file handling

This commit is contained in:
Jan Bader 2023-12-06 15:47:49 +01:00
parent 6059baeeeb
commit f4f827b3e4
3 changed files with 128 additions and 54 deletions

37
file.go
View File

@ -1,11 +1,16 @@
package main package main
import ( import (
"image/jpeg" "crypto/sha1"
"encoding/base64"
"fmt"
"image"
_ "image/jpeg"
_ "image/png"
"io"
"os" "os"
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings"
"github.com/corona10/goimagehash" "github.com/corona10/goimagehash"
) )
@ -42,31 +47,45 @@ func moveButDontOvewrite(path string, targetPath string) {
} }
} }
func calculateHash(path string) (uint64, error) { func calculateImageHash(path string) (uint64, error) {
f, err := os.Open(path) f, err := os.Open(path)
if err != nil { if err != nil {
return 0, err return 0, err
} }
defer f.Close() defer f.Close()
if strings.HasSuffix(path, ".jpg") { fmt.Println("jh:", path, err)
img, err := jpeg.Decode(f) img, _, err := image.Decode(f)
fmt.Println("oh:", path, err)
if err != nil { if err != nil {
fmt.Println("kh:", path, err)
return 0, err return 0, err
} }
fmt.Println("lh:", path, err)
hash, err := goimagehash.DifferenceHash(img) hash, err := goimagehash.DifferenceHash(img)
if err != nil { if err != nil {
fmt.Println("mh:", path, err)
return 0, err return 0, err
} }
fmt.Println("nh:", path, err)
fmt.Println(path, hash.ToString())
return hash.GetHash(), nil return hash.GetHash(), nil
} }
/* h := sha1.New() func calculateFileHash(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
h := sha1.New()
if _, err := io.Copy(h, f); err != nil { if _, err := io.Copy(h, f); err != nil {
return 0, err return "", err
} }
return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil*/ stringHash := base64.RawStdEncoding.EncodeToString(h.Sum(nil))
return 0, nil fmt.Println(path, stringHash)
return stringHash, nil
} }

View File

@ -6,7 +6,6 @@ import (
"log" "log"
"os" "os"
"path/filepath" "path/filepath"
"strings"
"sync" "sync"
"github.com/vbauerster/mpb/v7" "github.com/vbauerster/mpb/v7"
@ -14,12 +13,20 @@ import (
// FilesMap is a struct for listing files by Size and Hash to search for duplicates // FilesMap is a struct for listing files by Size and Hash to search for duplicates
type FilesMap struct { type FilesMap struct {
Files []fileEntry Images []imageEntry
FilesBySize map[int64]string
FilesByHash map[string][]string
FilesHashing chan fileEntry FilesHashing chan fileEntry
FilesHashed chan fileEntry FilesHashed chan fileEntry
ImagesHashing chan imageEntry
ImagesHashed chan imageEntry
progress *mpb.Progress progress *mpb.Progress
incomingBar *mpb.Bar incomingBar *mpb.Bar
@ -31,22 +38,26 @@ type FilesMap struct {
func newFilesMap() *FilesMap { func newFilesMap() *FilesMap {
return &FilesMap{ return &FilesMap{
FilesBySize: map[int64]string{},
FilesByHash: map[string][]string{},
FilesHashed: make(chan fileEntry, 100000), FilesHashed: make(chan fileEntry, 100000),
FilesHashing: make(chan fileEntry), FilesHashing: make(chan fileEntry),
ImagesHashed: make(chan imageEntry, 100000),
ImagesHashing: make(chan imageEntry),
progress: mpb.New(mpb.WithWidth(64)), progress: mpb.New(mpb.WithWidth(64)),
} }
} }
func (fm *FilesMap) HashingWorker(wg *sync.WaitGroup) { func (fm *FilesMap) FileHashingWorker(wg *sync.WaitGroup) {
for file := range fm.FilesHashing { for file := range fm.FilesHashing {
if *verbose { if *verbose {
fmt.Println("Hashing", file.path) fmt.Println("Hashing file", file.path)
} }
hash, err := calculateHash(file.path) hash, err := calculateFileHash(file.path)
if err != nil { if err != nil {
log.Printf("Error calculating Hash for %s: %v\n", file.path, err) log.Printf("Error calculating Hash file for %s: %v\n", file.path, err)
continue continue
} }
@ -57,6 +68,25 @@ func (fm *FilesMap) HashingWorker(wg *sync.WaitGroup) {
wg.Done() wg.Done()
} }
func (fm *FilesMap) ImageHashingWorker(wg *sync.WaitGroup) {
for file := range fm.ImagesHashing {
if *verbose {
fmt.Println("Hashing image", file.path)
}
hash, err := calculateImageHash(file.path)
if err != nil {
log.Printf("Error calculating Hash for image %s: %v\n", file.path, err)
continue
}
file.imageHash = hash
fm.hashingBar.IncrInt64(file.size)
fm.ImagesHashed <- file
}
wg.Done()
}
func (fm *FilesMap) HashedWorker(done chan bool) { func (fm *FilesMap) HashedWorker(done chan bool) {
for file := range fm.FilesHashed { for file := range fm.FilesHashed {
if *verbose { if *verbose {
@ -64,7 +94,7 @@ func (fm *FilesMap) HashedWorker(done chan bool) {
} }
fm.lock.Lock() fm.lock.Lock()
fm.Files = append(fm.Files, file) fm.FilesByHash[file.hash] = append(fm.FilesByHash[file.hash], file.path)
fm.lock.Unlock() fm.lock.Unlock()
} }
@ -85,20 +115,14 @@ func (fm *FilesMap) WalkDirectories() int {
return nil return nil
} }
if !strings.HasSuffix(path, ".jpg") { fmt.Println("Walked past", path)
size = 123456789123456
}
fm.incomingBar.Increment()
countFiles++
fm.incomingBar.SetTotal(int64(countFiles), false)
if *verbose {
fmt.Println("Incoming", path)
}
sumSize += size sumSize += size
countFiles++
fm.incomingBar.SetTotal(int64(countFiles), false)
fm.hashingBar.SetTotal(int64(sumSize), false) fm.hashingBar.SetTotal(int64(sumSize), false)
fm.FilesHashing <- fileEntry{path, info.Size(), 0} fm.hashFile(path, size)
fm.hashImage(path, size)
return nil return nil
}) })
} }
@ -108,8 +132,38 @@ func (fm *FilesMap) WalkDirectories() int {
return countFiles return countFiles
} }
func (fm *FilesMap) hashFile(path string, size int64) {
prevFile, ok := fm.FilesBySize[size]
if !ok {
fm.FilesBySize[size] = path
return
}
if prevFile != "" {
fm.FilesHashing <- fileEntry{prevFile, size, ""}
}
fm.FilesBySize[size] = ""
fm.incomingBar.Increment()
if *verbose {
fmt.Println("Incoming", path)
}
fm.FilesHashing <- fileEntry{path, size, ""}
}
func (fm *FilesMap) hashImage(path string, size int64) {
fm.ImagesHashing <- imageEntry{path, size, 0}
}
type imageEntry struct {
path string
size int64
imageHash uint64
}
type fileEntry struct { type fileEntry struct {
path string path string
size int64 size int64
hash uint64 hash string
} }

23
main.go
View File

@ -73,8 +73,9 @@ func main() {
done := make(chan bool) done := make(chan bool)
wg := sync.WaitGroup{} wg := sync.WaitGroup{}
for i := 0; i < runtime.GOMAXPROCS(0); i++ { for i := 0; i < runtime.GOMAXPROCS(0); i++ {
wg.Add(1) wg.Add(2)
go filesMap.HashingWorker(&wg) go filesMap.ImageHashingWorker(&wg)
go filesMap.FileHashingWorker(&wg)
} }
go filesMap.HashedWorker(done) go filesMap.HashedWorker(done)
@ -172,17 +173,17 @@ func main() {
countInstances := 0 countInstances := 0
countDupeSets := 0 countDupeSets := 0
for fileIndex := range filesMap.Files { for fileIndex := range filesMap.Images {
var currentCluster []fileEntry var currentCluster []imageEntry
file := filesMap.Files[fileIndex] file := filesMap.Images[fileIndex]
currentCluster = append(currentCluster, filesMap.Files[fileIndex]) currentCluster = append(currentCluster, filesMap.Images[fileIndex])
for otherIndex := range filesMap.Files { for otherIndex := range filesMap.Images {
if fileIndex == otherIndex { if fileIndex == otherIndex {
continue continue
} }
otherFile := filesMap.Files[otherIndex] otherFile := filesMap.Images[otherIndex]
var distance = hamming.Uint64(file.hash, otherFile.hash) var distance = hamming.Uint64(file.imageHash, otherFile.imageHash)
if distance > 5 { if distance > 5 {
continue continue
} }
@ -206,8 +207,8 @@ func main() {
fmt.Println("Statistics:") fmt.Println("Statistics:")
fmt.Println(countFiles, "Files") fmt.Println(countFiles, "Files")
// fmt.Println(len(filesMap.FilesBySize), "Unique Sizes") fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
// fmt.Println(len(filesMap.FilesByHash), "Unique Hashes") fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
fmt.Println(countInstances, "Duplicate Files") fmt.Println(countInstances, "Duplicate Files")
fmt.Println(countDupeSets, "Duplicate Sets") fmt.Println(countDupeSets, "Duplicate Sets")
} }