mirror of
https://github.com/JaCoB1123/dupe-finder.git
synced 2025-05-18 06:01:56 +02:00
Reintroduce regular file handling
This commit is contained in:
parent
6059baeeeb
commit
f4f827b3e4
61
file.go
61
file.go
@ -1,11 +1,16 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"image/jpeg"
|
"crypto/sha1"
|
||||||
|
"encoding/base64"
|
||||||
|
"fmt"
|
||||||
|
"image"
|
||||||
|
_ "image/jpeg"
|
||||||
|
_ "image/png"
|
||||||
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/corona10/goimagehash"
|
"github.com/corona10/goimagehash"
|
||||||
)
|
)
|
||||||
@ -42,31 +47,45 @@ func moveButDontOvewrite(path string, targetPath string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func calculateHash(path string) (uint64, error) {
|
func calculateImageHash(path string) (uint64, error) {
|
||||||
f, err := os.Open(path)
|
f, err := os.Open(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, err
|
return 0, err
|
||||||
}
|
}
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
|
|
||||||
if strings.HasSuffix(path, ".jpg") {
|
fmt.Println("jh:", path, err)
|
||||||
img, err := jpeg.Decode(f)
|
img, _, err := image.Decode(f)
|
||||||
if err != nil {
|
fmt.Println("oh:", path, err)
|
||||||
return 0, err
|
if err != nil {
|
||||||
}
|
fmt.Println("kh:", path, err)
|
||||||
hash, err := goimagehash.DifferenceHash(img)
|
return 0, err
|
||||||
if err != nil {
|
}
|
||||||
return 0, err
|
fmt.Println("lh:", path, err)
|
||||||
}
|
hash, err := goimagehash.DifferenceHash(img)
|
||||||
|
if err != nil {
|
||||||
return hash.GetHash(), nil
|
fmt.Println("mh:", path, err)
|
||||||
|
return 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
/* h := sha1.New()
|
fmt.Println("nh:", path, err)
|
||||||
if _, err := io.Copy(h, f); err != nil {
|
fmt.Println(path, hash.ToString())
|
||||||
return 0, err
|
return hash.GetHash(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil*/
|
func calculateFileHash(path string) (string, error) {
|
||||||
return 0, nil
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
h := sha1.New()
|
||||||
|
if _, err := io.Copy(h, f); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
stringHash := base64.RawStdEncoding.EncodeToString(h.Sum(nil))
|
||||||
|
fmt.Println(path, stringHash)
|
||||||
|
return stringHash, nil
|
||||||
}
|
}
|
||||||
|
98
filesmap.go
98
filesmap.go
@ -6,7 +6,6 @@ import (
|
|||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
"github.com/vbauerster/mpb/v7"
|
"github.com/vbauerster/mpb/v7"
|
||||||
@ -14,12 +13,20 @@ import (
|
|||||||
|
|
||||||
// FilesMap is a struct for listing files by Size and Hash to search for duplicates
|
// FilesMap is a struct for listing files by Size and Hash to search for duplicates
|
||||||
type FilesMap struct {
|
type FilesMap struct {
|
||||||
Files []fileEntry
|
Images []imageEntry
|
||||||
|
|
||||||
|
FilesBySize map[int64]string
|
||||||
|
|
||||||
|
FilesByHash map[string][]string
|
||||||
|
|
||||||
FilesHashing chan fileEntry
|
FilesHashing chan fileEntry
|
||||||
|
|
||||||
FilesHashed chan fileEntry
|
FilesHashed chan fileEntry
|
||||||
|
|
||||||
|
ImagesHashing chan imageEntry
|
||||||
|
|
||||||
|
ImagesHashed chan imageEntry
|
||||||
|
|
||||||
progress *mpb.Progress
|
progress *mpb.Progress
|
||||||
|
|
||||||
incomingBar *mpb.Bar
|
incomingBar *mpb.Bar
|
||||||
@ -31,22 +38,26 @@ type FilesMap struct {
|
|||||||
|
|
||||||
func newFilesMap() *FilesMap {
|
func newFilesMap() *FilesMap {
|
||||||
return &FilesMap{
|
return &FilesMap{
|
||||||
FilesHashed: make(chan fileEntry, 100000),
|
FilesBySize: map[int64]string{},
|
||||||
FilesHashing: make(chan fileEntry),
|
FilesByHash: map[string][]string{},
|
||||||
progress: mpb.New(mpb.WithWidth(64)),
|
FilesHashed: make(chan fileEntry, 100000),
|
||||||
|
FilesHashing: make(chan fileEntry),
|
||||||
|
ImagesHashed: make(chan imageEntry, 100000),
|
||||||
|
ImagesHashing: make(chan imageEntry),
|
||||||
|
progress: mpb.New(mpb.WithWidth(64)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (fm *FilesMap) HashingWorker(wg *sync.WaitGroup) {
|
func (fm *FilesMap) FileHashingWorker(wg *sync.WaitGroup) {
|
||||||
for file := range fm.FilesHashing {
|
for file := range fm.FilesHashing {
|
||||||
if *verbose {
|
if *verbose {
|
||||||
fmt.Println("Hashing", file.path)
|
fmt.Println("Hashing file", file.path)
|
||||||
}
|
}
|
||||||
|
|
||||||
hash, err := calculateHash(file.path)
|
hash, err := calculateFileHash(file.path)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Error calculating Hash for %s: %v\n", file.path, err)
|
log.Printf("Error calculating Hash file for %s: %v\n", file.path, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -57,6 +68,25 @@ func (fm *FilesMap) HashingWorker(wg *sync.WaitGroup) {
|
|||||||
wg.Done()
|
wg.Done()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (fm *FilesMap) ImageHashingWorker(wg *sync.WaitGroup) {
|
||||||
|
for file := range fm.ImagesHashing {
|
||||||
|
if *verbose {
|
||||||
|
fmt.Println("Hashing image", file.path)
|
||||||
|
}
|
||||||
|
|
||||||
|
hash, err := calculateImageHash(file.path)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("Error calculating Hash for image %s: %v\n", file.path, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
file.imageHash = hash
|
||||||
|
fm.hashingBar.IncrInt64(file.size)
|
||||||
|
fm.ImagesHashed <- file
|
||||||
|
}
|
||||||
|
wg.Done()
|
||||||
|
}
|
||||||
|
|
||||||
func (fm *FilesMap) HashedWorker(done chan bool) {
|
func (fm *FilesMap) HashedWorker(done chan bool) {
|
||||||
for file := range fm.FilesHashed {
|
for file := range fm.FilesHashed {
|
||||||
if *verbose {
|
if *verbose {
|
||||||
@ -64,7 +94,7 @@ func (fm *FilesMap) HashedWorker(done chan bool) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fm.lock.Lock()
|
fm.lock.Lock()
|
||||||
fm.Files = append(fm.Files, file)
|
fm.FilesByHash[file.hash] = append(fm.FilesByHash[file.hash], file.path)
|
||||||
fm.lock.Unlock()
|
fm.lock.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -85,20 +115,14 @@ func (fm *FilesMap) WalkDirectories() int {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if !strings.HasSuffix(path, ".jpg") {
|
fmt.Println("Walked past", path)
|
||||||
size = 123456789123456
|
|
||||||
}
|
|
||||||
|
|
||||||
fm.incomingBar.Increment()
|
|
||||||
countFiles++
|
|
||||||
fm.incomingBar.SetTotal(int64(countFiles), false)
|
|
||||||
if *verbose {
|
|
||||||
fmt.Println("Incoming", path)
|
|
||||||
}
|
|
||||||
|
|
||||||
sumSize += size
|
sumSize += size
|
||||||
|
countFiles++
|
||||||
|
fm.incomingBar.SetTotal(int64(countFiles), false)
|
||||||
fm.hashingBar.SetTotal(int64(sumSize), false)
|
fm.hashingBar.SetTotal(int64(sumSize), false)
|
||||||
fm.FilesHashing <- fileEntry{path, info.Size(), 0}
|
fm.hashFile(path, size)
|
||||||
|
fm.hashImage(path, size)
|
||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -108,8 +132,38 @@ func (fm *FilesMap) WalkDirectories() int {
|
|||||||
return countFiles
|
return countFiles
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (fm *FilesMap) hashFile(path string, size int64) {
|
||||||
|
prevFile, ok := fm.FilesBySize[size]
|
||||||
|
if !ok {
|
||||||
|
fm.FilesBySize[size] = path
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if prevFile != "" {
|
||||||
|
fm.FilesHashing <- fileEntry{prevFile, size, ""}
|
||||||
|
}
|
||||||
|
|
||||||
|
fm.FilesBySize[size] = ""
|
||||||
|
fm.incomingBar.Increment()
|
||||||
|
if *verbose {
|
||||||
|
fmt.Println("Incoming", path)
|
||||||
|
}
|
||||||
|
|
||||||
|
fm.FilesHashing <- fileEntry{path, size, ""}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (fm *FilesMap) hashImage(path string, size int64) {
|
||||||
|
fm.ImagesHashing <- imageEntry{path, size, 0}
|
||||||
|
}
|
||||||
|
|
||||||
|
type imageEntry struct {
|
||||||
|
path string
|
||||||
|
size int64
|
||||||
|
imageHash uint64
|
||||||
|
}
|
||||||
|
|
||||||
type fileEntry struct {
|
type fileEntry struct {
|
||||||
path string
|
path string
|
||||||
size int64
|
size int64
|
||||||
hash uint64
|
hash string
|
||||||
}
|
}
|
||||||
|
23
main.go
23
main.go
@ -73,8 +73,9 @@ func main() {
|
|||||||
done := make(chan bool)
|
done := make(chan bool)
|
||||||
wg := sync.WaitGroup{}
|
wg := sync.WaitGroup{}
|
||||||
for i := 0; i < runtime.GOMAXPROCS(0); i++ {
|
for i := 0; i < runtime.GOMAXPROCS(0); i++ {
|
||||||
wg.Add(1)
|
wg.Add(2)
|
||||||
go filesMap.HashingWorker(&wg)
|
go filesMap.ImageHashingWorker(&wg)
|
||||||
|
go filesMap.FileHashingWorker(&wg)
|
||||||
}
|
}
|
||||||
|
|
||||||
go filesMap.HashedWorker(done)
|
go filesMap.HashedWorker(done)
|
||||||
@ -172,17 +173,17 @@ func main() {
|
|||||||
countInstances := 0
|
countInstances := 0
|
||||||
countDupeSets := 0
|
countDupeSets := 0
|
||||||
|
|
||||||
for fileIndex := range filesMap.Files {
|
for fileIndex := range filesMap.Images {
|
||||||
var currentCluster []fileEntry
|
var currentCluster []imageEntry
|
||||||
file := filesMap.Files[fileIndex]
|
file := filesMap.Images[fileIndex]
|
||||||
currentCluster = append(currentCluster, filesMap.Files[fileIndex])
|
currentCluster = append(currentCluster, filesMap.Images[fileIndex])
|
||||||
for otherIndex := range filesMap.Files {
|
for otherIndex := range filesMap.Images {
|
||||||
if fileIndex == otherIndex {
|
if fileIndex == otherIndex {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
otherFile := filesMap.Files[otherIndex]
|
otherFile := filesMap.Images[otherIndex]
|
||||||
var distance = hamming.Uint64(file.hash, otherFile.hash)
|
var distance = hamming.Uint64(file.imageHash, otherFile.imageHash)
|
||||||
if distance > 5 {
|
if distance > 5 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -206,8 +207,8 @@ func main() {
|
|||||||
|
|
||||||
fmt.Println("Statistics:")
|
fmt.Println("Statistics:")
|
||||||
fmt.Println(countFiles, "Files")
|
fmt.Println(countFiles, "Files")
|
||||||
// fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
|
fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
|
||||||
// fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
|
fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
|
||||||
fmt.Println(countInstances, "Duplicate Files")
|
fmt.Println(countInstances, "Duplicate Files")
|
||||||
fmt.Println(countDupeSets, "Duplicate Sets")
|
fmt.Println(countDupeSets, "Duplicate Sets")
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user