mirror of
				https://github.com/JaCoB1123/dupe-finder.git
				synced 2025-11-04 12:38:29 +01:00 
			
		
		
		
	Reintroduce regular file handling
This commit is contained in:
		
							
								
								
									
										47
									
								
								file.go
									
									
									
									
									
								
							
							
						
						
									
										47
									
								
								file.go
									
									
									
									
									
								
							@@ -1,11 +1,16 @@
 | 
			
		||||
package main
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"image/jpeg"
 | 
			
		||||
	"crypto/sha1"
 | 
			
		||||
	"encoding/base64"
 | 
			
		||||
	"fmt"
 | 
			
		||||
	"image"
 | 
			
		||||
	_ "image/jpeg"
 | 
			
		||||
	_ "image/png"
 | 
			
		||||
	"io"
 | 
			
		||||
	"os"
 | 
			
		||||
	"path/filepath"
 | 
			
		||||
	"strconv"
 | 
			
		||||
	"strings"
 | 
			
		||||
 | 
			
		||||
	"github.com/corona10/goimagehash"
 | 
			
		||||
)
 | 
			
		||||
@@ -42,31 +47,45 @@ func moveButDontOvewrite(path string, targetPath string) {
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func calculateHash(path string) (uint64, error) {
 | 
			
		||||
func calculateImageHash(path string) (uint64, error) {
 | 
			
		||||
	f, err := os.Open(path)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		return 0, err
 | 
			
		||||
	}
 | 
			
		||||
	defer f.Close()
 | 
			
		||||
 | 
			
		||||
	if strings.HasSuffix(path, ".jpg") {
 | 
			
		||||
		img, err := jpeg.Decode(f)
 | 
			
		||||
	fmt.Println("jh:", path, err)
 | 
			
		||||
	img, _, err := image.Decode(f)
 | 
			
		||||
	fmt.Println("oh:", path, err)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		fmt.Println("kh:", path, err)
 | 
			
		||||
		return 0, err
 | 
			
		||||
	}
 | 
			
		||||
	fmt.Println("lh:", path, err)
 | 
			
		||||
	hash, err := goimagehash.DifferenceHash(img)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		fmt.Println("mh:", path, err)
 | 
			
		||||
		return 0, err
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	fmt.Println("nh:", path, err)
 | 
			
		||||
	fmt.Println(path, hash.ToString())
 | 
			
		||||
	return hash.GetHash(), nil
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/*	h := sha1.New()
 | 
			
		||||
		if _, err := io.Copy(h, f); err != nil {
 | 
			
		||||
			return 0, err
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil*/
 | 
			
		||||
	return 0, nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// calculateFileHash streams the file at path through SHA-1 and returns the
// digest encoded with unpadded standard base64.
//
// The path and its hash are printed to stdout on success. On failure an empty
// string is returned and the underlying error is wrapped with %w, so callers
// can still match it with errors.Is / errors.As.
func calculateFileHash(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", fmt.Errorf("opening file for hashing: %w", err)
	}
	defer f.Close()

	// io.Copy streams the content, so memory use does not grow with file size.
	h := sha1.New()
	if _, err := io.Copy(h, f); err != nil {
		return "", fmt.Errorf("reading file for hashing: %w", err)
	}

	stringHash := base64.RawStdEncoding.EncodeToString(h.Sum(nil))
	fmt.Println(path, stringHash)
	return stringHash, nil
}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										92
									
								
								filesmap.go
									
									
									
									
									
								
							
							
						
						
									
										92
									
								
								filesmap.go
									
									
									
									
									
								
							@@ -6,7 +6,6 @@ import (
 | 
			
		||||
	"log"
 | 
			
		||||
	"os"
 | 
			
		||||
	"path/filepath"
 | 
			
		||||
	"strings"
 | 
			
		||||
	"sync"
 | 
			
		||||
 | 
			
		||||
	"github.com/vbauerster/mpb/v7"
 | 
			
		||||
@@ -14,12 +13,20 @@ import (
 | 
			
		||||
 | 
			
		||||
// FilesMap is a struct for listing files by Size and Hash to search for duplicates
 | 
			
		||||
type FilesMap struct {
 | 
			
		||||
	Files []fileEntry
 | 
			
		||||
	Images []imageEntry
 | 
			
		||||
 | 
			
		||||
	FilesBySize map[int64]string
 | 
			
		||||
 | 
			
		||||
	FilesByHash map[string][]string
 | 
			
		||||
 | 
			
		||||
	FilesHashing chan fileEntry
 | 
			
		||||
 | 
			
		||||
	FilesHashed chan fileEntry
 | 
			
		||||
 | 
			
		||||
	ImagesHashing chan imageEntry
 | 
			
		||||
 | 
			
		||||
	ImagesHashed chan imageEntry
 | 
			
		||||
 | 
			
		||||
	progress *mpb.Progress
 | 
			
		||||
 | 
			
		||||
	incomingBar *mpb.Bar
 | 
			
		||||
@@ -31,22 +38,26 @@ type FilesMap struct {
 | 
			
		||||
 | 
			
		||||
func newFilesMap() *FilesMap {
 | 
			
		||||
	return &FilesMap{
 | 
			
		||||
		FilesBySize:   map[int64]string{},
 | 
			
		||||
		FilesByHash:   map[string][]string{},
 | 
			
		||||
		FilesHashed:   make(chan fileEntry, 100000),
 | 
			
		||||
		FilesHashing:  make(chan fileEntry),
 | 
			
		||||
		ImagesHashed:  make(chan imageEntry, 100000),
 | 
			
		||||
		ImagesHashing: make(chan imageEntry),
 | 
			
		||||
		progress:      mpb.New(mpb.WithWidth(64)),
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (fm *FilesMap) HashingWorker(wg *sync.WaitGroup) {
 | 
			
		||||
func (fm *FilesMap) FileHashingWorker(wg *sync.WaitGroup) {
 | 
			
		||||
	for file := range fm.FilesHashing {
 | 
			
		||||
		if *verbose {
 | 
			
		||||
			fmt.Println("Hashing", file.path)
 | 
			
		||||
			fmt.Println("Hashing file", file.path)
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		hash, err := calculateHash(file.path)
 | 
			
		||||
		hash, err := calculateFileHash(file.path)
 | 
			
		||||
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			log.Printf("Error calculating Hash for %s: %v\n", file.path, err)
 | 
			
		||||
			log.Printf("Error calculating Hash file for %s: %v\n", file.path, err)
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
@@ -57,6 +68,25 @@ func (fm *FilesMap) HashingWorker(wg *sync.WaitGroup) {
 | 
			
		||||
	wg.Done()
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (fm *FilesMap) ImageHashingWorker(wg *sync.WaitGroup) {
 | 
			
		||||
	for file := range fm.ImagesHashing {
 | 
			
		||||
		if *verbose {
 | 
			
		||||
			fmt.Println("Hashing image", file.path)
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		hash, err := calculateImageHash(file.path)
 | 
			
		||||
		if err != nil {
 | 
			
		||||
			log.Printf("Error calculating Hash for image %s: %v\n", file.path, err)
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		file.imageHash = hash
 | 
			
		||||
		fm.hashingBar.IncrInt64(file.size)
 | 
			
		||||
		fm.ImagesHashed <- file
 | 
			
		||||
	}
 | 
			
		||||
	wg.Done()
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (fm *FilesMap) HashedWorker(done chan bool) {
 | 
			
		||||
	for file := range fm.FilesHashed {
 | 
			
		||||
		if *verbose {
 | 
			
		||||
@@ -64,7 +94,7 @@ func (fm *FilesMap) HashedWorker(done chan bool) {
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		fm.lock.Lock()
 | 
			
		||||
		fm.Files = append(fm.Files, file)
 | 
			
		||||
		fm.FilesByHash[file.hash] = append(fm.FilesByHash[file.hash], file.path)
 | 
			
		||||
		fm.lock.Unlock()
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
@@ -85,20 +115,14 @@ func (fm *FilesMap) WalkDirectories() int {
 | 
			
		||||
				return nil
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
			if !strings.HasSuffix(path, ".jpg") {
 | 
			
		||||
				size = 123456789123456
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
			fm.incomingBar.Increment()
 | 
			
		||||
			countFiles++
 | 
			
		||||
			fm.incomingBar.SetTotal(int64(countFiles), false)
 | 
			
		||||
			if *verbose {
 | 
			
		||||
				fmt.Println("Incoming", path)
 | 
			
		||||
			}
 | 
			
		||||
			fmt.Println("Walked past", path)
 | 
			
		||||
 | 
			
		||||
			sumSize += size
 | 
			
		||||
			countFiles++
 | 
			
		||||
			fm.incomingBar.SetTotal(int64(countFiles), false)
 | 
			
		||||
			fm.hashingBar.SetTotal(int64(sumSize), false)
 | 
			
		||||
			fm.FilesHashing <- fileEntry{path, info.Size(), 0}
 | 
			
		||||
			fm.hashFile(path, size)
 | 
			
		||||
			fm.hashImage(path, size)
 | 
			
		||||
			return nil
 | 
			
		||||
		})
 | 
			
		||||
	}
 | 
			
		||||
@@ -108,8 +132,38 @@ func (fm *FilesMap) WalkDirectories() int {
 | 
			
		||||
	return countFiles
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (fm *FilesMap) hashFile(path string, size int64) {
 | 
			
		||||
	prevFile, ok := fm.FilesBySize[size]
 | 
			
		||||
	if !ok {
 | 
			
		||||
		fm.FilesBySize[size] = path
 | 
			
		||||
		return
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if prevFile != "" {
 | 
			
		||||
		fm.FilesHashing <- fileEntry{prevFile, size, ""}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	fm.FilesBySize[size] = ""
 | 
			
		||||
	fm.incomingBar.Increment()
 | 
			
		||||
	if *verbose {
 | 
			
		||||
		fmt.Println("Incoming", path)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	fm.FilesHashing <- fileEntry{path, size, ""}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (fm *FilesMap) hashImage(path string, size int64) {
 | 
			
		||||
	fm.ImagesHashing <- imageEntry{path, size, 0}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// imageEntry describes one file queued for, or returned from, image hashing.
type imageEntry struct {
	// path is the location passed to calculateImageHash.
	path string
	// size is the amount by which the hashing bar advances once the entry is hashed.
	size int64
	// imageHash is zero when queued and set by ImageHashingWorker on success.
	imageHash uint64
}
 | 
			
		||||
 | 
			
		||||
// fileEntry describes one regular file queued for, or returned from, content
// hashing.
type fileEntry struct {
	// path is the location of the file on disk.
	path string
	// size is the file size recorded during the directory walk.
	size int64
	// hash is "" when the entry is queued by hashFile and is used by
	// HashedWorker as the key into FilesByHash.
	// NOTE(review): presumably filled from calculateFileHash by
	// FileHashingWorker — the assignment is not visible in this view.
	hash string
}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										23
									
								
								main.go
									
									
									
									
									
								
							
							
						
						
									
										23
									
								
								main.go
									
									
									
									
									
								
							@@ -73,8 +73,9 @@ func main() {
 | 
			
		||||
		done := make(chan bool)
 | 
			
		||||
		wg := sync.WaitGroup{}
 | 
			
		||||
		for i := 0; i < runtime.GOMAXPROCS(0); i++ {
 | 
			
		||||
			wg.Add(1)
 | 
			
		||||
			go filesMap.HashingWorker(&wg)
 | 
			
		||||
			wg.Add(2)
 | 
			
		||||
			go filesMap.ImageHashingWorker(&wg)
 | 
			
		||||
			go filesMap.FileHashingWorker(&wg)
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		go filesMap.HashedWorker(done)
 | 
			
		||||
@@ -172,17 +173,17 @@ func main() {
 | 
			
		||||
		countInstances := 0
 | 
			
		||||
		countDupeSets := 0
 | 
			
		||||
 | 
			
		||||
		for fileIndex := range filesMap.Files {
 | 
			
		||||
			var currentCluster []fileEntry
 | 
			
		||||
			file := filesMap.Files[fileIndex]
 | 
			
		||||
			currentCluster = append(currentCluster, filesMap.Files[fileIndex])
 | 
			
		||||
			for otherIndex := range filesMap.Files {
 | 
			
		||||
		for fileIndex := range filesMap.Images {
 | 
			
		||||
			var currentCluster []imageEntry
 | 
			
		||||
			file := filesMap.Images[fileIndex]
 | 
			
		||||
			currentCluster = append(currentCluster, filesMap.Images[fileIndex])
 | 
			
		||||
			for otherIndex := range filesMap.Images {
 | 
			
		||||
				if fileIndex == otherIndex {
 | 
			
		||||
					continue
 | 
			
		||||
				}
 | 
			
		||||
 | 
			
		||||
				otherFile := filesMap.Files[otherIndex]
 | 
			
		||||
				var distance = hamming.Uint64(file.hash, otherFile.hash)
 | 
			
		||||
				otherFile := filesMap.Images[otherIndex]
 | 
			
		||||
				var distance = hamming.Uint64(file.imageHash, otherFile.imageHash)
 | 
			
		||||
				if distance > 5 {
 | 
			
		||||
					continue
 | 
			
		||||
				}
 | 
			
		||||
@@ -206,8 +207,8 @@ func main() {
 | 
			
		||||
 | 
			
		||||
		fmt.Println("Statistics:")
 | 
			
		||||
		fmt.Println(countFiles, "Files")
 | 
			
		||||
		//		fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
 | 
			
		||||
		//		fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
 | 
			
		||||
		fmt.Println(len(filesMap.FilesBySize), "Unique Sizes")
 | 
			
		||||
		fmt.Println(len(filesMap.FilesByHash), "Unique Hashes")
 | 
			
		||||
		fmt.Println(countInstances, "Duplicate Files")
 | 
			
		||||
		fmt.Println(countDupeSets, "Duplicate Sets")
 | 
			
		||||
	}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user