package main import ( "bufio" "encoding/json" "flag" "fmt" "io/ioutil" "log" "os" "path/filepath" "runtime" "runtime/pprof" "strconv" "strings" "sync" "slices" "github.com/steakknife/hamming" "github.com/vbauerster/mpb/v8" "github.com/vbauerster/mpb/v8/decor" ) var fromFile = flag.String("from-file", "", "Load results file from ") var toFile = flag.String("to-file", "", "Save results to ") var deleteDupesIn = flag.String("delete-dupes-in", "", "Delete duplicates if they are contained in ") var promptForDelete = flag.Bool("delete-prompt", false, "Ask which file to keep for each dupe-set") var moveToFolder = flag.String("move-files", "", "Move files to instead of deleting them") var minSize = flag.Int64("min-size", -1, "Ignore all files smaller than in Bytes") var force = flag.Bool("force", false, "Actually delete files. Without this options, the files to be deleted are only printed") var verbose = flag.Bool("verbose", false, "Output additional information") var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file") func main() { flag.Parse() if *cpuprofile != "" { f, err := os.Create(*cpuprofile) if err != nil { log.Fatal(err) } pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } if *verbose { printConfiguration() } var countFiles int64 = 0 filesMap := newFilesMap() if *fromFile != "" { byteValue, _ := ioutil.ReadFile(*fromFile) err := json.Unmarshal(byteValue, &filesMap.FilesByHash) if err != nil { panic(err) } } else { filesMap.incomingBar = filesMap.progress.AddSpinner(0, mpb.PrependDecorators( decor.Name("Finding files "), decor.Elapsed(decor.ET_STYLE_HHMMSS), ), mpb.AppendDecorators( decor.AverageSpeed(0, "%8.2f"), decor.Name(" "), decor.CurrentNoUnit("%5d"), ), ) filesMap.fileHashingBar = filesMap.progress.AddBar(0, mpb.PrependDecorators( decor.Name("Hashing files "), decor.Elapsed(decor.ET_STYLE_HHMMSS), ), mpb.AppendDecorators( decor.AverageSpeed(decor.SizeB1024(0), "%23.2f"), decor.Name(" "), decor.CurrentKibiByte("%5d"), ), ) filesMap.imageHashingBar = filesMap.progress.AddBar(0, mpb.PrependDecorators( decor.Name("Hashing images "), decor.Elapsed(decor.ET_STYLE_HHMMSS), ), mpb.AppendDecorators( decor.AverageSpeed(decor.SizeB1024(0), "%23.2f"), decor.Name(" "), decor.CurrentKibiByte("%5d"), ), ) done := make(chan bool) wg := sync.WaitGroup{} for i := 0; i < runtime.GOMAXPROCS(0); i++ { wg.Add(2) go filesMap.ImageHashingWorker(&wg) go filesMap.FileHashingWorker(&wg) } go filesMap.HashedWorker(done) countFiles = filesMap.WalkDirectories() wg.Wait() close(filesMap.FilesHashed) close(filesMap.ImagesHashed) <-done } if *toFile != "" && *fromFile == "" { json, _ := json.MarshalIndent(filesMap.FilesByHash, "", " ") ioutil.WriteFile(*toFile, json, 0644) } if *deleteDupesIn != "" { deleteIn := filepath.Clean(*deleteDupesIn) for hash := range filesMap.FilesByHash { duplicateFiles := filesMap.FilesByHash[hash] if len(duplicateFiles) <= 1 { continue } hasDupesInFolder := false hasDupesOutsideFolder := false for _, file := range duplicateFiles { fileIsInFolder := strings.HasPrefix(filepath.Clean(file), deleteIn) hasDupesOutsideFolder = hasDupesOutsideFolder || !fileIsInFolder hasDupesInFolder = hasDupesInFolder || fileIsInFolder } if !hasDupesInFolder || !hasDupesOutsideFolder { if !hasDupesOutsideFolder { fmt.Println("Not deleting one of the following files, since all would be deleted") } if !hasDupesInFolder { fmt.Println("Not deleting one of the following files, since none are in the selected directory") } for _, file := range duplicateFiles { fmt.Println("-", file) } fmt.Println() continue } for _, file := range duplicateFiles { if strings.HasPrefix(filepath.Clean(file), deleteIn) { fmt.Println("Would delete ", file) if *force { remove(file) } } } } } else if *promptForDelete { reader := bufio.NewReader(os.Stdin) for hash := range filesMap.FilesByHash { duplicateFiles := filesMap.FilesByHash[hash] if len(duplicateFiles) <= 1 { continue } fmt.Print("\033[H\033[2J") for i, file := range duplicateFiles { fmt.Println(i+1, file) } fmt.Printf("Which file to keep? ") input, err := reader.ReadString('\n') if err != nil { fmt.Println("Invalid input") continue } input = strings.TrimRight(input, "\n\r") intInput, err := strconv.Atoi(input) if err != nil || intInput > len(duplicateFiles) || intInput < 1 { fmt.Println("Invalid input") continue } for i, file := range duplicateFiles { if i+1 == intInput { continue } if *force { remove(file) } } } } else { countInstances := 0 countDupeSets := 0 fmt.Println("Files that are binary identical:") for hash := range filesMap.FilesByHash { duplicateFiles := filesMap.FilesByHash[hash] if len(duplicateFiles) <= 1 { continue } countDupeSets++ for _, file := range duplicateFiles { countInstances++ fmt.Println(file) } fmt.Println() } fmt.Println("Images that are similar:") for len(filesMap.Images) > 0 { file := filesMap.Images[0] filesMap.Images = slices.Delete(filesMap.Images, 0, 1) var currentCluster []imageEntry currentCluster = append(currentCluster, file) for otherIndex := len(filesMap.Images) - 1; otherIndex >= 0; otherIndex-- { otherFile := filesMap.Images[otherIndex] var distance = hamming.Uint64(file.imageHash, otherFile.imageHash) if distance > 5 { continue } filesMap.Images = slices.Delete(filesMap.Images, otherIndex, otherIndex+1) if len(currentCluster) == 1 { fmt.Println(currentCluster[0].path) countDupeSets++ countInstances++ } currentCluster = append(currentCluster, otherFile) fmt.Println(otherFile.path, distance) countInstances++ } if len(currentCluster) <= 1 { continue } fmt.Println() } fmt.Println("Statistics:") fmt.Println(countFiles, "Files") fmt.Println(len(filesMap.FilesBySize), "Unique Sizes") fmt.Println(len(filesMap.FilesByHash), "Unique Hashes") fmt.Println(countInstances, "Duplicate Files") fmt.Println(countDupeSets, "Duplicate Sets") } } func printConfiguration() { fmt.Printf("fromFile: \"%v\"\n", *fromFile) fmt.Printf("toFile: \"%v\"\n", *toFile) fmt.Printf("deleteDupesIn: \"%v\"\n", *deleteDupesIn) fmt.Printf("force: \"%v\"\n", *force) fmt.Println("Searching paths:") for _, path := range flag.Args() { fmt.Println("- ", path) } fmt.Println() fmt.Println() }