11 Commits
0.7 ... 0.8

Author SHA1 Message Date
594a88c3ec Add Flag to enable cpu profiling 2020-11-22 01:23:07 +01:00
29fa093184 Use SHA1 for better performance 2020-11-22 01:22:52 +01:00
ff2d4daeda Only remember last file 2020-11-22 01:09:02 +01:00
e33d7e2ca0 Add WaitGroup for HashingWorker 2020-11-22 01:02:07 +01:00
a3fa3d4e7c Parallel 2020-11-22 00:55:12 +01:00
8007b5686d Rename variable 2020-11-21 23:13:44 +01:00
87c8a6e817 Extract filesmap to own file 2020-11-21 22:14:42 +01:00
8a9bcbf62e Move funcs to other file 2020-11-21 22:13:34 +01:00
f16a143125 Do not export delete-methods 2020-11-21 22:11:59 +01:00
903909de77 Add note about releases 2020-11-21 22:05:06 +01:00
09a4dc8660 Fix code blocks in README 2020-11-21 22:03:14 +01:00
4 changed files with 257 additions and 180 deletions

View File

@ -3,12 +3,14 @@ Because I couldn't find a good program for my usecase, I wrote this simple Go pr
## Installation ## Installation
If you have go installed, the easiest way to install is `go get`: If you have go installed, the easiest way to install and update is `go get`:
``` ```
go get "github.com/JaCoB1123/dupe-finder" go get -u "github.com/JaCoB1123/dupe-finder"
``` ```
Otherwise you can download the latest binary from the [releases](https://github.com/JaCoB1123/dupe-finder/releases) page.
## Usage ## Usage
dupe-finder supports the following options: dupe-finder supports the following options:
@ -34,15 +36,15 @@ dupe-finder supports the following options:
Find all duplicate files in `~/` and save the results to `dupes.json` Find all duplicate files in `~/` and save the results to `dupes.json`
``` ```
> dupe-finder --to-file dupes.json ~/ > dupe-finder --to-file dupes.json ~/
``̀` ```
Load previous results from `dupes.json` and delete all duplicates located in ~/.cache Load previous results from `dupes.json` and delete all duplicates located in ~/.cache
``` ```
> dupe-finder --from-file dupes.json --delete-dupes-in ~/.cache > dupe-finder --from-file dupes.json --delete-dupes-in ~/.cache
`` ```
Find all duplicate files in `~/` and `/mnt/EXT`. Prompt which file to keep for each set of duplicates and move the others to /dupes/. Find all duplicate files in `~/` and `/mnt/EXT`. Prompt which file to keep for each set of duplicates and move the others to /dupes/.
``` ```
> dupe-finder --delete-prompt --move-files /dupes/ ~/ /mnt/EXT > dupe-finder --delete-prompt --move-files /dupes/ ~/ /mnt/EXT
``̀` ```

57
file.go Normal file
View File

@ -0,0 +1,57 @@
package main
import (
"crypto/sha1"
"encoding/base64"
"io"
"os"
"path/filepath"
"strconv"
)
// remove disposes of a duplicate file. Without the -force flag it does
// nothing; with it, the file is either moved into the configured target
// folder (-move-files) or deleted outright.
func remove(path string) {
	if !*force {
		return
	}

	if *moveToFolder != "" {
		moveButDontOvewrite(path, *moveToFolder)
		return
	}

	os.Remove(path)
}
func moveButDontOvewrite(path string, targetPath string) {
num := 0
filename := filepath.Base(path)
target := filepath.Join(targetPath, filename)
for {
_, err := os.Stat(target)
if os.IsNotExist(err) {
os.Rename(path, target)
return
}
target = filepath.Join(targetPath, filename+"."+strconv.Itoa(num))
num++
}
}
func calculateHash(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
h := sha1.New()
if _, err := io.Copy(h, f); err != nil {
return "", err
}
return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil
}

110
filesmap.go Normal file
View File

@ -0,0 +1,110 @@
package main
import (
"flag"
"fmt"
"log"
"os"
"path/filepath"
"sync"
)
// FilesMap is a struct for listing files by Size and Hash to search for duplicates
type FilesMap struct {
	// FilesBySize remembers, per file size, the path of the first file seen
	// with that size; it is set to "" once a second file of the same size
	// has been forwarded to hashing (see IncomingWorker).
	FilesBySize map[int64]string

	// FilesByHash groups file paths by content hash; any entry with more
	// than one path is a set of duplicates.
	FilesByHash map[string][]string

	// FilesHashing carries files whose hash still needs to be computed
	// (fed by IncomingWorker, drained by HashingWorker).
	FilesHashing chan fileEntry

	// FilesIncoming receives every file discovered during directory walks.
	FilesIncoming chan fileEntry

	// FilesHashed carries files whose hash has been computed
	// (fed by HashingWorker, drained by HashedWorker).
	FilesHashed chan fileEntry

	// lock guards FilesByHash, which HashedWorker mutates.
	lock sync.Mutex
}
// newFilesMap builds an empty FilesMap with both lookup maps and all three
// (unbuffered) pipeline channels initialized and ready for the workers.
func newFilesMap() *FilesMap {
	fm := &FilesMap{}
	fm.FilesBySize = make(map[int64]string)
	fm.FilesByHash = make(map[string][]string)
	fm.FilesIncoming = make(chan fileEntry)
	fm.FilesHashing = make(chan fileEntry)
	fm.FilesHashed = make(chan fileEntry)
	return fm
}
// IncomingWorker drains FilesIncoming and forwards only size collisions to
// the hashing stage: the first file of a given size is merely remembered in
// FilesBySize; when a second file of that size arrives, both are sent to
// FilesHashing and the size slot is blanked so later files of the same size
// go straight to hashing. Closes FilesHashing when the input is exhausted.
func (fm *FilesMap) IncomingWorker() {
	for file := range fm.FilesIncoming {
		if *verbose {
			fmt.Println("Incoming", file.path)
		}

		prev, seen := fm.FilesBySize[file.size]
		switch {
		case !seen:
			// First file with this size: remember it, nothing to hash yet.
			fm.FilesBySize[file.size] = file.path
		case prev != "":
			// Second file with this size: the remembered one needs
			// hashing too, then this file is handled like any later one.
			fm.FilesHashing <- fileEntry{prev, file.size, ""}
			fallthrough
		default:
			fm.FilesBySize[file.size] = ""
			fm.FilesHashing <- file
		}
	}

	close(fm.FilesHashing)
}
// HashingWorker computes content hashes for files arriving on FilesHashing
// and forwards them (hash filled in) to FilesHashed. Several instances run
// concurrently; each signals wg once its input channel is drained.
func (fm *FilesMap) HashingWorker(wg *sync.WaitGroup) {
	// defer so the WaitGroup is released even if a send panics (e.g. on a
	// closed channel during shutdown); the original called Done() at the end.
	defer wg.Done()

	for file := range fm.FilesHashing {
		if *verbose {
			fmt.Println("Hashing", file.path)
		}

		hash, err := calculateHash(file.path)
		if err != nil {
			// Log the path, not the struct: the original passed the whole
			// fileEntry to %s, which prints garbage (vet flags this).
			log.Printf("Error calculating Hash for %s: %v\n", file.path, err)
			continue
		}

		file.hash = hash
		fm.FilesHashed <- file
	}
}
// HashedWorker collects hashed files from FilesHashed and groups their
// paths by hash in FilesByHash, then signals done once the channel closes.
func (fm *FilesMap) HashedWorker(done chan bool) {
	for file := range fm.FilesHashed {
		if *verbose {
			fmt.Println("Finishing", file.path)
		}

		// append on a missing key starts from a nil slice, so the
		// original's "exists already?" branch collapses into one line.
		fm.lock.Lock()
		fm.FilesByHash[file.hash] = append(fm.FilesByHash[file.hash], file.path)
		fm.lock.Unlock()
	}

	done <- true
}
// WalkDirectories walks every directory named on the command line and feeds
// each non-directory entry into FilesIncoming, closing the channel when all
// walks are done so downstream workers can terminate.
func (fm *FilesMap) WalkDirectories() {
	for _, path := range flag.Args() {
		filepath.Walk(path, func(path string, info os.FileInfo, err error) error {
			// When Walk reports an error, info may be nil; the original
			// dereferenced it unconditionally and panicked on any
			// unreadable path. Skip such entries and keep scanning.
			if err != nil || info == nil {
				return nil
			}
			if info.IsDir() {
				return nil
			}

			fm.FilesIncoming <- fileEntry{path, info.Size(), ""}

			return nil
		})
	}

	close(fm.FilesIncoming)
}

178
main.go
View File

@ -2,17 +2,18 @@ package main
import ( import (
"bufio" "bufio"
"crypto/sha256"
"encoding/base64"
"encoding/json" "encoding/json"
"flag" "flag"
"fmt" "fmt"
"io"
"io/ioutil" "io/ioutil"
"log"
"os" "os"
"path/filepath" "path/filepath"
"runtime"
"runtime/pprof"
"strconv" "strconv"
"strings" "strings"
"sync"
) )
var fromFile = flag.String("from-file", "", "Load results file from <path>") var fromFile = flag.String("from-file", "", "Load results file from <path>")
@ -22,74 +23,59 @@ var promptForDelete = flag.Bool("delete-prompt", false, "Ask which file to keep
var moveToFolder = flag.String("move-files", "", "Move files to <path> instead of deleting them") var moveToFolder = flag.String("move-files", "", "Move files to <path> instead of deleting them")
var force = flag.Bool("force", false, "Actually delete files. Without this options, the files to be deleted are only printed") var force = flag.Bool("force", false, "Actually delete files. Without this options, the files to be deleted are only printed")
var verbose = flag.Bool("verbose", false, "Output additional information") var verbose = flag.Bool("verbose", false, "Output additional information")
var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
func Delete(path string) {
if !*force {
return
}
if *moveToFolder == "" {
os.Remove(path)
return
}
MoveButDontOvewrite(path, *moveToFolder)
}
func MoveButDontOvewrite(path string, targetPath string) {
num := 0
filename := filepath.Base(path)
target := filepath.Join(targetPath, filename)
for {
_, err := os.Stat(target)
if os.IsNotExist(err) {
os.Rename(path, target)
return
}
target = filepath.Join(targetPath, filename+"."+strconv.Itoa(num))
num++
}
}
func main() { func main() {
flag.Parse() flag.Parse()
if *cpuprofile != "" {
f, err := os.Create(*cpuprofile)
if err != nil {
log.Fatal(err)
}
pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile()
}
if *verbose { if *verbose {
printConfiguration() printConfiguration()
} }
filesMap := newFilesMap() filesMap := newFilesMap()
if *fromFile != "" { if *fromFile != "" {
fmt.Println("Loading file", *fromFile)
byteValue, _ := ioutil.ReadFile(*fromFile) byteValue, _ := ioutil.ReadFile(*fromFile)
err := json.Unmarshal(byteValue, &filesMap.FilesBySize) err := json.Unmarshal(byteValue, &filesMap.FilesByHash)
if err != nil { if err != nil {
panic(err) panic(err)
} }
} else { } else {
for _, path := range flag.Args() { done := make(chan bool)
filepath.Walk(path, func(path string, info os.FileInfo, err error) error { wg := sync.WaitGroup{}
filesMap.Add(path, info) for i := 0; i < runtime.GOMAXPROCS(0); i++ {
return nil wg.Add(1)
}) go filesMap.HashingWorker(&wg)
} }
go filesMap.IncomingWorker()
go filesMap.HashedWorker(done)
filesMap.WalkDirectories()
wg.Wait()
close(filesMap.FilesHashed)
<-done
} }
if *toFile != "" && *fromFile == "" { if *toFile != "" && *fromFile == "" {
json, _ := json.MarshalIndent(filesMap.FilesBySize, "", " ") json, _ := json.MarshalIndent(filesMap.FilesByHash, "", " ")
ioutil.WriteFile(*toFile, json, 644) ioutil.WriteFile(*toFile, json, 644)
} }
if *deleteDupesIn != "" { if *deleteDupesIn != "" {
deleteIn := filepath.Clean(*deleteDupesIn) deleteIn := filepath.Clean(*deleteDupesIn)
for size := range filesMap.FilesBySize { for hash := range filesMap.FilesByHash {
for hash := range filesMap.FilesBySize[size] { duplicateFiles := filesMap.FilesByHash[hash]
duplicateFiles := filesMap.FilesBySize[size][hash]
if len(duplicateFiles) <= 1 { if len(duplicateFiles) <= 1 {
continue continue
} }
@ -98,17 +84,15 @@ func main() {
if strings.HasPrefix(filepath.Clean(file), deleteIn) { if strings.HasPrefix(filepath.Clean(file), deleteIn) {
fmt.Println("Would delete ", file) fmt.Println("Would delete ", file)
if *force { if *force {
Delete(file) remove(file)
}
} }
} }
} }
} }
} else if *promptForDelete { } else if *promptForDelete {
reader := bufio.NewReader(os.Stdin) reader := bufio.NewReader(os.Stdin)
for size := range filesMap.FilesBySize { for hash := range filesMap.FilesByHash {
for hash := range filesMap.FilesBySize[size] { duplicateFiles := filesMap.FilesByHash[hash]
duplicateFiles := filesMap.FilesBySize[size][hash]
if len(duplicateFiles) <= 1 { if len(duplicateFiles) <= 1 {
continue continue
} }
@ -138,16 +122,14 @@ func main() {
} }
if *force { if *force {
Delete(file) remove(file)
}
} }
} }
} }
} else { } else {
for size := range filesMap.FilesBySize { for hash := range filesMap.FilesByHash {
for hash := range filesMap.FilesBySize[size] { duplicateFiles := filesMap.FilesByHash[hash]
duplicateFiles := filesMap.FilesBySize[size][hash]
if len(duplicateFiles) <= 1 { if len(duplicateFiles) <= 1 {
continue continue
} }
@ -159,7 +141,6 @@ func main() {
} }
} }
} }
}
func printConfiguration() { func printConfiguration() {
fmt.Printf("fromFile: \"%v\"\n", *fromFile) fmt.Printf("fromFile: \"%v\"\n", *fromFile)
@ -175,81 +156,8 @@ func printConfiguration() {
fmt.Println() fmt.Println()
} }
// FilesMap is a struct for listing files by Size and Hash to search for duplicates type fileEntry struct {
type FilesMap struct { path string
FilesBySize map[int64]map[string][]string size int64
} hash string
// Add a file to the Map and calculate hash on demand
func (fm *FilesMap) Add(path string, info os.FileInfo) error {
if info.IsDir() {
return nil
}
fileInfo := path
filesByHash := fm.FilesBySize[info.Size()]
// first file with same size
// => create new map for size
if filesByHash == nil {
filesByHash = map[string][]string{}
fm.FilesBySize[info.Size()] = filesByHash
filesByHash[""] = []string{fileInfo}
return nil
}
// second file with same size
// => calculate hashes for all entries
if _, hasEmptyHash := filesByHash[""]; hasEmptyHash {
err := appendByFileHash(filesByHash, fileInfo)
err2 := appendByFileHash(filesByHash, filesByHash[""][0])
delete(filesByHash, "")
if err != nil {
return err
}
return err2
}
// for later files always append by hash
return appendByFileHash(filesByHash, fileInfo)
}
func appendByFileHash(filesByHash map[string][]string, fileInfo string) error {
hash, err := calculateHash(fileInfo)
if err != nil {
return err
}
if _, ok := filesByHash[hash]; ok {
filesByHash[hash] = append(filesByHash[hash], fileInfo)
} else {
filesByHash[hash] = []string{fileInfo}
}
return nil
}
func newFilesMap() *FilesMap {
return &FilesMap{
FilesBySize: map[int64]map[string][]string{},
}
}
func calculateHash(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
h := sha256.New()
if _, err := io.Copy(h, f); err != nil {
return "", err
}
return base64.RawStdEncoding.EncodeToString(h.Sum(nil)), nil
} }