aboutsummaryrefslogtreecommitdiff
path: root/fs/operations/dedupe.go
diff options
context:
space:
mode:
Diffstat (limited to 'fs/operations/dedupe.go')
-rw-r--r--fs/operations/dedupe.go506
1 files changed, 506 insertions, 0 deletions
diff --git a/fs/operations/dedupe.go b/fs/operations/dedupe.go
new file mode 100644
index 0000000..60c66c4
--- /dev/null
+++ b/fs/operations/dedupe.go
@@ -0,0 +1,506 @@
+// dedupe - gets rid of identical files remotes which can have duplicate file names (drive, mega)
+
+package operations
+
+import (
+ "context"
+ "fmt"
+ "path"
+ "sort"
+ "strings"
+
+ "github.com/rclone/rclone/fs"
+ "github.com/rclone/rclone/fs/accounting"
+ "github.com/rclone/rclone/fs/config"
+ "github.com/rclone/rclone/fs/hash"
+ "github.com/rclone/rclone/fs/walk"
+)
+
+// dedupeRename renames the objs slice to different names
+func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) {
+ doMove := f.Features().Move
+ if doMove == nil {
+ fs.Fatalf(nil, "Fs %v doesn't support Move", f)
+ }
+ ext := path.Ext(remote)
+ base := remote[:len(remote)-len(ext)]
+
+outer:
+ for i, o := range objs {
+ suffix := 1
+ newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
+ _, err := f.NewObject(ctx, newName)
+ for ; err != fs.ErrorObjectNotFound; suffix++ {
+ if err != nil {
+ err = fs.CountError(ctx, err)
+ fs.Errorf(o, "Failed to check for existing object: %v", err)
+ continue outer
+ }
+ if suffix > 100 {
+ fs.Errorf(o, "Could not find an available new name")
+ continue outer
+ }
+ newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
+ _, err = f.NewObject(ctx, newName)
+ }
+ if !SkipDestructive(ctx, o, "rename") {
+ newObj, err := doMove(ctx, o, newName)
+ if err != nil {
+ err = fs.CountError(ctx, err)
+ fs.Errorf(o, "Failed to rename: %v", err)
+ continue
+ }
+ fs.Infof(newObj, "renamed from: %v", o)
+ }
+ }
+}
+
+// dedupeDeleteAllButOne deletes all but the one in keep
+func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) {
+ count := 0
+ for i, o := range objs {
+ if i == keep {
+ continue
+ }
+ err := DeleteFile(ctx, o)
+ if err == nil {
+ count++
+ }
+ }
+ if count > 0 {
+ fs.Logf(remote, "Deleted %d extra copies", count)
+ }
+}
+
+// dedupeDeleteIdentical deletes all but one of identical (by hash) copies
+func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) {
+ ci := fs.GetConfig(ctx)
+
+ // Make map of IDs
+ IDs := make(map[string]int, len(objs))
+ for _, o := range objs {
+ if do, ok := o.(fs.IDer); ok {
+ if ID := do.ID(); ID != "" {
+ IDs[ID]++
+ }
+ }
+ }
+
+ // Remove duplicate IDs
+ newObjs := objs[:0]
+ for _, o := range objs {
+ if do, ok := o.(fs.IDer); ok {
+ if ID := do.ID(); ID != "" {
+ if IDs[ID] <= 1 {
+ newObjs = append(newObjs, o)
+ } else {
+ fs.Logf(o, "Ignoring as it appears %d times in the listing and deleting would lead to data loss", IDs[ID])
+ }
+ }
+ }
+ }
+ objs = newObjs
+
+ // See how many of these duplicates are identical
+ dupesByID := make(map[string][]fs.Object, len(objs))
+ for _, o := range objs {
+ ID := ""
+ if ci.SizeOnly && o.Size() >= 0 {
+ ID = fmt.Sprintf("size %d", o.Size())
+ } else if ht != hash.None {
+ hashValue, err := o.Hash(ctx, ht)
+ if err == nil && hashValue != "" {
+ ID = fmt.Sprintf("%v %s", ht, hashValue)
+ }
+ }
+ if ID == "" {
+ remainingObjs = append(remainingObjs, o)
+ } else {
+ dupesByID[ID] = append(dupesByID[ID], o)
+ }
+ }
+
+ // Delete identical duplicates, filling remainingObjs with the ones remaining
+ for ID, dupes := range dupesByID {
+ remainingObjs = append(remainingObjs, dupes[0])
+ if len(dupes) > 1 {
+ fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(dupes)-1, len(dupes), ID)
+ for _, o := range dupes[1:] {
+ err := DeleteFile(ctx, o)
+ if err != nil {
+ remainingObjs = append(remainingObjs, o)
+ }
+ }
+ }
+ }
+
+ return remainingObjs
+}
+
+// dedupeList lists the duplicates and does nothing
+func dedupeList(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) {
+ fmt.Printf("%s: %d duplicates\n", remote, len(objs))
+ for i, o := range objs {
+ hashValue := ""
+ if ht != hash.None {
+ var err error
+ hashValue, err = o.Hash(ctx, ht)
+ if err != nil {
+ hashValue = err.Error()
+ }
+ }
+ if byHash {
+ fmt.Printf(" %d: %12d bytes, %s, %s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), o.Remote())
+ } else {
+ fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
+ }
+ }
+}
+
+// dedupeInteractive interactively dedupes the slice of objects
+func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) bool {
+ dedupeList(ctx, f, ht, remote, objs, byHash)
+ commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"}
+ if !byHash {
+ commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)")
+ }
+ commands = append(commands, "qQuit")
+ switch config.Command(commands) {
+ case 's':
+ case 'k':
+ keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
+ dedupeDeleteAllButOne(ctx, keep-1, remote, objs)
+ case 'r':
+ dedupeRename(ctx, f, remote, objs)
+ case 'q':
+ return false
+ }
+ return true
+}
+
+// DeduplicateMode is how the dedupe command chooses what to do
+type DeduplicateMode int
+
+// Deduplicate modes
+const (
+ DeduplicateInteractive DeduplicateMode = iota // interactively ask the user
+ DeduplicateSkip // skip all conflicts
+ DeduplicateFirst // choose the first object
+ DeduplicateNewest // choose the newest object
+ DeduplicateOldest // choose the oldest object
+ DeduplicateRename // rename the objects
+ DeduplicateLargest // choose the largest object
+ DeduplicateSmallest // choose the smallest object
+ DeduplicateList // list duplicates only
+)
+
+func (x DeduplicateMode) String() string {
+ switch x {
+ case DeduplicateInteractive:
+ return "interactive"
+ case DeduplicateSkip:
+ return "skip"
+ case DeduplicateFirst:
+ return "first"
+ case DeduplicateNewest:
+ return "newest"
+ case DeduplicateOldest:
+ return "oldest"
+ case DeduplicateRename:
+ return "rename"
+ case DeduplicateLargest:
+ return "largest"
+ case DeduplicateSmallest:
+ return "smallest"
+ case DeduplicateList:
+ return "list"
+ }
+ return "unknown"
+}
+
+// Set a DeduplicateMode from a string
+func (x *DeduplicateMode) Set(s string) error {
+ switch strings.ToLower(s) {
+ case "interactive":
+ *x = DeduplicateInteractive
+ case "skip":
+ *x = DeduplicateSkip
+ case "first":
+ *x = DeduplicateFirst
+ case "newest":
+ *x = DeduplicateNewest
+ case "oldest":
+ *x = DeduplicateOldest
+ case "rename":
+ *x = DeduplicateRename
+ case "largest":
+ *x = DeduplicateLargest
+ case "smallest":
+ *x = DeduplicateSmallest
+ case "list":
+ *x = DeduplicateList
+ default:
+ return fmt.Errorf("unknown mode for dedupe %q", s)
+ }
+ return nil
+}
+
+// Type of the value
+func (x *DeduplicateMode) Type() string {
+ return "string"
+}
+
+// Directory with entry count and links to parents
+type dedupeDir struct {
+ dir fs.Directory
+ parent string
+ count int
+}
+
+// Map of directories by ID with recursive counts
+type dedupeDirsMap map[string]*dedupeDir
+
+func (dm dedupeDirsMap) get(id string) *dedupeDir {
+ d := dm[id]
+ if d == nil {
+ d = &dedupeDir{}
+ dm[id] = d
+ }
+ return d
+}
+
+func (dm dedupeDirsMap) increment(parent string) {
+ if parent != "" {
+ d := dm.get(parent)
+ d.count++
+ dm.increment(d.parent)
+ }
+}
+
+// dedupeFindDuplicateDirs scans f for duplicate directories
+func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) (duplicateDirs [][]*dedupeDir, err error) {
+ dirsByID := dedupeDirsMap{}
+ dirs := map[string][]*dedupeDir{}
+
+ ci := fs.GetConfig(ctx)
+ err = walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListAll, func(entries fs.DirEntries) error {
+ for _, entry := range entries {
+ tr := accounting.Stats(ctx).NewCheckingTransfer(entry, "merging")
+
+ remote := entry.Remote()
+ parentRemote := path.Dir(remote)
+ if parentRemote == "." {
+ parentRemote = ""
+ }
+
+ // Obtain ID of the object parent, if known.
+ // (This usually means that backend allows duplicate paths)
+ // Fall back to remote parent path, if unavailable.
+ var parent string
+ if entryParentIDer, ok := entry.(fs.ParentIDer); ok {
+ parent = entryParentIDer.ParentID()
+ }
+ if parent == "" {
+ parent = parentRemote
+ }
+
+ var ID string
+ if entryIDer, ok := entry.(fs.IDer); ok {
+ ID = entryIDer.ID()
+ }
+ if ID == "" {
+ ID = remote
+ }
+
+ if fsDir, ok := entry.(fs.Directory); ok {
+ d := dirsByID.get(ID)
+ d.dir = fsDir
+ d.parent = parent
+ dirs[remote] = append(dirs[remote], d)
+ }
+
+ dirsByID.increment(parent)
+ tr.Done(ctx, nil)
+ }
+ return nil
+ })
+ if err != nil {
+ return nil, fmt.Errorf("find duplicate dirs: %w", err)
+ }
+
+ // Make sure parents are before children
+ duplicateNames := []string{}
+ for name, ds := range dirs {
+ if len(ds) > 1 {
+ duplicateNames = append(duplicateNames, name)
+ }
+ }
+ sort.Strings(duplicateNames)
+ for _, name := range duplicateNames {
+ duplicateDirs = append(duplicateDirs, dirs[name])
+ }
+
+ return
+}
+
+// dedupeMergeDuplicateDirs merges all the duplicate directories found
+func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]*dedupeDir) error {
+ mergeDirs := f.Features().MergeDirs
+ if mergeDirs == nil {
+ return fmt.Errorf("%v: can't merge directories", f)
+ }
+ dirCacheFlush := f.Features().DirCacheFlush
+ if dirCacheFlush == nil {
+ return fmt.Errorf("%v: can't flush dir cache", f)
+ }
+ for _, dedupeDirs := range duplicateDirs {
+ if SkipDestructive(ctx, dedupeDirs[0].dir, "merge duplicate directories") {
+ continue
+ }
+
+ // Put largest directory in front to minimize movements
+ fsDirs := []fs.Directory{}
+ largestCount := -1
+ largestIdx := 0
+ for i, d := range dedupeDirs {
+ fsDirs = append(fsDirs, d.dir)
+ if d.count > largestCount {
+ largestIdx = i
+ largestCount = d.count
+ }
+ }
+ fsDirs[largestIdx], fsDirs[0] = fsDirs[0], fsDirs[largestIdx]
+
+ fs.Infof(fsDirs[0], "Merging contents of duplicate directories")
+ err := mergeDirs(ctx, fsDirs)
+ if err != nil {
+ err = fs.CountError(ctx, err)
+ fs.Errorf(nil, "merge duplicate dirs: %v", err)
+ }
+ }
+ dirCacheFlush()
+ return nil
+}
+
+// sort oldest first
+func sortOldestFirst(objs []fs.Object) {
+ sort.Slice(objs, func(i, j int) bool {
+ return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO()))
+ })
+}
+
+// sort smallest first
+func sortSmallestFirst(objs []fs.Object) {
+ sort.Slice(objs, func(i, j int) bool {
+ return objs[i].Size() < objs[j].Size()
+ })
+}
+
+// Deduplicate interactively finds duplicate files and offers to
+// delete all but one or rename them to be different. Only useful with
+// Google Drive which can have duplicate file names.
+func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error {
+ ci := fs.GetConfig(ctx)
+ // find a hash to use
+ ht := f.Hashes().GetOne()
+ what := "names"
+ if byHash {
+ if ht == hash.None {
+ return fmt.Errorf("%v has no hashes", f)
+ }
+ what = ht.String() + " hashes"
+ }
+ fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode)
+
+ // Find duplicate directories first and fix them
+ if !byHash {
+ duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
+ if err != nil {
+ return err
+ }
+ if len(duplicateDirs) > 0 {
+ if mode != DeduplicateList {
+ err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
+ if err != nil {
+ return err
+ }
+ } else {
+ for _, dedupeDirs := range duplicateDirs {
+ remote := dedupeDirs[0].dir.Remote()
+ fmt.Printf("%s: %d duplicates of this directory\n", remote, len(dedupeDirs))
+ }
+ }
+ }
+ }
+
+ // Now find duplicate files
+ files := map[string][]fs.Object{}
+ err := walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
+ entries.ForObject(func(o fs.Object) {
+ tr := accounting.Stats(ctx).NewCheckingTransfer(o, "checking")
+ defer tr.Done(ctx, nil)
+
+ var remote string
+ var err error
+ if byHash {
+ remote, err = o.Hash(ctx, ht)
+ if err != nil {
+ fs.Errorf(o, "Failed to hash: %v", err)
+ remote = ""
+ }
+ } else {
+ remote = o.Remote()
+ }
+ if remote != "" {
+ files[remote] = append(files[remote], o)
+ }
+ })
+ return nil
+ })
+ if err != nil {
+ return err
+ }
+
+ for remote, objs := range files {
+ if len(objs) <= 1 {
+ continue
+ }
+ fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
+ if !byHash && mode != DeduplicateList {
+ objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
+ if len(objs) <= 1 {
+ fs.Logf(remote, "All duplicates removed")
+ continue
+ }
+ }
+ switch mode {
+ case DeduplicateInteractive:
+ if !dedupeInteractive(ctx, f, ht, remote, objs, byHash) {
+ return nil
+ }
+ case DeduplicateFirst:
+ dedupeDeleteAllButOne(ctx, 0, remote, objs)
+ case DeduplicateNewest:
+ sortOldestFirst(objs)
+ dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
+ case DeduplicateOldest:
+ sortOldestFirst(objs)
+ dedupeDeleteAllButOne(ctx, 0, remote, objs)
+ case DeduplicateRename:
+ dedupeRename(ctx, f, remote, objs)
+ case DeduplicateLargest:
+ sortSmallestFirst(objs)
+ dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
+ case DeduplicateSmallest:
+ sortSmallestFirst(objs)
+ dedupeDeleteAllButOne(ctx, 0, remote, objs)
+ case DeduplicateSkip:
+ fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what)
+ case DeduplicateList:
+ dedupeList(ctx, f, ht, remote, objs, byHash)
+ default:
+ //skip
+ }
+ }
+ return nil
+}