1 files changed, 506 insertions, 0 deletions
diff --git a/fs/operations/dedupe.go b/fs/operations/dedupe.go
new file mode 100644
index 0000000..60c66c4
--- /dev/null
+++ b/fs/operations/dedupe.go
@@ -0,0 +1,506 @@
+// dedupe - gets rid of identical files remotes which can have duplicate file names (drive, mega)
+
+package operations
+
+import (
+	"context"
+	"fmt"
+	"path"
+	"sort"
+	"strings"
+
+	"github.com/rclone/rclone/fs"
+	"github.com/rclone/rclone/fs/accounting"
+	"github.com/rclone/rclone/fs/config"
+	"github.com/rclone/rclone/fs/hash"
+	"github.com/rclone/rclone/fs/walk"
+)
+
+// dedupeRename renames the objs slice to different names
+func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) {
+	doMove := f.Features().Move
+	if doMove == nil {
+		fs.Fatalf(nil, "Fs %v doesn't support Move", f)
+	}
+	ext := path.Ext(remote)
+	base := remote[:len(remote)-len(ext)]
+
+outer:
+	for i, o := range objs {
+		suffix := 1
+		newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
+		_, err := f.NewObject(ctx, newName)
+		for ; err != fs.ErrorObjectNotFound; suffix++ {
+			if err != nil {
+				err = fs.CountError(ctx, err)
+				fs.Errorf(o, "Failed to check for existing object: %v", err)
+				continue outer
+			}
+			if suffix > 100 {
+				fs.Errorf(o, "Could not find an available new name")
+				continue outer
+			}
+			newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
+			_, err = f.NewObject(ctx, newName)
+		}
+		if !SkipDestructive(ctx, o, "rename") {
+			newObj, err := doMove(ctx, o, newName)
+			if err != nil {
+				err = fs.CountError(ctx, err)
+				fs.Errorf(o, "Failed to rename: %v", err)
+				continue
+			}
+			fs.Infof(newObj, "renamed from: %v", o)
+		}
+	}
+}
+
+// dedupeDeleteAllButOne deletes all but the one in keep
+func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) {
+	count := 0
+	for i, o := range objs {
+		if i == keep {
+			continue
+		}
+		err := DeleteFile(ctx, o)
+		if err == nil {
+			count++
+		}
+	}
+	if count > 0 {
+		fs.Logf(remote, "Deleted %d extra copies", count)
+	}
+}
+
+// dedupeDeleteIdentical deletes all but one of identical (by hash) copies
+func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) {
+	ci := fs.GetConfig(ctx)
+
+	// Make map of IDs
+	IDs := make(map[string]int, len(objs))
+	for _, o := range objs {
+		if do, ok := o.(fs.IDer); ok {
+			if ID := do.ID(); ID != "" {
+				IDs[ID]++
+			}
+		}
+	}
+
+	// Remove duplicate IDs
+	newObjs := objs[:0]
+	for _, o := range objs {
+		if do, ok := o.(fs.IDer); ok {
+			if ID := do.ID(); ID != "" {
+				if IDs[ID] <= 1 {
+					newObjs = append(newObjs, o)
+				} else {
+					fs.Logf(o, "Ignoring as it appears %d times in the listing and deleting would lead to data loss", IDs[ID])
+				}
+			}
+		}
+	}
+	objs = newObjs
+
+	// See how many of these duplicates are identical
+	dupesByID := make(map[string][]fs.Object, len(objs))
+	for _, o := range objs {
+		ID := ""
+		if ci.SizeOnly && o.Size() >= 0 {
+			ID = fmt.Sprintf("size %d", o.Size())
+		} else if ht != hash.None {
+			hashValue, err := o.Hash(ctx, ht)
+			if err == nil && hashValue != "" {
+				ID = fmt.Sprintf("%v %s", ht, hashValue)
+			}
+		}
+		if ID == "" {
+			remainingObjs = append(remainingObjs, o)
+		} else {
+			dupesByID[ID] = append(dupesByID[ID], o)
+		}
+	}
+
+	// Delete identical duplicates, filling remainingObjs with the ones remaining
+	for ID, dupes := range dupesByID {
+		remainingObjs = append(remainingObjs, dupes[0])
+		if len(dupes) > 1 {
+			fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(dupes)-1, len(dupes), ID)
+			for _, o := range dupes[1:] {
+				err := DeleteFile(ctx, o)
+				if err != nil {
+					remainingObjs = append(remainingObjs, o)
+				}
+			}
+		}
+	}
+
+	return remainingObjs
+}
+
+// dedupeList lists the duplicates and does nothing
+func dedupeList(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) {
+	fmt.Printf("%s: %d duplicates\n", remote, len(objs))
+	for i, o := range objs {
+		hashValue := ""
+		if ht != hash.None {
+			var err error
+			hashValue, err = o.Hash(ctx, ht)
+			if err != nil {
+				hashValue = err.Error()
+			}
+		}
+		if byHash {
+			fmt.Printf("  %d: %12d bytes, %s, %s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), o.Remote())
+		} else {
+			fmt.Printf("  %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
+		}
+	}
+}
+
+// dedupeInteractive interactively dedupes the slice of objects
+func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) bool {
+	dedupeList(ctx, f, ht, remote, objs, byHash)
+	commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"}
+	if !byHash {
+		commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)")
+	}
+	commands = append(commands, "qQuit")
+	switch config.Command(commands) {
+	case 's':
+	case 'k':
+		keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
+		dedupeDeleteAllButOne(ctx, keep-1, remote, objs)
+	case 'r':
+		dedupeRename(ctx, f, remote, objs)
+	case 'q':
+		return false
+	}
+	return true
+}
+
+// DeduplicateMode is how the dedupe command chooses what to do
+type DeduplicateMode int
+
+// Deduplicate modes
+const (
+	DeduplicateInteractive DeduplicateMode = iota // interactively ask the user
+	DeduplicateSkip                               // skip all conflicts
+	DeduplicateFirst                              // choose the first object
+	DeduplicateNewest                             // choose the newest object
+	DeduplicateOldest                             // choose the oldest object
+	DeduplicateRename                             // rename the objects
+	DeduplicateLargest                            // choose the largest object
+	DeduplicateSmallest                           // choose the smallest object
+	DeduplicateList                               // list duplicates only
+)
+
+func (x DeduplicateMode) String() string {
+	switch x {
+	case DeduplicateInteractive:
+		return "interactive"
+	case DeduplicateSkip:
+		return "skip"
+	case DeduplicateFirst:
+		return "first"
+	case DeduplicateNewest:
+		return "newest"
+	case DeduplicateOldest:
+		return "oldest"
+	case DeduplicateRename:
+		return "rename"
+	case DeduplicateLargest:
+		return "largest"
+	case DeduplicateSmallest:
+		return "smallest"
+	case DeduplicateList:
+		return "list"
+	}
+	return "unknown"
+}
+
+// Set a DeduplicateMode from a string
+func (x *DeduplicateMode) Set(s string) error {
+	switch strings.ToLower(s) {
+	case "interactive":
+		*x = DeduplicateInteractive
+	case "skip":
+		*x = DeduplicateSkip
+	case "first":
+		*x = DeduplicateFirst
+	case "newest":
+		*x = DeduplicateNewest
+	case "oldest":
+		*x = DeduplicateOldest
+	case "rename":
+		*x = DeduplicateRename
+	case "largest":
+		*x = DeduplicateLargest
+	case "smallest":
+		*x = DeduplicateSmallest
+	case "list":
+		*x = DeduplicateList
+	default:
+		return fmt.Errorf("unknown mode for dedupe %q", s)
+	}
+	return nil
+}
+
+// Type of the value
+func (x *DeduplicateMode) Type() string {
+	return "string"
+}
+
+// Directory with entry count and links to parents
+type dedupeDir struct {
+	dir    fs.Directory
+	parent string
+	count  int
+}
+
+// Map of directories by ID with recursive counts
+type dedupeDirsMap map[string]*dedupeDir
+
+func (dm dedupeDirsMap) get(id string) *dedupeDir {
+	d := dm[id]
+	if d == nil {
+		d = &dedupeDir{}
+		dm[id] = d
+	}
+	return d
+}
+
+func (dm dedupeDirsMap) increment(parent string) {
+	if parent != "" {
+		d := dm.get(parent)
+		d.count++
+		dm.increment(d.parent)
+	}
+}
+
+// dedupeFindDuplicateDirs scans f for duplicate directories
+func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) (duplicateDirs [][]*dedupeDir, err error) {
+	dirsByID := dedupeDirsMap{}
+	dirs := map[string][]*dedupeDir{}
+
+	ci := fs.GetConfig(ctx)
+	err = walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListAll, func(entries fs.DirEntries) error {
+		for _, entry := range entries {
+			tr := accounting.Stats(ctx).NewCheckingTransfer(entry, "merging")
+
+			remote := entry.Remote()
+			parentRemote := path.Dir(remote)
+			if parentRemote == "." {
+				parentRemote = ""
+			}
+
+			// Obtain ID of the object parent, if known.
+			// (This usually means that backend allows duplicate paths)
+			// Fall back to remote parent path, if unavailable.
+			var parent string
+			if entryParentIDer, ok := entry.(fs.ParentIDer); ok {
+				parent = entryParentIDer.ParentID()
+			}
+			if parent == "" {
+				parent = parentRemote
+			}
+
+			var ID string
+			if entryIDer, ok := entry.(fs.IDer); ok {
+				ID = entryIDer.ID()
+			}
+			if ID == "" {
+				ID = remote
+			}
+
+			if fsDir, ok := entry.(fs.Directory); ok {
+				d := dirsByID.get(ID)
+				d.dir = fsDir
+				d.parent = parent
+				dirs[remote] = append(dirs[remote], d)
+			}
+
+			dirsByID.increment(parent)
+			tr.Done(ctx, nil)
+		}
+		return nil
+	})
+	if err != nil {
+		return nil, fmt.Errorf("find duplicate dirs: %w", err)
+	}
+
+	// Make sure parents are before children
+	duplicateNames := []string{}
+	for name, ds := range dirs {
+		if len(ds) > 1 {
+			duplicateNames = append(duplicateNames, name)
+		}
+	}
+	sort.Strings(duplicateNames)
+	for _, name := range duplicateNames {
+		duplicateDirs = append(duplicateDirs, dirs[name])
+	}
+
+	return
+}
+
+// dedupeMergeDuplicateDirs merges all the duplicate directories found
+func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]*dedupeDir) error {
+	mergeDirs := f.Features().MergeDirs
+	if mergeDirs == nil {
+		return fmt.Errorf("%v: can't merge directories", f)
+	}
+	dirCacheFlush := f.Features().DirCacheFlush
+	if dirCacheFlush == nil {
+		return fmt.Errorf("%v: can't flush dir cache", f)
+	}
+	for _, dedupeDirs := range duplicateDirs {
+		if SkipDestructive(ctx, dedupeDirs[0].dir, "merge duplicate directories") {
+			continue
+		}
+
+		// Put largest directory in front to minimize movements
+		fsDirs := []fs.Directory{}
+		largestCount := -1
+		largestIdx := 0
+		for i, d := range dedupeDirs {
+			fsDirs = append(fsDirs, d.dir)
+			if d.count > largestCount {
+				largestIdx = i
+				largestCount = d.count
+			}
+		}
+		fsDirs[largestIdx], fsDirs[0] = fsDirs[0], fsDirs[largestIdx]
+
+		fs.Infof(fsDirs[0], "Merging contents of duplicate directories")
+		err := mergeDirs(ctx, fsDirs)
+		if err != nil {
+			err = fs.CountError(ctx, err)
+			fs.Errorf(nil, "merge duplicate dirs: %v", err)
+		}
+	}
+	dirCacheFlush()
+	return nil
+}
+
+// sort oldest first
+func sortOldestFirst(objs []fs.Object) {
+	sort.Slice(objs, func(i, j int) bool {
+		return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO()))
+	})
+}
+
+// sort smallest first
+func sortSmallestFirst(objs []fs.Object) {
+	sort.Slice(objs, func(i, j int) bool {
+		return objs[i].Size() < objs[j].Size()
+	})
+}
+
+// Deduplicate interactively finds duplicate files and offers to
+// delete all but one or rename them to be different. Only useful with
+// Google Drive which can have duplicate file names.
+func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error {
+	ci := fs.GetConfig(ctx)
+	// find a hash to use
+	ht := f.Hashes().GetOne()
+	what := "names"
+	if byHash {
+		if ht == hash.None {
+			return fmt.Errorf("%v has no hashes", f)
+		}
+		what = ht.String() + " hashes"
+	}
+	fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode)
+
+	// Find duplicate directories first and fix them
+	if !byHash {
+		duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
+		if err != nil {
+			return err
+		}
+		if len(duplicateDirs) > 0 {
+			if mode != DeduplicateList {
+				err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
+				if err != nil {
+					return err
+				}
+			} else {
+				for _, dedupeDirs := range duplicateDirs {
+					remote := dedupeDirs[0].dir.Remote()
+					fmt.Printf("%s: %d duplicates of this directory\n", remote, len(dedupeDirs))
+				}
+			}
+		}
+	}
+
+	// Now find duplicate files
+	files := map[string][]fs.Object{}
+	err := walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
+		entries.ForObject(func(o fs.Object) {
+			tr := accounting.Stats(ctx).NewCheckingTransfer(o, "checking")
+			defer tr.Done(ctx, nil)
+
+			var remote string
+			var err error
+			if byHash {
+				remote, err = o.Hash(ctx, ht)
+				if err != nil {
+					fs.Errorf(o, "Failed to hash: %v", err)
+					remote = ""
+				}
+			} else {
+				remote = o.Remote()
+			}
+			if remote != "" {
+				files[remote] = append(files[remote], o)
+			}
+		})
+		return nil
+	})
+	if err != nil {
+		return err
+	}
+
+	for remote, objs := range files {
+		if len(objs) <= 1 {
+			continue
+		}
+		fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
+		if !byHash && mode != DeduplicateList {
+			objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
+			if len(objs) <= 1 {
+				fs.Logf(remote, "All duplicates removed")
+				continue
+			}
+		}
+		switch mode {
+		case DeduplicateInteractive:
+			if !dedupeInteractive(ctx, f, ht, remote, objs, byHash) {
+				return nil
+			}
+		case DeduplicateFirst:
+			dedupeDeleteAllButOne(ctx, 0, remote, objs)
+		case DeduplicateNewest:
+			sortOldestFirst(objs)
+			dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
+		case DeduplicateOldest:
+			sortOldestFirst(objs)
+			dedupeDeleteAllButOne(ctx, 0, remote, objs)
+		case DeduplicateRename:
+			dedupeRename(ctx, f, remote, objs)
+		case DeduplicateLargest:
+			sortSmallestFirst(objs)
+			dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
+		case DeduplicateSmallest:
+			sortSmallestFirst(objs)
+			dedupeDeleteAllButOne(ctx, 0, remote, objs)
+		case DeduplicateSkip:
+			fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what)
+		case DeduplicateList:
+			dedupeList(ctx, f, ht, remote, objs, byHash)
+		default:
+			//skip
+		}
+	}
+	return nil
+}