diff options
Diffstat (limited to 'fs/operations/dedupe.go')
| -rw-r--r-- | fs/operations/dedupe.go | 506 |
1 files changed, 506 insertions, 0 deletions
diff --git a/fs/operations/dedupe.go b/fs/operations/dedupe.go new file mode 100644 index 0000000..60c66c4 --- /dev/null +++ b/fs/operations/dedupe.go @@ -0,0 +1,506 @@ +// dedupe - gets rid of identical files remotes which can have duplicate file names (drive, mega) + +package operations + +import ( + "context" + "fmt" + "path" + "sort" + "strings" + + "github.com/rclone/rclone/fs" + "github.com/rclone/rclone/fs/accounting" + "github.com/rclone/rclone/fs/config" + "github.com/rclone/rclone/fs/hash" + "github.com/rclone/rclone/fs/walk" +) + +// dedupeRename renames the objs slice to different names +func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) { + doMove := f.Features().Move + if doMove == nil { + fs.Fatalf(nil, "Fs %v doesn't support Move", f) + } + ext := path.Ext(remote) + base := remote[:len(remote)-len(ext)] + +outer: + for i, o := range objs { + suffix := 1 + newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext) + _, err := f.NewObject(ctx, newName) + for ; err != fs.ErrorObjectNotFound; suffix++ { + if err != nil { + err = fs.CountError(ctx, err) + fs.Errorf(o, "Failed to check for existing object: %v", err) + continue outer + } + if suffix > 100 { + fs.Errorf(o, "Could not find an available new name") + continue outer + } + newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext) + _, err = f.NewObject(ctx, newName) + } + if !SkipDestructive(ctx, o, "rename") { + newObj, err := doMove(ctx, o, newName) + if err != nil { + err = fs.CountError(ctx, err) + fs.Errorf(o, "Failed to rename: %v", err) + continue + } + fs.Infof(newObj, "renamed from: %v", o) + } + } +} + +// dedupeDeleteAllButOne deletes all but the one in keep +func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) { + count := 0 + for i, o := range objs { + if i == keep { + continue + } + err := DeleteFile(ctx, o) + if err == nil { + count++ + } + } + if count > 0 { + fs.Logf(remote, "Deleted %d extra copies", count) + } +} + +// dedupeDeleteIdentical deletes all but one of identical (by hash) copies +func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) { + ci := fs.GetConfig(ctx) + + // Make map of IDs + IDs := make(map[string]int, len(objs)) + for _, o := range objs { + if do, ok := o.(fs.IDer); ok { + if ID := do.ID(); ID != "" { + IDs[ID]++ + } + } + } + + // Remove duplicate IDs + newObjs := objs[:0] + for _, o := range objs { + if do, ok := o.(fs.IDer); ok { + if ID := do.ID(); ID != "" { + if IDs[ID] <= 1 { + newObjs = append(newObjs, o) + } else { + fs.Logf(o, "Ignoring as it appears %d times in the listing and deleting would lead to data loss", IDs[ID]) + } + } + } + } + objs = newObjs + + // See how many of these duplicates are identical + dupesByID := make(map[string][]fs.Object, len(objs)) + for _, o := range objs { + ID := "" + if ci.SizeOnly && o.Size() >= 0 { + ID = fmt.Sprintf("size %d", o.Size()) + } else if ht != hash.None { + hashValue, err := o.Hash(ctx, ht) + if err == nil && hashValue != "" { + ID = fmt.Sprintf("%v %s", ht, hashValue) + } + } + if ID == "" { + remainingObjs = append(remainingObjs, o) + } else { + dupesByID[ID] = append(dupesByID[ID], o) + } + } + + // Delete identical duplicates, filling remainingObjs with the ones remaining + for ID, dupes := range dupesByID { + remainingObjs = append(remainingObjs, dupes[0]) + if len(dupes) > 1 { + fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(dupes)-1, len(dupes), ID) + for _, o := range dupes[1:] { + err := DeleteFile(ctx, o) + if err != nil { + remainingObjs = append(remainingObjs, o) + } + } + } + } + + return remainingObjs +} + +// dedupeList lists the duplicates and does nothing +func dedupeList(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) { + fmt.Printf("%s: %d duplicates\n", remote, len(objs)) + for i, o := range objs { + hashValue := "" + if ht != hash.None { + var err error + hashValue, err = o.Hash(ctx, ht) + if err != nil { + hashValue = err.Error() + } + } + if byHash { + fmt.Printf(" %d: %12d bytes, %s, %s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), o.Remote()) + } else { + fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue) + } + } +} + +// dedupeInteractive interactively dedupes the slice of objects +func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) bool { + dedupeList(ctx, f, ht, remote, objs, byHash) + commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"} + if !byHash { + commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)") + } + commands = append(commands, "qQuit") + switch config.Command(commands) { + case 's': + case 'k': + keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs)) + dedupeDeleteAllButOne(ctx, keep-1, remote, objs) + case 'r': + dedupeRename(ctx, f, remote, objs) + case 'q': + return false + } + return true +} + +// DeduplicateMode is how the dedupe command chooses what to do +type DeduplicateMode int + +// Deduplicate modes +const ( + DeduplicateInteractive DeduplicateMode = iota // interactively ask the user + DeduplicateSkip // skip all conflicts + DeduplicateFirst // choose the first object + DeduplicateNewest // choose the newest object + DeduplicateOldest // choose the oldest object + DeduplicateRename // rename the objects + DeduplicateLargest // choose the largest object + DeduplicateSmallest // choose the smallest object + DeduplicateList // list duplicates only +) + +func (x DeduplicateMode) String() string { + switch x { + case DeduplicateInteractive: + return "interactive" + case DeduplicateSkip: + return "skip" + case DeduplicateFirst: + return "first" + case DeduplicateNewest: + return "newest" + case DeduplicateOldest: + return "oldest" + case DeduplicateRename: + return "rename" + case DeduplicateLargest: + return "largest" + case DeduplicateSmallest: + return "smallest" + case DeduplicateList: + return "list" + } + return "unknown" +} + +// Set a DeduplicateMode from a string +func (x *DeduplicateMode) Set(s string) error { + switch strings.ToLower(s) { + case "interactive": + *x = DeduplicateInteractive + case "skip": + *x = DeduplicateSkip + case "first": + *x = DeduplicateFirst + case "newest": + *x = DeduplicateNewest + case "oldest": + *x = DeduplicateOldest + case "rename": + *x = DeduplicateRename + case "largest": + *x = DeduplicateLargest + case "smallest": + *x = DeduplicateSmallest + case "list": + *x = DeduplicateList + default: + return fmt.Errorf("unknown mode for dedupe %q", s) + } + return nil +} + +// Type of the value +func (x *DeduplicateMode) Type() string { + return "string" +} + +// Directory with entry count and links to parents +type dedupeDir struct { + dir fs.Directory + parent string + count int +} + +// Map of directories by ID with recursive counts +type dedupeDirsMap map[string]*dedupeDir + +func (dm dedupeDirsMap) get(id string) *dedupeDir { + d := dm[id] + if d == nil { + d = &dedupeDir{} + dm[id] = d + } + return d +} + +func (dm dedupeDirsMap) increment(parent string) { + if parent != "" { + d := dm.get(parent) + d.count++ + dm.increment(d.parent) + } +} + +// dedupeFindDuplicateDirs scans f for duplicate directories +func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) (duplicateDirs [][]*dedupeDir, err error) { + dirsByID := dedupeDirsMap{} + dirs := map[string][]*dedupeDir{} + + ci := fs.GetConfig(ctx) + err = walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListAll, func(entries fs.DirEntries) error { + for _, entry := range entries { + tr := accounting.Stats(ctx).NewCheckingTransfer(entry, "merging") + + remote := entry.Remote() + parentRemote := path.Dir(remote) + if parentRemote == "." { + parentRemote = "" + } + + // Obtain ID of the object parent, if known. + // (This usually means that backend allows duplicate paths) + // Fall back to remote parent path, if unavailable. + var parent string + if entryParentIDer, ok := entry.(fs.ParentIDer); ok { + parent = entryParentIDer.ParentID() + } + if parent == "" { + parent = parentRemote + } + + var ID string + if entryIDer, ok := entry.(fs.IDer); ok { + ID = entryIDer.ID() + } + if ID == "" { + ID = remote + } + + if fsDir, ok := entry.(fs.Directory); ok { + d := dirsByID.get(ID) + d.dir = fsDir + d.parent = parent + dirs[remote] = append(dirs[remote], d) + } + + dirsByID.increment(parent) + tr.Done(ctx, nil) + } + return nil + }) + if err != nil { + return nil, fmt.Errorf("find duplicate dirs: %w", err) + } + + // Make sure parents are before children + duplicateNames := []string{} + for name, ds := range dirs { + if len(ds) > 1 { + duplicateNames = append(duplicateNames, name) + } + } + sort.Strings(duplicateNames) + for _, name := range duplicateNames { + duplicateDirs = append(duplicateDirs, dirs[name]) + } + + return +} + +// dedupeMergeDuplicateDirs merges all the duplicate directories found +func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]*dedupeDir) error { + mergeDirs := f.Features().MergeDirs + if mergeDirs == nil { + return fmt.Errorf("%v: can't merge directories", f) + } + dirCacheFlush := f.Features().DirCacheFlush + if dirCacheFlush == nil { + return fmt.Errorf("%v: can't flush dir cache", f) + } + for _, dedupeDirs := range duplicateDirs { + if SkipDestructive(ctx, dedupeDirs[0].dir, "merge duplicate directories") { + continue + } + + // Put largest directory in front to minimize movements + fsDirs := []fs.Directory{} + largestCount := -1 + largestIdx := 0 + for i, d := range dedupeDirs { + fsDirs = append(fsDirs, d.dir) + if d.count > largestCount { + largestIdx = i + largestCount = d.count + } + } + fsDirs[largestIdx], fsDirs[0] = fsDirs[0], fsDirs[largestIdx] + + fs.Infof(fsDirs[0], "Merging contents of duplicate directories") + err := mergeDirs(ctx, fsDirs) + if err != nil { + err = fs.CountError(ctx, err) + fs.Errorf(nil, "merge duplicate dirs: %v", err) + } + } + dirCacheFlush() + return nil +} + +// sort oldest first +func sortOldestFirst(objs []fs.Object) { + sort.Slice(objs, func(i, j int) bool { + return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO())) + }) +} + +// sort smallest first +func sortSmallestFirst(objs []fs.Object) { + sort.Slice(objs, func(i, j int) bool { + return objs[i].Size() < objs[j].Size() + }) +} + +// Deduplicate interactively finds duplicate files and offers to +// delete all but one or rename them to be different. Only useful with +// Google Drive which can have duplicate file names. +func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error { + ci := fs.GetConfig(ctx) + // find a hash to use + ht := f.Hashes().GetOne() + what := "names" + if byHash { + if ht == hash.None { + return fmt.Errorf("%v has no hashes", f) + } + what = ht.String() + " hashes" + } + fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode) + + // Find duplicate directories first and fix them + if !byHash { + duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f) + if err != nil { + return err + } + if len(duplicateDirs) > 0 { + if mode != DeduplicateList { + err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs) + if err != nil { + return err + } + } else { + for _, dedupeDirs := range duplicateDirs { + remote := dedupeDirs[0].dir.Remote() + fmt.Printf("%s: %d duplicates of this directory\n", remote, len(dedupeDirs)) + } + } + } + } + + // Now find duplicate files + files := map[string][]fs.Object{} + err := walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error { + entries.ForObject(func(o fs.Object) { + tr := accounting.Stats(ctx).NewCheckingTransfer(o, "checking") + defer tr.Done(ctx, nil) + + var remote string + var err error + if byHash { + remote, err = o.Hash(ctx, ht) + if err != nil { + fs.Errorf(o, "Failed to hash: %v", err) + remote = "" + } + } else { + remote = o.Remote() + } + if remote != "" { + files[remote] = append(files[remote], o) + } + }) + return nil + }) + if err != nil { + return err + } + + for remote, objs := range files { + if len(objs) <= 1 { + continue + } + fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what) + if !byHash && mode != DeduplicateList { + objs = dedupeDeleteIdentical(ctx, ht, remote, objs) + if len(objs) <= 1 { + fs.Logf(remote, "All duplicates removed") + continue + } + } + switch mode { + case DeduplicateInteractive: + if !dedupeInteractive(ctx, f, ht, remote, objs, byHash) { + return nil + } + case DeduplicateFirst: + dedupeDeleteAllButOne(ctx, 0, remote, objs) + case DeduplicateNewest: + sortOldestFirst(objs) + dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs) + case DeduplicateOldest: + sortOldestFirst(objs) + dedupeDeleteAllButOne(ctx, 0, remote, objs) + case DeduplicateRename: + dedupeRename(ctx, f, remote, objs) + case DeduplicateLargest: + sortSmallestFirst(objs) + dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs) + case DeduplicateSmallest: + sortSmallestFirst(objs) + dedupeDeleteAllButOne(ctx, 0, remote, objs) + case DeduplicateSkip: + fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what) + case DeduplicateList: + dedupeList(ctx, f, ht, remote, objs, byHash) + default: + //skip + } + } + return nil +} |
