rclone/fs/operations/dedupe.go

// dedupe - gets rid of identical files on remotes which can have duplicate file names (drive, mega)

package operations

import (
	"context"
	"errors"
	"fmt"
	"path"
	"sort"
	"strings"

	"github.com/rclone/rclone/fs"
	"github.com/rclone/rclone/fs/accounting"
	"github.com/rclone/rclone/fs/config"
	"github.com/rclone/rclone/fs/hash"
	"github.com/rclone/rclone/fs/walk"
)

// dedupeRename renames each of the objs so that they no longer share
// the name remote.
//
// The new names are made from remote by inserting "-N" before the file
// extension, so "file.jpg" becomes "file-1.jpg", "file-2.jpg" and so
// on. For the object at index i the values N = i+1 to i+100 are tried
// in turn and the first name for which f.NewObject reports
// fs.ErrorObjectNotFound is used.
//
// A failure to find a name for, or to rename, an object is logged
// against that object and the remaining objects are still processed.
// Nothing is renamed if SkipDestructive says the operation should be
// skipped. It is a fatal error if f doesn't support Move.
func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) {
	doMove := f.Features().Move
	if doMove == nil {
		fs.Fatalf(nil, "Fs %v doesn't support Move", f)
	}
	ext := path.Ext(remote)
	base := remote[:len(remote)-len(ext)]

outer:
	for i, o := range objs {
		// Look for a name which isn't in use, checking each candidate
		// exactly once
		var newName string
		for suffix := 1; ; suffix++ {
			if suffix > 100 {
				fs.Errorf(o, "Could not find an available new name")
				continue outer
			}
			newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
			_, err := f.NewObject(ctx, newName)
			// errors.Is so that a wrapped not found error is recognised too
			if errors.Is(err, fs.ErrorObjectNotFound) {
				break
			}
			if err != nil {
				err = fs.CountError(err)
				fs.Errorf(o, "Failed to check for existing object: %v", err)
				continue outer
			}
			// err == nil means the name is taken - try the next one
		}
		if SkipDestructive(ctx, o, "rename") {
			continue
		}
		newObj, err := doMove(ctx, o, newName)
		if err != nil {
			err = fs.CountError(err)
			fs.Errorf(o, "Failed to rename: %v", err)
			continue
		}
		fs.Infof(newObj, "renamed from: %v", o)
	}
}

// dedupeDeleteAllButOne deletes every object in objs apart from the one
// at index keep.
//
// If anything was deleted, the number of successful deletions is logged
// against remote. Objects which DeleteFile fails to delete are not
// included in that number.
func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) {
	deleted := 0
	for idx := range objs {
		if idx != keep && DeleteFile(ctx, objs[idx]) == nil {
			deleted++
		}
	}
	if deleted == 0 {
		return
	}
	fs.Logf(remote, "Deleted %d extra copies", deleted)
}

// dedupeDeleteIdentical deletes the surplus copies of objects which look
// identical, keeping one object out of each set of matches.
//
// Objects are matched by size if the SizeOnly config option is set and
// the size is known, otherwise by their ht hash provided ht isn't
// hash.None and the hash could be read. Objects which can't be matched
// this way are returned as they are, along with the first object of each
// set and any object which DeleteFile failed to delete. The order of the
// returned objects is not defined.
//
// Before matching, objects whose ID appears more than once in objs are
// logged and ignored as deleting them would lead to data loss.
//
// NOTE(review): objects which don't implement fs.IDer, or which report
// an empty ID, are dropped at that stage too, so they are neither
// deleted nor returned - confirm that this is intended.
//
// The backing array of objs is reused while filtering so the caller
// should not rely on the contents of objs afterwards.
func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) {
	ci := fs.GetConfig(ctx)

	// objectID returns the ID of o or "" if it doesn't have one
	objectID := func(o fs.Object) string {
		if ider, ok := o.(fs.IDer); ok {
			return ider.ID()
		}
		return ""
	}

	// Count how many times each ID appears in the listing
	seen := make(map[string]int, len(objs))
	for _, o := range objs {
		if id := objectID(o); id != "" {
			seen[id]++
		}
	}

	// Filter objs in place, keeping the objects whose ID was seen once
	unique := objs[:0]
	for _, o := range objs {
		id := objectID(o)
		switch {
		case id == "":
			// no ID so not carried forward
		case seen[id] > 1:
			fs.Logf(o, "Ignoring as it appears %d times in the listing and deleting would lead to data loss", seen[id])
		default:
			unique = append(unique, o)
		}
	}

	// Group the objects which look identical under a common key
	groups := make(map[string][]fs.Object, len(unique))
	for _, o := range unique {
		key := ""
		switch {
		case ci.SizeOnly && o.Size() >= 0:
			key = fmt.Sprintf("size %d", o.Size())
		case ht != hash.None:
			if sum, err := o.Hash(ctx, ht); err == nil && sum != "" {
				key = fmt.Sprintf("%v %s", ht, sum)
			}
		}
		if key != "" {
			groups[key] = append(groups[key], o)
			continue
		}
		// Can't tell what this is identical to so hand it back
		remainingObjs = append(remainingObjs, o)
	}

	// Keep the first object of each group and delete the others,
	// handing back the ones which couldn't be deleted
	for key, group := range groups {
		remainingObjs = append(remainingObjs, group[0])
		extras := group[1:]
		if len(extras) == 0 {
			continue
		}
		fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(extras), len(group), key)
		for _, o := range extras {
			if err := DeleteFile(ctx, o); err != nil {
				remainingObjs = append(remainingObjs, o)
			}
		}
	}

	return remainingObjs
}

// dedupeList prints the duplicates in objs to standard output and
// changes nothing.
//
// A header line shows remote and the number of objects. It is followed
// by one numbered line per object giving its size and its modification
// time in the local time zone. If byHash is set the line ends with the
// path of the object, otherwise it ends with the hash type ht and the
// object's hash of that type, or the error from reading the hash.
//
// f is not used.
func dedupeList(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) {
	const timeFormat = "2006-01-02 15:04:05.000000000"

	fmt.Printf("%s: %d duplicates\n", remote, len(objs))
	for i, o := range objs {
		modTime := o.ModTime(ctx).Local().Format(timeFormat)
		if byHash {
			fmt.Printf("  %d: %12d bytes, %s, %s\n", i+1, o.Size(), modTime, o.Remote())
			continue
		}
		// The hash is only read here, where it is going to be printed,
		// as reading it may be expensive
		hashValue := ""
		if ht != hash.None {
			var err error
			hashValue, err = o.Hash(ctx, ht)
			if err != nil {
				hashValue = err.Error()
			}
		}
		fmt.Printf("  %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), modTime, ht, hashValue)
	}
}

// dedupeInteractive shows the duplicates in objs to the user and asks
// what to do with them.
//
// The user may skip them, keep one of their choosing (the others are
// deleted) or quit. Renaming them all is offered too unless byHash is
// set.
//
// It returns false if the user chose to quit and true otherwise.
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) bool {
	dedupeList(ctx, f, ht, remote, objs, byHash)

	// The first character of each entry is the key which selects it
	menu := []string{
		"sSkip and do nothing",
		"kKeep just one (choose which in next step)",
	}
	if !byHash {
		menu = append(menu, "rRename all to be different (by changing file.jpg to file-1.jpg)")
	}
	menu = append(menu, "qQuit")

	choice := config.Command(menu)
	if choice == 'q' {
		return false
	}
	switch choice {
	case 'k':
		// The user counts from 1 but the objects are indexed from 0
		chosen := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
		dedupeDeleteAllButOne(ctx, chosen-1, remote, objs)
	case 'r':
		dedupeRename(ctx, f, remote, objs)
	}
	return true
}

// DeduplicateMode selects what the dedupe command does with each set
// of duplicates it finds
type DeduplicateMode int

// The available DeduplicateMode values
const (
	DeduplicateInteractive DeduplicateMode = iota // ask the user what to do each time
	DeduplicateSkip                               // leave all the duplicates alone
	DeduplicateFirst                              // keep the first object
	DeduplicateNewest                             // keep the newest object
	DeduplicateOldest                             // keep the oldest object
	DeduplicateRename                             // give the objects different names
	DeduplicateLargest                            // keep the largest object
	DeduplicateSmallest                           // keep the smallest object
	DeduplicateList                               // just list the duplicates
)

// dedupeModeNames holds the name String and Set use for each
// DeduplicateMode, indexed by mode
var dedupeModeNames = [...]string{
	DeduplicateInteractive: "interactive",
	DeduplicateSkip:        "skip",
	DeduplicateFirst:       "first",
	DeduplicateNewest:      "newest",
	DeduplicateOldest:      "oldest",
	DeduplicateRename:      "rename",
	DeduplicateLargest:     "largest",
	DeduplicateSmallest:    "smallest",
	DeduplicateList:        "list",
}

// String returns the name of the mode or "unknown" if x isn't one of
// the defined modes
func (x DeduplicateMode) String() string {
	if x < 0 || int(x) >= len(dedupeModeNames) {
		return "unknown"
	}
	return dedupeModeNames[x]
}

// Set x to the mode named by s, ignoring the case of s
//
// If s isn't the name of a mode an error is returned and x is left as
// it was.
func (x *DeduplicateMode) Set(s string) error {
	wanted := strings.ToLower(s)
	for mode, name := range dedupeModeNames {
		if name == wanted {
			*x = DeduplicateMode(mode)
			return nil
		}
	}
	return fmt.Errorf("unknown mode for dedupe %q", s)
}

// Type returns the name of the type of the value
func (x *DeduplicateMode) Type() string {
	return "string"
}

// dedupeDir is a directory found while scanning for duplicate
// directories, with a count of the entries below it and a link to its
// parent
type dedupeDir struct {
	dir    fs.Directory // the directory itself, nil until it has turned up in the listing
	parent string       // key of the parent directory in the dedupeDirsMap, "" if there isn't one
	count  int          // number of entries counted in this directory and those below it
}

// dedupeDirsMap holds the directories found so far, keyed by ID, each
// with a count of everything found beneath it
type dedupeDirsMap map[string]*dedupeDir

// get returns the directory stored under id, storing and returning an
// empty one if there isn't one there yet
func (dm dedupeDirsMap) get(id string) *dedupeDir {
	if known := dm[id]; known != nil {
		return known
	}
	fresh := new(dedupeDir)
	dm[id] = fresh
	return fresh
}

// increment adds one to the count of the directory parent and of each
// of its ancestors in turn, stopping when a directory with no parent
// is reached
func (dm dedupeDirsMap) increment(parent string) {
	for parent != "" {
		d := dm.get(parent)
		d.count++
		parent = d.parent
	}
}

// dedupeFindDuplicateDirs lists f looking for directories which share
// the same path.
//
// It returns one slice for each duplicated path holding all the
// directories found with that path. The slices are ordered by path so
// that parents come before their children. Each directory carries a
// count of the entries which were found beneath it.
//
// The listing goes as deep as the MaxDepth config option allows. An
// error from the listing is returned wrapped.
func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) (duplicateDirs [][]*dedupeDir, err error) {
	var (
		byID     = dedupeDirsMap{}
		byRemote = map[string][]*dedupeDir{}
		ci       = fs.GetConfig(ctx)
	)

	// scan records the directories in one batch of the listing and
	// counts every entry against its parents
	scan := func(entries fs.DirEntries) error {
		for _, entry := range entries {
			tr := accounting.Stats(ctx).NewCheckingTransfer(entry, "merging")
			remote := entry.Remote()

			// Identify the parent by its ID if the entry knows it (this
			// usually means that the backend allows duplicate paths),
			// otherwise by its path.
			parent := ""
			if withParent, ok := entry.(fs.ParentIDer); ok {
				parent = withParent.ParentID()
			}
			if parent == "" {
				if parent = path.Dir(remote); parent == "." {
					parent = ""
				}
			}

			// Likewise identify the entry by its ID, or failing that by
			// its path.
			id := ""
			if withID, ok := entry.(fs.IDer); ok {
				id = withID.ID()
			}
			if id == "" {
				id = remote
			}

			if dir, isDir := entry.(fs.Directory); isDir {
				d := byID.get(id)
				d.dir, d.parent = dir, parent
				byRemote[remote] = append(byRemote[remote], d)
			}

			byID.increment(parent)
			tr.Done(ctx, nil)
		}
		return nil
	}
	if err = walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListAll, scan); err != nil {
		return nil, fmt.Errorf("find duplicate dirs: %w", err)
	}

	// Sort the duplicated paths to make sure parents are before children
	var names []string
	for name, found := range byRemote {
		if len(found) > 1 {
			names = append(names, name)
		}
	}
	sort.Strings(names)
	for _, name := range names {
		duplicateDirs = append(duplicateDirs, byRemote[name])
	}

	return duplicateDirs, nil
}

// dedupeMergeDuplicateDirs merges each set of duplicate directories in
// duplicateDirs into a single directory using the MergeDirs feature of
// f, then flushes the directory cache of f.
//
// It returns an error if f can't merge directories or can't flush its
// directory cache. A failure to merge one set of directories is counted
// and logged and the remaining sets are still merged.
func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]*dedupeDir) error {
	mergeDirs := f.Features().MergeDirs
	if mergeDirs == nil {
		return fmt.Errorf("%v: can't merge directories", f)
	}
	dirCacheFlush := f.Features().DirCacheFlush
	if dirCacheFlush == nil {
		return fmt.Errorf("%v: can't flush dir cache", f)
	}

	for _, group := range duplicateDirs {
		if SkipDestructive(ctx, group[0].dir, "merge duplicate directories") {
			continue
		}

		// Collect the directories, noting which has the most entries.
		// On a tie the earliest one wins.
		fsDirs := make([]fs.Directory, len(group))
		biggest, biggestCount := 0, -1
		for i, d := range group {
			fsDirs[i] = d.dir
			if d.count > biggestCount {
				biggest, biggestCount = i, d.count
			}
		}
		// Put the largest directory in front to minimize movements
		fsDirs[0], fsDirs[biggest] = fsDirs[biggest], fsDirs[0]

		fs.Infof(fsDirs[0], "Merging contents of duplicate directories")
		if err := mergeDirs(ctx, fsDirs); err != nil {
			err = fs.CountError(err)
			fs.Errorf(nil, "merge duplicate dirs: %v", err)
		}
	}

	dirCacheFlush()
	return nil
}

// sortOldestFirst sorts objs in place by modification time with the
// oldest object first
func sortOldestFirst(objs []fs.Object) {
	ctx := context.TODO()
	older := func(i, j int) bool {
		ti := objs[i].ModTime(ctx)
		tj := objs[j].ModTime(ctx)
		return ti.Before(tj)
	}
	sort.Slice(objs, older)
}

// sortSmallestFirst sorts objs in place by size with the smallest
// object first
func sortSmallestFirst(objs []fs.Object) {
	smaller := func(i, j int) bool {
		si, sj := objs[i].Size(), objs[j].Size()
		return si < sj
	}
	sort.Slice(objs, smaller)
}

// Deduplicate finds files on f which share the same name - or, if
// byHash is set, the same hash - and deals with each set of duplicates
// as directed by mode: by asking the user, by keeping a single file,
// by renaming the files, by skipping them or by just listing them.
//
// When working by name, duplicate directories are dealt with first
// (merged, or just listed in DeduplicateList mode). Then, except in
// DeduplicateList mode, identical copies of each file are deleted
// before mode is applied to the files which are left.
//
// Working by name is only useful with remotes which can have duplicate
// file names (e.g. drive, mega).
//
// An error is returned if byHash is set and either f has no hashes or
// mode is DeduplicateRename, and if listing f or merging the duplicate
// directories fails. A file which can't be hashed is logged and left
// out. If the user quits in interactive mode, nil is returned straight
// away.
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error {
	ci := fs.GetConfig(ctx)
	// find a hash to use
	ht := f.Hashes().GetOne()
	what := "names"
	if byHash {
		if ht == hash.None {
			return fmt.Errorf("%v has no hashes", f)
		}
		// Renaming makes the new names out of the key the duplicates
		// share. Here that is a hash, not a path, so renaming would
		// move the files to names built from their hash. Interactive
		// mode doesn't offer rename when working by hash either.
		if mode == DeduplicateRename {
			return fmt.Errorf("can't use %v mode when deduplicating by hash", mode)
		}
		what = ht.String() + " hashes"
	}
	fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode)

	// Find duplicate directories first and fix them
	if !byHash {
		duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
		if err != nil {
			return err
		}
		if len(duplicateDirs) > 0 {
			if mode != DeduplicateList {
				err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
				if err != nil {
					return err
				}
			} else {
				for _, dedupeDirs := range duplicateDirs {
					remote := dedupeDirs[0].dir.Remote()
					fmt.Printf("%s: %d duplicates of this directory\n", remote, len(dedupeDirs))
				}
			}
		}
	}

	// Now find duplicate files, collecting the objects under the key
	// they are compared by: the hash if byHash is set, otherwise the
	// path
	files := map[string][]fs.Object{}
	err := walk.ListR(ctx, f, "", false, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
		entries.ForObject(func(o fs.Object) {
			tr := accounting.Stats(ctx).NewCheckingTransfer(o, "checking")
			defer tr.Done(ctx, nil)

			var remote string
			var err error
			if byHash {
				remote, err = o.Hash(ctx, ht)
				if err != nil {
					fs.Errorf(o, "Failed to hash: %v", err)
					remote = ""
				}
			} else {
				remote = o.Remote()
			}
			// Objects with no key can't be compared so are left out
			if remote != "" {
				files[remote] = append(files[remote], o)
			}
		})
		return nil
	})
	if err != nil {
		return err
	}

	for remote, objs := range files {
		if len(objs) <= 1 {
			continue
		}
		fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
		if !byHash && mode != DeduplicateList {
			// Get rid of the identical copies first, which may leave
			// nothing for mode to decide
			objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
			if len(objs) <= 1 {
				fs.Logf(remote, "All duplicates removed")
				continue
			}
		}
		switch mode {
		case DeduplicateInteractive:
			if !dedupeInteractive(ctx, f, ht, remote, objs, byHash) {
				// The user asked to quit
				return nil
			}
		case DeduplicateFirst:
			dedupeDeleteAllButOne(ctx, 0, remote, objs)
		case DeduplicateNewest:
			sortOldestFirst(objs)
			dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
		case DeduplicateOldest:
			sortOldestFirst(objs)
			dedupeDeleteAllButOne(ctx, 0, remote, objs)
		case DeduplicateRename:
			dedupeRename(ctx, f, remote, objs)
		case DeduplicateLargest:
			sortSmallestFirst(objs)
			dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
		case DeduplicateSmallest:
			sortSmallestFirst(objs)
			dedupeDeleteAllButOne(ctx, 0, remote, objs)
		case DeduplicateSkip:
			fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what)
		case DeduplicateList:
			dedupeList(ctx, f, ht, remote, objs, byHash)
		default:
			//skip
		}
	}
	return nil
}