dirtree: fix performance with large directories of directories and --fast-list

Before this change if using --fast-list on a directory with more than
a few thousand directories in it DirTree.CheckParents became very slow
taking up to 24 hours for a directory with 1,000,000 directories in
it.

This is because it becomes an O(N²) operation as DirTree.Find has to
search each directory in a linear fashion as it is stored as a slice.

This patch fixes the problem by scanning the DirTree for directories
before starting the CheckParents process so it never has to call
DirTree.Find.

After the fix calling DirTree.CheckParents on a directory with
1,000,000 directories in it will take about 1 second.

Anything which calls DirTree.Find can potentially have bad performance
so in the future we should redesign the DirTree to use a different
underlying datastructure or have an index.

https://forum.rclone.org/t/almost-24-hours-cpu-compute-time-during-sync-between-two-large-s3-buckets/39375/
This commit is contained in:
Nick Craig-Wood 2023-06-29 17:38:18 +01:00
parent a8ca18165e
commit 07133b892d
2 changed files with 63 additions and 13 deletions

View File

@ -63,10 +63,12 @@ func (dt DirTree) AddEntry(entry fs.DirEntry) {
panic("unknown entry type")
}
remoteParent := parentDir(entry.Remote())
dt.CheckParent("", remoteParent)
dt.checkParent("", remoteParent, nil)
}
// Find returns the DirEntry for filePath or nil if not found
//
// None that Find does a O(N) search so can be slow
func (dt DirTree) Find(filePath string) (parentPath string, entry fs.DirEntry) {
parentPath = parentDir(filePath)
for _, entry := range dt[parentPath] {
@ -77,23 +79,52 @@ func (dt DirTree) Find(filePath string) (parentPath string, entry fs.DirEntry) {
return parentPath, nil
}
// CheckParent checks that dirPath has a *Dir in its parent
func (dt DirTree) CheckParent(root, dirPath string) {
if dirPath == root {
return
// checkParent checks that dirPath has a *Dir in its parent
//
// If dirs is not nil it must contain entries for every *Dir found in
// the tree. It is used to speed up the checking when calling this
// repeatedly.
func (dt DirTree) checkParent(root, dirPath string, dirs map[string]struct{}) {
var parentPath string
for {
if dirPath == root {
return
}
// Can rely on dirs to have all directories in it so
// we don't need to call Find.
if dirs != nil {
if _, found := dirs[dirPath]; found {
return
}
parentPath = parentDir(dirPath)
} else {
var entry fs.DirEntry
parentPath, entry = dt.Find(dirPath)
if entry != nil {
return
}
}
dt[parentPath] = append(dt[parentPath], fs.NewDir(dirPath, time.Now()))
if dirs != nil {
dirs[dirPath] = struct{}{}
}
dirPath = parentPath
}
parentPath, entry := dt.Find(dirPath)
if entry != nil {
return
}
dt[parentPath] = append(dt[parentPath], fs.NewDir(dirPath, time.Now()))
dt.CheckParent(root, parentPath)
}
// CheckParents checks every directory in the tree has *Dir in its parent
func (dt DirTree) CheckParents(root string) {
dirs := make(map[string]struct{})
// Find all the directories and stick them in dirs
for _, entries := range dt {
for _, entry := range entries {
if _, ok := entry.(fs.Directory); ok {
dirs[entry.Remote()] = struct{}{}
}
}
}
for dirPath := range dt {
dt.CheckParent(root, dirPath)
dt.checkParent(root, dirPath, dirs)
}
}

View File

@ -1,6 +1,7 @@
package dirtree
import (
"fmt"
"testing"
"github.com/rclone/rclone/fstest/mockdir"
@ -108,7 +109,7 @@ func TestDirTreeCheckParent(t *testing.T) {
sausage
`, dt.String())
dt.CheckParent("", "dir/subdir")
dt.checkParent("", "dir/subdir", nil)
assert.Equal(t, `/
dir/
@ -200,3 +201,21 @@ dir2/
`, dt.String())
}
func BenchmarkCheckParents(b *testing.B) {
for _, N := range []int{1e2, 1e3, 1e4, 1e5, 1e6} {
b.Run(fmt.Sprintf("%d", N), func(b *testing.B) {
b.StopTimer()
dt := New()
for i := 0; i < N; i++ {
remote := fmt.Sprintf("dir%09d/file%09d.txt", i, 1)
o := mockobject.New(remote)
dt.Add(o)
}
b.StartTimer()
for n := 0; n < b.N; n++ {
dt.CheckParents("")
}
})
}
}