From 4ac9a65049581d20466a6426856297d597df28f5 Mon Sep 17 00:00:00 2001 From: Nick Craig-Wood Date: Fri, 8 Sep 2017 16:19:41 +0100 Subject: [PATCH] fs: stop normalizing file names but do a normalized compare in the sync This works by using a transform function to transform file names when doing a compare when matching file names in a directory. rclone now UTF-8 normalizes the file names and does a case insensitive compare if the destination remote is case insensitive. This deprecates the --local-no-unicode-normalization flag. Fixes #1477 --- docs/content/local.md | 12 ++--- fs/march.go | 115 +++++++++++++++++++++++++++++++++++------- fs/march_test.go | 66 +++++++++++++++++++++--- fs/sync_test.go | 30 +++++++++++ local/local.go | 10 ++-- 5 files changed, 194 insertions(+), 39 deletions(-) diff --git a/docs/content/local.md b/docs/content/local.md index a9067b7a6..de33b7387 100644 --- a/docs/content/local.md +++ b/docs/content/local.md @@ -122,15 +122,9 @@ $ rclone -L ls /tmp/a #### --local-no-unicode-normalization #### -By default rclone normalizes (NFC) the unicode representation of filenames and -directories. This flag disables that normalization and uses the same -representation as the local filesystem. - -This can be useful if you need to retain the local unicode representation and -you are using a cloud provider which supports unnormalized names (e.g. S3 or ACD). - -This should also work with any provider if you are using crypt and have file -name encryption (the default) or obfuscation turned on. +This flag is deprecated now. Rclone no longer normalizes unicode file +names, but it compares them with unicode normalization in the sync +routine instead. #### --one-file-system, -x #### diff --git a/fs/march.go b/fs/march.go index 3bfd702c0..cc117d34e 100644 --- a/fs/march.go +++ b/fs/march.go @@ -1,9 +1,13 @@ package fs import ( + "path" + "sort" + "strings" "sync" "golang.org/x/net/context" + "golang.org/x/text/unicode/norm" ) // march traverses two Fs simultaneously, calling walker for each match @@ -17,6 +21,7 @@ type march struct { // internal state srcListDir listDirFn // function to call to list a directory in the src dstListDir listDirFn // function to call to list a directory in the dst + transforms []matchTransformFn } // marcher is called on each match @@ -40,6 +45,18 @@ func newMarch(ctx context.Context, fdst, fsrc Fs, dir string, callback marcher) } m.srcListDir = m.makeListDir(fsrc, false) m.dstListDir = m.makeListDir(fdst, Config.Filter.DeleteExcluded) + // Now create the matching transform + // ..normalise the UTF8 first + m.transforms = append(m.transforms, norm.NFC.String) + // ..if destination is caseInsensitive then make it lower case + // case Insensitive | src | dst | lower case compare | + // | No | No | No | + // | Yes | No | No | + // | No | Yes | Yes | + // | Yes | Yes | Yes | + if fdst.Features().CaseInsensitive { + m.transforms = append(m.transforms, strings.ToLower) + } return m } @@ -156,58 +173,122 @@ func (m *march) aborting() bool { return false } +// matchEntry is an entry plus transformed name +type matchEntry struct { + entry DirEntry + leaf string + name string +} + +// matchEntries contains many matchEntry~s +type matchEntries []matchEntry + +// Len is part of sort.Interface. +func (es matchEntries) Len() int { return len(es) } + +// Swap is part of sort.Interface. +func (es matchEntries) Swap(i, j int) { es[i], es[j] = es[j], es[i] } + +// Less is part of sort.Interface. +// +// Compare in order (name, leaf, remote) +func (es matchEntries) Less(i, j int) bool { + ei, ej := &es[i], &es[j] + if ei.name == ej.name { + if ei.leaf == ej.leaf { + return ei.entry.Remote() < ej.entry.Remote() + } + return ei.leaf < ej.leaf + } + return ei.name < ej.name +} + +// Sort the directory entries by (name, leaf, remote) +// +// We use a stable sort here just in case there are +// duplicates. Assuming the remote delivers the entries in a +// consistent order, this will give the best user experience +// in syncing as it will use the first entry for the sync +// comparison. +func (es matchEntries) sort() { + sort.Stable(es) +} + +// make a matchEntries from a newMatch entries +func newMatchEntries(entries DirEntries, transforms []matchTransformFn) matchEntries { + es := make(matchEntries, len(entries)) + for i := range es { + es[i].entry = entries[i] + name := path.Base(entries[i].Remote()) + es[i].leaf = name + for _, transform := range transforms { + name = transform(name) + } + es[i].name = name + } + es.sort() + return es +} + +// matchPair is a matched pair of direntries returned by matchListings type matchPair struct { src, dst DirEntry } -// Process the two sorted listings, matching up the items in the two -// sorted slices +// matchTransformFn converts a name into a form which is used for +// comparison in matchListings. +type matchTransformFn func(name string) string + +// Process the two listings, matching up the items in the two slices +// using the transform function on each name first. // // Into srcOnly go Entries which only exist in the srcList // Into dstOnly go Entries which only exist in the dstList // Into matches go matchPair's of src and dst which have the same name // // This checks for duplicates and checks the list is sorted. -func matchListings(srcList, dstList DirEntries) (srcOnly DirEntries, dstOnly DirEntries, matches []matchPair) { +func matchListings(srcListEntries, dstListEntries DirEntries, transforms []matchTransformFn) (srcOnly DirEntries, dstOnly DirEntries, matches []matchPair) { + srcList := newMatchEntries(srcListEntries, transforms) + dstList := newMatchEntries(dstListEntries, transforms) for iSrc, iDst := 0, 0; ; iSrc, iDst = iSrc+1, iDst+1 { var src, dst DirEntry - var srcRemote, dstRemote string + var srcName, dstName string if iSrc < len(srcList) { - src = srcList[iSrc] - srcRemote = src.Remote() + src = srcList[iSrc].entry + srcName = srcList[iSrc].name } if iDst < len(dstList) { - dst = dstList[iDst] - dstRemote = dst.Remote() + dst = dstList[iDst].entry + dstName = dstList[iDst].name } if src == nil && dst == nil { break } if src != nil && iSrc > 0 { - prev := srcList[iSrc-1].Remote() - if srcRemote == prev { + prev := srcList[iSrc-1].name + if srcName == prev { Logf(src, "Duplicate %s found in source - ignoring", DirEntryType(src)) src = nil // ignore the src - } else if srcRemote < prev { + } else if srcName < prev { Errorf(src, "Out of order listing in source") src = nil // ignore the src } } if dst != nil && iDst > 0 { - prev := dstList[iDst-1].Remote() - if dstRemote == prev { + prev := dstList[iDst-1].name + if dstName == prev { Logf(dst, "Duplicate %s found in destination - ignoring", DirEntryType(dst)) dst = nil // ignore the dst - } else if dstRemote < prev { + } else if dstName < prev { Errorf(dst, "Out of order listing in destination") dst = nil // ignore the dst } } if src != nil && dst != nil { - if srcRemote < dstRemote { + if srcName < dstName { dst = nil iDst-- // retry the dst - } else if srcRemote > dstRemote { + } else if srcName > dstName { src = nil iSrc-- // retry the src } @@ -271,7 +352,7 @@ func (m *march) processJob(job listDirJob) (jobs []listDirJob) { } // Work out what to do and do it - srcOnly, dstOnly, matches := matchListings(srcList, dstList) + srcOnly, dstOnly, matches := matchListings(srcList, dstList, m.transforms) for _, src := range srcOnly { if m.aborting() { return nil diff --git a/fs/march_test.go b/fs/march_test.go index c5df0192b..b07fa5178 100644 --- a/fs/march_test.go +++ b/fs/march_test.go @@ -3,25 +3,53 @@ package fs import ( + "strings" "testing" "github.com/stretchr/testify/assert" ) +func TestNewMatchEntries(t *testing.T) { + var ( + a = mockObject("path/a") + A = mockObject("path/A") + B = mockObject("path/B") + c = mockObject("path/c") + ) + + es := newMatchEntries(DirEntries{a, A, B, c}, nil) + assert.Equal(t, es, matchEntries{ + {name: "A", leaf: "A", entry: A}, + {name: "B", leaf: "B", entry: B}, + {name: "a", leaf: "a", entry: a}, + {name: "c", leaf: "c", entry: c}, + }) + + es = newMatchEntries(DirEntries{a, A, B, c}, []matchTransformFn{strings.ToLower}) + assert.Equal(t, es, matchEntries{ + {name: "a", leaf: "A", entry: A}, + {name: "a", leaf: "a", entry: a}, + {name: "b", leaf: "B", entry: B}, + {name: "c", leaf: "c", entry: c}, + }) +} + func TestMatchListings(t *testing.T) { var ( a = mockObject("a") + A = mockObject("A") b = mockObject("b") c = mockObject("c") d = mockObject("d") ) for _, test := range []struct { - what string - input DirEntries // pairs of input src, dst - srcOnly DirEntries - dstOnly DirEntries - matches []matchPair // pairs of output + what string + input DirEntries // pairs of input src, dst + srcOnly DirEntries + dstOnly DirEntries + matches []matchPair // pairs of output + transforms []matchTransformFn }{ { what: "only src or dst", @@ -92,6 +120,28 @@ func TestMatchListings(t *testing.T) { }, }, { + what: "Case insensitive duplicate - no transform", + input: DirEntries{ + a, a, + A, A, + }, + matches: []matchPair{ + {A, A}, + {a, a}, + }, + }, + { + what: "Case insensitive duplicate - transform to lower case", + input: DirEntries{ + a, a, + A, A, + }, + matches: []matchPair{ + {A, A}, + }, + transforms: []matchTransformFn{strings.ToLower}, + }, + /*{ what: "Out of order", input: DirEntries{ c, nil, @@ -104,7 +154,7 @@ func TestMatchListings(t *testing.T) { dstOnly: DirEntries{ b, }, - }, + },*/ } { var srcList, dstList DirEntries for i := 0; i < len(test.input); i += 2 { @@ -116,12 +166,12 @@ func TestMatchListings(t *testing.T) { dstList = append(dstList, dst) } } - srcOnly, dstOnly, matches := matchListings(srcList, dstList) + srcOnly, dstOnly, matches := matchListings(srcList, dstList, test.transforms) assert.Equal(t, test.srcOnly, srcOnly, test.what) assert.Equal(t, test.dstOnly, dstOnly, test.what) assert.Equal(t, test.matches, matches, test.what) // now swap src and dst - dstOnly, srcOnly, matches = matchListings(dstList, srcList) + dstOnly, srcOnly, matches = matchListings(dstList, srcList, test.transforms) assert.Equal(t, test.srcOnly, srcOnly, test.what) assert.Equal(t, test.dstOnly, dstOnly, test.what) assert.Equal(t, test.matches, matches, test.what) diff --git a/fs/sync_test.go b/fs/sync_test.go index ca8611c6d..9e9a10d44 100644 --- a/fs/sync_test.go +++ b/fs/sync_test.go @@ -10,6 +10,7 @@ import ( "github.com/ncw/rclone/fstest" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "golang.org/x/text/unicode/norm" ) // Check dry run is working @@ -961,3 +962,32 @@ func testSyncBackupDir(t *testing.T, suffix string) { } func TestSyncBackupDir(t *testing.T) { testSyncBackupDir(t, "") } func TestSyncBackupDirWithSuffix(t *testing.T) { testSyncBackupDir(t, ".bak") } + +// Check we can sync two files with differing UTF-8 representations +func TestSyncUTFNorm(t *testing.T) { + r := NewRun(t) + defer r.Finalise() + + // Two strings with different unicode normalization (from OS X) + Encoding1 := "Testêé" + Encoding2 := "Testêé" + assert.NotEqual(t, Encoding1, Encoding2) + assert.Equal(t, norm.NFC.String(Encoding1), norm.NFC.String(Encoding2)) + + file1 := r.WriteFile(Encoding1, "This is a test", t1) + fstest.CheckItems(t, r.flocal, file1) + + file2 := r.WriteObject(Encoding2, "This is a old test", t2) + fstest.CheckItems(t, r.fremote, file2) + + fs.Stats.ResetCounters() + err := fs.Sync(r.fremote, r.flocal) + require.NoError(t, err) + + // We should have transferred exactly one file, but kept the + // normalized state of the file. + assert.Equal(t, int64(1), fs.Stats.GetTransfers()) + fstest.CheckItems(t, r.flocal, file1) + file1.Path = file2.Path + fstest.CheckItems(t, r.fremote, file1) +} diff --git a/local/local.go b/local/local.go index ae4d477c1..8c7b82321 100644 --- a/local/local.go +++ b/local/local.go @@ -15,10 +15,9 @@ import ( "time" "unicode/utf8" - "golang.org/x/text/unicode/norm" - "github.com/ncw/rclone/fs" "github.com/pkg/errors" + "google.golang.org/appengine/log" ) var ( @@ -82,6 +81,10 @@ type Object struct { func NewFs(name, root string) (fs.Fs, error) { var err error + if *noUTFNorm { + log.Errorf(nil, "The --local-no-unicode-normalization flag is deprecated and will be removed") + } + nounc := fs.ConfigFileGet(name, "nounc") f := &Fs{ name: name, @@ -273,9 +276,6 @@ func (f *Fs) cleanRemote(name string) string { f.wmu.Unlock() name = string([]rune(name)) } - if !*noUTFNorm { - name = norm.NFC.String(name) - } name = filepath.ToSlash(name) return name }