// Package internetarchive provides an interface to Internet Archive's Item // via their native API than using S3-compatible endpoints. package internetarchive import ( "bytes" "context" "encoding/json" "errors" "fmt" "io" "net/http" "net/url" "path" "regexp" "strconv" "strings" "time" "github.com/ncw/swift/v2" "github.com/rclone/rclone/fs" "github.com/rclone/rclone/fs/config" "github.com/rclone/rclone/fs/config/configmap" "github.com/rclone/rclone/fs/config/configstruct" "github.com/rclone/rclone/fs/fserrors" "github.com/rclone/rclone/fs/fshttp" "github.com/rclone/rclone/fs/hash" "github.com/rclone/rclone/lib/bucket" "github.com/rclone/rclone/lib/encoder" "github.com/rclone/rclone/lib/pacer" "github.com/rclone/rclone/lib/random" "github.com/rclone/rclone/lib/rest" ) // Register with Fs func init() { fs.Register(&fs.RegInfo{ Name: "internetarchive", Description: "Internet Archive", NewFs: NewFs, MetadataInfo: &fs.MetadataInfo{ System: map[string]fs.MetadataHelp{ "name": { Help: "Full file path, without the bucket part", Type: "filename", Example: "backend/internetarchive/internetarchive.go", ReadOnly: true, }, "source": { Help: "The source of the file", Type: "string", Example: "original", ReadOnly: true, }, "mtime": { Help: "Time of last modification, managed by Rclone", Type: "RFC 3339", Example: "2006-01-02T15:04:05.999999999Z", ReadOnly: true, }, "size": { Help: "File size in bytes", Type: "decimal number", Example: "123456", ReadOnly: true, }, "md5": { Help: "MD5 hash calculated by Internet Archive", Type: "string", Example: "01234567012345670123456701234567", ReadOnly: true, }, "crc32": { Help: "CRC32 calculated by Internet Archive", Type: "string", Example: "01234567", ReadOnly: true, }, "sha1": { Help: "SHA1 hash calculated by Internet Archive", Type: "string", Example: "0123456701234567012345670123456701234567", ReadOnly: true, }, "format": { Help: "Name of format identified by Internet Archive", Type: "string", Example: "Comma-Separated Values", ReadOnly: true, }, "old_version": { Help: "Whether the file was replaced and moved by keep-old-version flag", Type: "boolean", Example: "true", ReadOnly: true, }, "viruscheck": { Help: "The last time viruscheck process was run for the file (?)", Type: "unixtime", Example: "1654191352", ReadOnly: true, }, "summation": { Help: "Check https://forum.rclone.org/t/31922 for how it is used", Type: "string", Example: "md5", ReadOnly: true, }, "rclone-ia-mtime": { Help: "Time of last modification, managed by Internet Archive", Type: "RFC 3339", Example: "2006-01-02T15:04:05.999999999Z", }, "rclone-mtime": { Help: "Time of last modification, managed by Rclone", Type: "RFC 3339", Example: "2006-01-02T15:04:05.999999999Z", }, "rclone-update-track": { Help: "Random value used by Rclone for tracking changes inside Internet Archive", Type: "string", Example: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", }, }, Help: `Metadata fields provided by Internet Archive. If there are multiple values for a key, only the first one is returned. This is a limitation of Rclone, that supports one value per one key. Owner is able to add custom keys. Metadata feature grabs all the keys including them. `, }, Options: []fs.Option{{ Name: "access_key_id", Help: "IAS3 Access Key.\n\nLeave blank for anonymous access.\nYou can find one here: https://archive.org/account/s3.php", }, { Name: "secret_access_key", Help: "IAS3 Secret Key (password).\n\nLeave blank for anonymous access.", }, { // their official client (https://github.com/jjjake/internetarchive) hardcodes following the two Name: "endpoint", Help: "IAS3 Endpoint.\n\nLeave blank for default value.", Default: "https://s3.us.archive.org", Advanced: true, }, { Name: "front_endpoint", Help: "Host of InternetArchive Frontend.\n\nLeave blank for default value.", Default: "https://archive.org", Advanced: true, }, { Name: "disable_checksum", Help: `Don't ask the server to test against MD5 checksum calculated by rclone. Normally rclone will calculate the MD5 checksum of the input before uploading it so it can ask the server to check the object against checksum. This is great for data integrity checking but can cause long delays for large files to start uploading.`, Default: true, Advanced: true, }, { Name: "wait_archive", Help: `Timeout for waiting the server's processing tasks (specifically archive and book_op) to finish. Only enable if you need to be guaranteed to be reflected after write operations. 0 to disable waiting. No errors to be thrown in case of timeout.`, Default: fs.Duration(0), Advanced: true, }, { Name: config.ConfigEncoding, Help: config.ConfigEncodingHelp, Advanced: true, Default: encoder.EncodeZero | encoder.EncodeSlash | encoder.EncodeLtGt | encoder.EncodeCrLf | encoder.EncodeDel | encoder.EncodeCtl | encoder.EncodeInvalidUtf8 | encoder.EncodeDot, }, }}) } // maximum size of an item. this is constant across all items const iaItemMaxSize int64 = 1099511627776 // metadata keys that are not writeable var roMetadataKey = map[string]interface{}{ // do not add mtime here, it's a documented exception "name": nil, "source": nil, "size": nil, "md5": nil, "crc32": nil, "sha1": nil, "format": nil, "old_version": nil, "viruscheck": nil, "summation": nil, } // Options defines the configuration for this backend type Options struct { AccessKeyID string `config:"access_key_id"` SecretAccessKey string `config:"secret_access_key"` Endpoint string `config:"endpoint"` FrontEndpoint string `config:"front_endpoint"` DisableChecksum bool `config:"disable_checksum"` WaitArchive fs.Duration `config:"wait_archive"` Enc encoder.MultiEncoder `config:"encoding"` } // Fs represents an IAS3 remote type Fs struct { name string // name of this remote root string // the path we are working on if any opt Options // parsed config options features *fs.Features // optional features srv *rest.Client // the connection to IAS3 front *rest.Client // the connection to frontend pacer *fs.Pacer // pacer for API calls ctx context.Context } // Object describes a file at IA type Object struct { fs *Fs // reference to Fs remote string // the remote path modTime time.Time // last modified time size int64 // size of the file in bytes md5 string // md5 hash of the file presented by the server sha1 string // sha1 hash of the file presented by the server crc32 string // crc32 of the file presented by the server rawData json.RawMessage } // IAFile represents a subset of object in MetadataResponse.Files type IAFile struct { Name string `json:"name"` // Source string `json:"source"` Mtime string `json:"mtime"` RcloneMtime json.RawMessage `json:"rclone-mtime"` UpdateTrack json.RawMessage `json:"rclone-update-track"` Size string `json:"size"` Md5 string `json:"md5"` Crc32 string `json:"crc32"` Sha1 string `json:"sha1"` Summation string `json:"summation"` rawData json.RawMessage } // MetadataResponse represents subset of the JSON object returned by (frontend)/metadata/ type MetadataResponse struct { Files []IAFile `json:"files"` ItemSize int64 `json:"item_size"` } // MetadataResponseRaw is the form of MetadataResponse to deal with metadata type MetadataResponseRaw struct { Files []json.RawMessage `json:"files"` ItemSize int64 `json:"item_size"` } // ModMetadataResponse represents response for amending metadata type ModMetadataResponse struct { // https://archive.org/services/docs/api/md-write.html#example Success bool `json:"success"` Error string `json:"error"` } // Name of the remote (as passed into NewFs) func (f *Fs) Name() string { return f.name } // Root of the remote (as passed into NewFs) func (f *Fs) Root() string { return f.root } // String converts this Fs to a string func (f *Fs) String() string { bucket, file := f.split("") if bucket == "" { return "Internet Archive root" } if file == "" { return fmt.Sprintf("Internet Archive item %s", bucket) } return fmt.Sprintf("Internet Archive item %s path %s", bucket, file) } // Features returns the optional features of this Fs func (f *Fs) Features() *fs.Features { return f.features } // Hashes returns type of hashes supported by IA func (f *Fs) Hashes() hash.Set { return hash.NewHashSet(hash.MD5, hash.SHA1, hash.CRC32) } // Precision returns the precision of mtime that the server responds func (f *Fs) Precision() time.Duration { if f.opt.WaitArchive == 0 { return fs.ModTimeNotSupported } return time.Nanosecond } // retryErrorCodes is a slice of error codes that we will retry // See: https://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html var retryErrorCodes = []int{ 429, // Too Many Requests 500, // Internal Server Error - "We encountered an internal error. Please try again." 503, // Service Unavailable/Slow Down - "Reduce your request rate" } // NewFs constructs an Fs from the path func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) { // Parse config into Options struct opt := new(Options) err := configstruct.Set(m, opt) if err != nil { return nil, err } // Parse the endpoints ep, err := url.Parse(opt.Endpoint) if err != nil { return nil, err } fe, err := url.Parse(opt.FrontEndpoint) if err != nil { return nil, err } root = strings.Trim(root, "/") f := &Fs{ name: name, opt: *opt, ctx: ctx, } f.setRoot(root) f.features = (&fs.Features{ BucketBased: true, ReadMetadata: true, WriteMetadata: true, UserMetadata: true, }).Fill(ctx, f) f.srv = rest.NewClient(fshttp.NewClient(ctx)) f.srv.SetRoot(ep.String()) f.front = rest.NewClient(fshttp.NewClient(ctx)) f.front.SetRoot(fe.String()) if opt.AccessKeyID != "" && opt.SecretAccessKey != "" { auth := fmt.Sprintf("LOW %s:%s", opt.AccessKeyID, opt.SecretAccessKey) f.srv.SetHeader("Authorization", auth) f.front.SetHeader("Authorization", auth) } f.pacer = fs.NewPacer(ctx, pacer.NewS3(pacer.MinSleep(10*time.Millisecond))) // test if the root exists as a file _, err = f.NewObject(ctx, "/") if err == nil { f.setRoot(betterPathDir(root)) return f, fs.ErrorIsFile } return f, nil } // setRoot changes the root of the Fs func (f *Fs) setRoot(root string) { f.root = strings.Trim(root, "/") } // Remote returns the remote path func (o *Object) Remote() string { return o.remote } // ModTime is the last modified time (read-only) func (o *Object) ModTime(ctx context.Context) time.Time { return o.modTime } // Size is the file length func (o *Object) Size() int64 { return o.size } // Fs returns the parent Fs func (o *Object) Fs() fs.Info { return o.fs } // Hash returns the hash value presented by IA func (o *Object) Hash(ctx context.Context, ty hash.Type) (string, error) { if ty == hash.MD5 { return o.md5, nil } if ty == hash.SHA1 { return o.sha1, nil } if ty == hash.CRC32 { return o.crc32, nil } return "", hash.ErrUnsupported } // Storable returns if this object is storable func (o *Object) Storable() bool { return true } // SetModTime sets modTime on a particular file func (o *Object) SetModTime(ctx context.Context, t time.Time) (err error) { bucket, reqDir := o.split() if bucket == "" { return fs.ErrorCantSetModTime } if reqDir == "" { return fs.ErrorCantSetModTime } // https://archive.org/services/docs/api/md-write.html // the following code might be useful for modifying metadata of an uploaded file patch := []map[string]string{ // we should drop it first to clear all rclone-provided mtimes { "op": "remove", "path": "/rclone-mtime", }, { "op": "add", "path": "/rclone-mtime", "value": t.Format(time.RFC3339Nano), }} res, err := json.Marshal(patch) if err != nil { return err } params := url.Values{} params.Add("-target", fmt.Sprintf("files/%s", reqDir)) params.Add("-patch", string(res)) body := []byte(params.Encode()) bodyLen := int64(len(body)) var resp *http.Response var result ModMetadataResponse // make a POST request to (frontend)/metadata/:item/ opts := rest.Opts{ Method: "POST", Path: path.Join("/metadata/", bucket), Body: bytes.NewReader(body), ContentLength: &bodyLen, ContentType: "application/x-www-form-urlencoded", } err = o.fs.pacer.Call(func() (bool, error) { resp, err = o.fs.front.CallJSON(ctx, &opts, nil, &result) return o.fs.shouldRetry(resp, err) }) if err != nil { return err } if result.Success { o.modTime = t return nil } return errors.New(result.Error) } // List files and directories in a directory func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) { bucket, reqDir := f.split(dir) if bucket == "" { if reqDir != "" { return nil, fs.ErrorListBucketRequired } return entries, nil } grandparent := f.opt.Enc.ToStandardPath(strings.Trim(path.Join(bucket, reqDir), "/") + "/") allEntries, err := f.listAllUnconstrained(ctx, bucket) if err != nil { return entries, err } for _, ent := range allEntries { obj, ok := ent.(*Object) if ok && strings.HasPrefix(obj.remote, grandparent) { path := trimPathPrefix(obj.remote, grandparent, f.opt.Enc) if !strings.Contains(path, "/") { obj.remote = trimPathPrefix(obj.remote, f.root, f.opt.Enc) entries = append(entries, obj) } } dire, ok := ent.(*fs.Dir) if ok && strings.HasPrefix(dire.Remote(), grandparent) { path := trimPathPrefix(dire.Remote(), grandparent, f.opt.Enc) if !strings.Contains(path, "/") { dire.SetRemote(trimPathPrefix(dire.Remote(), f.root, f.opt.Enc)) entries = append(entries, dire) } } } return entries, nil } // Mkdir can't be performed on IA like git repositories func (f *Fs) Mkdir(ctx context.Context, dir string) (err error) { return nil } // Rmdir as well, unless we're asked for recursive deletion func (f *Fs) Rmdir(ctx context.Context, dir string) error { return nil } // NewObject finds the Object at remote. If it can't be found // it returns the error fs.ErrorObjectNotFound. func (f *Fs) NewObject(ctx context.Context, remote string) (ret fs.Object, err error) { bucket, filepath := f.split(remote) filepath = strings.Trim(filepath, "/") if bucket == "" { if filepath != "" { return nil, fs.ErrorListBucketRequired } return nil, fs.ErrorIsDir } grandparent := f.opt.Enc.ToStandardPath(strings.Trim(path.Join(bucket, filepath), "/")) allEntries, err := f.listAllUnconstrained(ctx, bucket) if err != nil { return nil, err } for _, ent := range allEntries { obj, ok := ent.(*Object) if ok && obj.remote == grandparent { obj.remote = trimPathPrefix(obj.remote, f.root, f.opt.Enc) return obj, nil } } return nil, fs.ErrorObjectNotFound } // Put uploads a file func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) { o := &Object{ fs: f, remote: src.Remote(), modTime: src.ModTime(ctx), size: src.Size(), } err := o.Update(ctx, in, src, options...) if err == nil { return o, nil } return nil, err } // PublicLink generates a public link to the remote path (usually readable by anyone) func (f *Fs) PublicLink(ctx context.Context, remote string, expire fs.Duration, unlink bool) (link string, err error) { if strings.HasSuffix(remote, "/") { return "", fs.ErrorCantShareDirectories } if _, err := f.NewObject(ctx, remote); err != nil { return "", err } bucket, bucketPath := f.split(remote) return path.Join(f.opt.FrontEndpoint, "/download/", bucket, quotePath(bucketPath)), nil } // Copy src to this remote using server-side copy operations. // // This is stored with the remote path given. // // It returns the destination Object and a possible error. // // Will only be called if src.Fs().Name() == f.Name() // // If it isn't possible then return fs.ErrorCantCopy func (f *Fs) Copy(ctx context.Context, src fs.Object, remote string) (_ fs.Object, err error) { dstBucket, dstPath := f.split(remote) srcObj, ok := src.(*Object) if !ok { fs.Debugf(src, "Can't copy - not same remote type") return nil, fs.ErrorCantCopy } srcBucket, srcPath := srcObj.split() if dstBucket == srcBucket && dstPath == srcPath { // https://github.com/jjjake/internetarchive/blob/2456376533251df9d05e0a14d796ec1ced4959f5/internetarchive/cli/ia_copy.py#L68 fs.Debugf(src, "Can't copy - the source and destination files cannot be the same!") return nil, fs.ErrorCantCopy } updateTracker := random.String(32) headers := map[string]string{ "x-archive-auto-make-bucket": "1", "x-archive-queue-derive": "0", "x-archive-keep-old-version": "0", "x-amz-copy-source": quotePath(path.Join("/", srcBucket, srcPath)), "x-amz-metadata-directive": "COPY", "x-archive-filemeta-sha1": srcObj.sha1, "x-archive-filemeta-md5": srcObj.md5, "x-archive-filemeta-crc32": srcObj.crc32, "x-archive-filemeta-size": fmt.Sprint(srcObj.size), // add this too for sure "x-archive-filemeta-rclone-mtime": srcObj.modTime.Format(time.RFC3339Nano), "x-archive-filemeta-rclone-update-track": updateTracker, } // make a PUT request at (IAS3)/:item/:path without body var resp *http.Response opts := rest.Opts{ Method: "PUT", Path: "/" + url.PathEscape(path.Join(dstBucket, dstPath)), ExtraHeaders: headers, } err = f.pacer.Call(func() (bool, error) { resp, err = f.srv.Call(ctx, &opts) return f.shouldRetry(resp, err) }) if err != nil { return nil, err } // we can't update/find metadata here as IA will also // queue server-side copy as well as upload/delete. return f.waitFileUpload(ctx, trimPathPrefix(path.Join(dstBucket, dstPath), f.root, f.opt.Enc), updateTracker, srcObj.size) } // ListR lists the objects and directories of the Fs starting // from dir recursively into out. // // dir should be "" to start from the root, and should not // have trailing slashes. // // This should return ErrDirNotFound if the directory isn't // found. // // It should call callback for each tranche of entries read. // These need not be returned in any particular order. If // callback returns an error then the listing will stop // immediately. // // Don't implement this unless you have a more efficient way // of listing recursively than doing a directory traversal. func (f *Fs) ListR(ctx context.Context, dir string, callback fs.ListRCallback) (err error) { var allEntries, entries fs.DirEntries bucket, reqDir := f.split(dir) if bucket == "" { if reqDir != "" { return fs.ErrorListBucketRequired } return callback(entries) } grandparent := f.opt.Enc.ToStandardPath(strings.Trim(path.Join(bucket, reqDir), "/") + "/") allEntries, err = f.listAllUnconstrained(ctx, bucket) if err != nil { return err } for _, ent := range allEntries { obj, ok := ent.(*Object) if ok && strings.HasPrefix(obj.remote, grandparent) { obj.remote = trimPathPrefix(obj.remote, f.root, f.opt.Enc) entries = append(entries, obj) } dire, ok := ent.(*fs.Dir) if ok && strings.HasPrefix(dire.Remote(), grandparent) { dire.SetRemote(trimPathPrefix(dire.Remote(), f.root, f.opt.Enc)) entries = append(entries, dire) } } return callback(entries) } // CleanUp removes all files inside history/ func (f *Fs) CleanUp(ctx context.Context) (err error) { bucket, _ := f.split("/") if bucket == "" { return fs.ErrorListBucketRequired } entries, err := f.listAllUnconstrained(ctx, bucket) if err != nil { return err } for _, ent := range entries { obj, ok := ent.(*Object) if ok && strings.HasPrefix(obj.remote, bucket+"/history/") { err = obj.Remove(ctx) if err != nil { return err } } // we can fully ignore directories, as they're just virtual entries to // comply with rclone's requirement } return nil } // About returns things about remaining and used spaces func (f *Fs) About(ctx context.Context) (_ *fs.Usage, err error) { bucket, _ := f.split("/") if bucket == "" { return nil, fs.ErrorListBucketRequired } result, err := f.requestMetadata(ctx, bucket) if err != nil { return nil, err } // perform low-level operation here since it's ridiculous to make 2 same requests var historySize int64 for _, ent := range result.Files { if strings.HasPrefix(ent.Name, "history/") { size := parseSize(ent.Size) if size < 0 { // parse error can be ignored since it's not fatal continue } historySize += size } } usage := &fs.Usage{ Total: fs.NewUsageValue(iaItemMaxSize), Free: fs.NewUsageValue(iaItemMaxSize - result.ItemSize), Used: fs.NewUsageValue(result.ItemSize), Trashed: fs.NewUsageValue(historySize), // bytes in trash } return usage, nil } // Open an object for read func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) { var optionsFixed []fs.OpenOption for _, opt := range options { if optRange, ok := opt.(*fs.RangeOption); ok { // Ignore range option if file is empty if o.Size() == 0 && optRange.Start == 0 && optRange.End > 0 { continue } } optionsFixed = append(optionsFixed, opt) } var resp *http.Response // make a GET request to (frontend)/download/:item/:path opts := rest.Opts{ Method: "GET", Path: path.Join("/download/", o.fs.root, quotePath(o.fs.opt.Enc.FromStandardPath(o.remote))), Options: optionsFixed, } err = o.fs.pacer.Call(func() (bool, error) { resp, err = o.fs.front.Call(ctx, &opts) return o.fs.shouldRetry(resp, err) }) if err != nil { return nil, err } return resp.Body, nil } // Update the Object from in with modTime and size func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (err error) { bucket, bucketPath := o.split() modTime := src.ModTime(ctx) size := src.Size() updateTracker := random.String(32) // Set the mtime in the metadata // internetarchive backend builds at header level as IAS3 has extension outside X-Amz- headers := map[string]string{ // https://github.com/jjjake/internetarchive/blob/2456376533251df9d05e0a14d796ec1ced4959f5/internetarchive/iarequest.py#L158 "x-amz-filemeta-rclone-mtime": modTime.Format(time.RFC3339Nano), "x-amz-filemeta-rclone-update-track": updateTracker, // we add some more headers for intuitive actions "x-amz-auto-make-bucket": "1", // create an item if does not exist, do nothing if already "x-archive-auto-make-bucket": "1", // same as above in IAS3 original way "x-archive-keep-old-version": "0", // do not keep old versions (a.k.a. trashes in other clouds) "x-archive-meta-mediatype": "data", // mark media type of the uploading file as "data" "x-archive-queue-derive": "0", // skip derivation process (e.g. encoding to smaller files, OCR on PDFs) "x-archive-cascade-delete": "1", // enable "cascate delete" (delete all derived files in addition to the file itself) } if size >= 0 { headers["Content-Length"] = fmt.Sprintf("%d", size) headers["x-archive-size-hint"] = fmt.Sprintf("%d", size) } var mdata fs.Metadata mdata, err = fs.GetMetadataOptions(ctx, src, options) if err == nil && mdata != nil { for mk, mv := range mdata { mk = strings.ToLower(mk) if strings.HasPrefix(mk, "rclone-") { fs.LogPrintf(fs.LogLevelWarning, o, "reserved metadata key %s is about to set", mk) } else if _, ok := roMetadataKey[mk]; ok { fs.LogPrintf(fs.LogLevelWarning, o, "setting or modifying read-only key %s is requested, skipping", mk) continue } else if mk == "mtime" { // redirect to make it work mk = "rclone-mtime" } headers[fmt.Sprintf("x-amz-filemeta-%s", mk)] = mv } } // read the md5sum if available var md5sumHex string if !o.fs.opt.DisableChecksum { md5sumHex, err = src.Hash(ctx, hash.MD5) if err == nil && matchMd5.MatchString(md5sumHex) { // Set the md5sum in header on the object if // the user wants it // https://github.com/jjjake/internetarchive/blob/245637653/internetarchive/item.py#L969 headers["Content-MD5"] = md5sumHex } } // make a PUT request at (IAS3)/encoded(:item/:path) var resp *http.Response opts := rest.Opts{ Method: "PUT", Path: "/" + url.PathEscape(path.Join(bucket, bucketPath)), Body: in, ContentLength: &size, ExtraHeaders: headers, } err = o.fs.pacer.Call(func() (bool, error) { resp, err = o.fs.srv.Call(ctx, &opts) return o.fs.shouldRetry(resp, err) }) // we can't update/find metadata here as IA will "ingest" uploaded file(s) // upon uploads. (you can find its progress at https://archive.org/history/ItemNameHere ) // or we have to wait for finish? (needs polling (frontend)/metadata/:item or scraping (frontend)/history/:item) var newObj *Object if err == nil { newObj, err = o.fs.waitFileUpload(ctx, o.remote, updateTracker, size) } else { newObj = &Object{} } o.crc32 = newObj.crc32 o.md5 = newObj.md5 o.sha1 = newObj.sha1 o.modTime = newObj.modTime o.size = newObj.size return err } // Remove an object func (o *Object) Remove(ctx context.Context) (err error) { bucket, bucketPath := o.split() // make a DELETE request at (IAS3)/:item/:path var resp *http.Response opts := rest.Opts{ Method: "DELETE", Path: "/" + url.PathEscape(path.Join(bucket, bucketPath)), } err = o.fs.pacer.Call(func() (bool, error) { resp, err = o.fs.srv.Call(ctx, &opts) return o.fs.shouldRetry(resp, err) }) // deleting files can take bit longer as // it'll be processed on same queue as uploads if err == nil { err = o.fs.waitDelete(ctx, bucket, bucketPath) } return err } // String converts this Fs to a string func (o *Object) String() string { if o == nil { return "" } return o.remote } // Metadata returns all file metadata provided by Internet Archive func (o *Object) Metadata(ctx context.Context) (m fs.Metadata, err error) { if o.rawData == nil { return nil, nil } raw := make(map[string]json.RawMessage) err = json.Unmarshal(o.rawData, &raw) if err != nil { // fatal: json parsing failed return } for k, v := range raw { items, err := listOrString(v) if len(items) == 0 || err != nil { // skip: an entry failed to parse continue } m.Set(k, items[0]) } // move the old mtime to an another key if v, ok := m["mtime"]; ok { m["rclone-ia-mtime"] = v } // overwrite with a correct mtime m["mtime"] = o.modTime.Format(time.RFC3339Nano) return } func (f *Fs) shouldRetry(resp *http.Response, err error) (bool, error) { if resp != nil { for _, e := range retryErrorCodes { if resp.StatusCode == e { return true, err } } } // Ok, not an awserr, check for generic failure conditions return fserrors.ShouldRetry(err), err } var matchMd5 = regexp.MustCompile(`^[0-9a-f]{32}$`) // split returns bucket and bucketPath from the rootRelativePath // relative to f.root func (f *Fs) split(rootRelativePath string) (bucketName, bucketPath string) { bucketName, bucketPath = bucket.Split(path.Join(f.root, rootRelativePath)) return f.opt.Enc.FromStandardName(bucketName), f.opt.Enc.FromStandardPath(bucketPath) } // split returns bucket and bucketPath from the object func (o *Object) split() (bucket, bucketPath string) { return o.fs.split(o.remote) } func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result *MetadataResponse, err error) { var resp *http.Response // make a GET request to (frontend)/metadata/:item/ opts := rest.Opts{ Method: "GET", Path: path.Join("/metadata/", bucket), } var temp MetadataResponseRaw err = f.pacer.Call(func() (bool, error) { resp, err = f.front.CallJSON(ctx, &opts, nil, &temp) return f.shouldRetry(resp, err) }) if err != nil { return } return temp.unraw() } // list up all files/directories without any filters func (f *Fs) listAllUnconstrained(ctx context.Context, bucket string) (entries fs.DirEntries, err error) { result, err := f.requestMetadata(ctx, bucket) if err != nil { return nil, err } knownDirs := map[string]time.Time{ "": time.Unix(0, 0), } for _, file := range result.Files { dir := strings.Trim(betterPathDir(file.Name), "/") nameWithBucket := path.Join(bucket, file.Name) mtimeTime := file.parseMtime() // populate children directories child := dir for { if _, ok := knownDirs[child]; ok { break } // directory d := fs.NewDir(f.opt.Enc.ToStandardPath(path.Join(bucket, child)), mtimeTime) entries = append(entries, d) knownDirs[child] = mtimeTime child = strings.Trim(betterPathDir(child), "/") } if _, ok := knownDirs[betterPathDir(file.Name)]; !ok { continue } size := parseSize(file.Size) o := makeValidObject(f, f.opt.Enc.ToStandardPath(nameWithBucket), file, mtimeTime, size) entries = append(entries, o) } return entries, nil } func (f *Fs) waitFileUpload(ctx context.Context, reqPath, tracker string, newSize int64) (ret *Object, err error) { bucket, bucketPath := f.split(reqPath) ret = &Object{ fs: f, remote: trimPathPrefix(path.Join(bucket, bucketPath), f.root, f.opt.Enc), modTime: time.Unix(0, 0), size: -1, } if f.opt.WaitArchive == 0 { // user doesn't want to poll, let's not ret2, err := f.NewObject(ctx, reqPath) if err == nil { ret2, ok := ret2.(*Object) if ok { ret = ret2 ret.crc32 = "" ret.md5 = "" ret.sha1 = "" ret.size = -1 } } return ret, nil } retC := make(chan struct { *Object error }, 1) go func() { isFirstTime := true existed := false for { if !isFirstTime { // depending on the queue, it takes time time.Sleep(10 * time.Second) } metadata, err := f.requestMetadata(ctx, bucket) if err != nil { retC <- struct { *Object error }{ret, err} return } var iaFile *IAFile for _, f := range metadata.Files { if f.Name == bucketPath { iaFile = &f break } } if isFirstTime { isFirstTime = false existed = iaFile != nil } if iaFile == nil { continue } if !existed && !isFirstTime { // fast path: file wasn't exited before retC <- struct { *Object error }{makeValidObject2(f, *iaFile, bucket), nil} return } fileTrackers, _ := listOrString(iaFile.UpdateTrack) trackerMatch := false for _, v := range fileTrackers { if v == tracker { trackerMatch = true break } } if !trackerMatch { continue } if !compareSize(parseSize(iaFile.Size), newSize) { continue } // voila! retC <- struct { *Object error }{makeValidObject2(f, *iaFile, bucket), nil} return } }() select { case res := <-retC: return res.Object, res.error case <-time.After(time.Duration(f.opt.WaitArchive)): return ret, nil } } func (f *Fs) waitDelete(ctx context.Context, bucket, bucketPath string) (err error) { if f.opt.WaitArchive == 0 { // user doesn't want to poll, let's not return nil } retC := make(chan error, 1) go func() { for { metadata, err := f.requestMetadata(ctx, bucket) if err != nil { retC <- err return } found := false for _, f := range metadata.Files { if f.Name == bucketPath { found = true break } } if !found { retC <- nil return } // depending on the queue, it takes time time.Sleep(10 * time.Second) } }() select { case res := <-retC: return res case <-time.After(time.Duration(f.opt.WaitArchive)): return nil } } func makeValidObject(f *Fs, remote string, file IAFile, mtime time.Time, size int64) *Object { ret := &Object{ fs: f, remote: remote, modTime: mtime, size: size, rawData: file.rawData, } // hashes from _files.xml (where summation != "") is different from one in other files // https://forum.rclone.org/t/internet-archive-md5-tag-in-id-files-xml-interpreted-incorrectly/31922 if file.Summation == "" { ret.md5 = file.Md5 ret.crc32 = file.Crc32 ret.sha1 = file.Sha1 } return ret } func makeValidObject2(f *Fs, file IAFile, bucket string) *Object { mtimeTime := file.parseMtime() size := parseSize(file.Size) return makeValidObject(f, trimPathPrefix(path.Join(bucket, file.Name), f.root, f.opt.Enc), file, mtimeTime, size) } func listOrString(jm json.RawMessage) (rmArray []string, err error) { // rclone-metadata can be an array or string // try to deserialize it as array first err = json.Unmarshal(jm, &rmArray) if err != nil { // if not, it's a string dst := new(string) err = json.Unmarshal(jm, dst) if err == nil { rmArray = []string{*dst} } } return } func (file IAFile) parseMtime() (mtime time.Time) { // method 1: use metadata added by rclone rmArray, err := listOrString(file.RcloneMtime) // let's take the first value we can deserialize for _, value := range rmArray { mtime, err = time.Parse(time.RFC3339Nano, value) if err == nil { break } } if err != nil { // method 2: use metadata added by IAS3 mtime, err = swift.FloatStringToTime(file.Mtime) } if err != nil { // metadata files don't have some of the fields mtime = time.Unix(0, 0) } return mtime } func (mrr *MetadataResponseRaw) unraw() (_ *MetadataResponse, err error) { var files []IAFile for _, raw := range mrr.Files { var parsed IAFile err = json.Unmarshal(raw, &parsed) if err != nil { return nil, err } parsed.rawData = raw files = append(files, parsed) } return &MetadataResponse{ Files: files, ItemSize: mrr.ItemSize, }, nil } func compareSize(a, b int64) bool { if a < 0 || b < 0 { // we won't compare if any of them is not known return true } return a == b } func parseSize(str string) int64 { size, err := strconv.ParseInt(str, 10, 64) if err != nil { size = -1 } return size } func betterPathDir(p string) string { d := path.Dir(p) if d == "." { return "" } return d } func betterPathClean(p string) string { d := path.Clean(p) if d == "." { return "" } return d } func trimPathPrefix(s, prefix string, enc encoder.MultiEncoder) string { // we need to clean the paths to make tests pass! s = betterPathClean(s) prefix = betterPathClean(prefix) if s == prefix || s == prefix+"/" { return "" } prefix = enc.ToStandardPath(strings.TrimRight(prefix, "/")) return enc.ToStandardPath(strings.TrimPrefix(s, prefix+"/")) } // mimics urllib.parse.quote() on Python; exclude / from url.PathEscape func quotePath(s string) string { seg := strings.Split(s, "/") newValues := []string{} for _, v := range seg { newValues = append(newValues, url.PathEscape(v)) } return strings.Join(newValues, "/") } var ( _ fs.Fs = &Fs{} _ fs.Copier = &Fs{} _ fs.ListRer = &Fs{} _ fs.CleanUpper = &Fs{} _ fs.PublicLinker = &Fs{} _ fs.Abouter = &Fs{} _ fs.Object = &Object{} _ fs.Metadataer = &Object{} )