Merge branch '17507-listobjectsv2'
author: Tom Clegg <tom@curii.com>
Mon, 26 Apr 2021 14:14:15 +0000 (10:14 -0400)
committer: Tom Clegg <tom@curii.com>
Mon, 26 Apr 2021 14:14:15 +0000 (10:14 -0400)
fixes #17507

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

sdk/go/arvados/fs_lookup.go
services/keep-web/s3.go
services/keep-web/s3_test.go

index 56b5953234784424e51676a90b4c148661cb8c4d..021e8241cfeda647334781499ac60396e0961ac1 100644 (file)
@@ -50,9 +50,11 @@ func (ln *lookupnode) Readdir() ([]os.FileInfo, error) {
                        return nil, err
                }
                for _, child := range all {
+                       ln.treenode.Lock()
                        _, err = ln.treenode.Child(child.FileInfo().Name(), func(inode) (inode, error) {
                                return child, nil
                        })
+                       ln.treenode.Unlock()
                        if err != nil {
                                return nil, err
                        }
index 620a21b883aa428748a8d4116abca74f8fe12c10..f03ff01b8127d6ee1d1056ee72bee5bd6507d26b 100644 (file)
@@ -7,6 +7,7 @@ package main
 import (
        "crypto/hmac"
        "crypto/sha256"
+       "encoding/base64"
        "encoding/xml"
        "errors"
        "fmt"
@@ -33,6 +34,42 @@ const (
        s3MaxClockSkew  = 5 * time.Minute
 )
 
+type commonPrefix struct { // one CommonPrefixes entry in a list response (used by listV1Resp and listV2Resp)
+       Prefix string
+}
+
+type listV1Resp struct { // XML body s3list sends when the request is not list-type=2
+       XMLName string `xml:"http://s3.amazonaws.com/doc/2006-03-01/ ListBucketResult"`
+       s3.ListResp
+       // s3.ListResp marshals an empty tag when
+       // CommonPrefixes is nil, which confuses some clients.
+       // Fix by using this nested struct instead.
+       CommonPrefixes []commonPrefix
+       // Similarly, we need omitempty here, because an empty
+       // tag confuses some clients (e.g.,
+       // github.com/aws/aws-sdk-net never terminates its
+       // paging loop).
+       NextMarker string `xml:"NextMarker,omitempty"`
+       // KeyCount is a ListObjectsV2 response field; s3list fills it in for V1 responses as well.
+       KeyCount int
+}
+
+type listV2Resp struct { // XML body for list-type=2 requests; s3list also accumulates V1 results in it before converting
+       XMLName               string `xml:"http://s3.amazonaws.com/doc/2006-03-01/ ListBucketResult"`
+       IsTruncated           bool
+       Contents              []s3.Key
+       Name                  string
+       Prefix                string
+       Delimiter             string
+       MaxKeys               int
+       CommonPrefixes        []commonPrefix
+       EncodingType          string `xml:",omitempty"` // set to "url" by s3list when the request had encoding-type=url
+       KeyCount              int
+       ContinuationToken     string `xml:",omitempty"` // echoed from the request; tag omitted when empty
+       NextContinuationToken string `xml:",omitempty"` // base64 of the next marker path; tag omitted when empty
+       StartAfter            string `xml:",omitempty"` // echoed from the request; tag omitted when empty
+}
+
 func hmacstring(msg string, key []byte) []byte {
        h := hmac.New(sha256.New, key)
        io.WriteString(h, msg)
@@ -559,19 +596,50 @@ var errDone = errors.New("done")
 
 func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request, fs arvados.CustomFileSystem) {
        var params struct {
-               delimiter string
-               marker    string
-               maxKeys   int
-               prefix    string
+               v2                bool
+               delimiter         string
+               maxKeys           int
+               prefix            string
+               marker            string // decoded continuationToken (v2) or provided by client (v1)
+               startAfter        string // v2
+               continuationToken string // v2
+               encodingTypeURL   bool   // v2
        }
        params.delimiter = r.FormValue("delimiter")
-       params.marker = r.FormValue("marker")
        if mk, _ := strconv.ParseInt(r.FormValue("max-keys"), 10, 64); mk > 0 && mk < s3MaxKeys {
                params.maxKeys = int(mk)
        } else {
                params.maxKeys = s3MaxKeys
        }
        params.prefix = r.FormValue("prefix")
+       switch r.FormValue("list-type") {
+       case "":
+       case "2":
+               params.v2 = true
+       default:
+               http.Error(w, "invalid list-type parameter", http.StatusBadRequest)
+               return
+       }
+       if params.v2 {
+               params.continuationToken = r.FormValue("continuation-token")
+               marker, err := base64.StdEncoding.DecodeString(params.continuationToken)
+               if err != nil {
+                       http.Error(w, "invalid continuation token", http.StatusBadRequest)
+                       return
+               }
+               params.marker = string(marker)
+               params.startAfter = r.FormValue("start-after")
+               switch r.FormValue("encoding-type") {
+               case "":
+               case "url":
+                       params.encodingTypeURL = true
+               default:
+                       http.Error(w, "invalid encoding-type parameter", http.StatusBadRequest)
+                       return
+               }
+       } else {
+               params.marker = r.FormValue("marker")
+       }
 
        bucketdir := "by_id/" + bucket
        // walkpath is the directory (relative to bucketdir) we need
@@ -588,33 +656,16 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
                walkpath = ""
        }
 
-       type commonPrefix struct {
-               Prefix string
-       }
-       type listResp struct {
-               XMLName string `xml:"http://s3.amazonaws.com/doc/2006-03-01/ ListBucketResult"`
-               s3.ListResp
-               // s3.ListResp marshals an empty tag when
-               // CommonPrefixes is nil, which confuses some clients.
-               // Fix by using this nested struct instead.
-               CommonPrefixes []commonPrefix
-               // Similarly, we need omitempty here, because an empty
-               // tag confuses some clients (e.g.,
-               // github.com/aws/aws-sdk-net never terminates its
-               // paging loop).
-               NextMarker string `xml:"NextMarker,omitempty"`
-               // ListObjectsV2 has a KeyCount response field.
-               KeyCount int
-       }
-       resp := listResp{
-               ListResp: s3.ListResp{
-                       Name:      bucket,
-                       Prefix:    params.prefix,
-                       Delimiter: params.delimiter,
-                       Marker:    params.marker,
-                       MaxKeys:   params.maxKeys,
-               },
-       }
+       resp := listV2Resp{
+               Name:              bucket,
+               Prefix:            params.prefix,
+               Delimiter:         params.delimiter,
+               MaxKeys:           params.maxKeys,
+               ContinuationToken: r.FormValue("continuation-token"),
+               StartAfter:        params.startAfter,
+       }
+       nextMarker := ""
+
        commonPrefixes := map[string]bool{}
        err := walkFS(fs, strings.TrimSuffix(bucketdir+"/"+walkpath, "/"), true, func(path string, fi os.FileInfo) error {
                if path == bucketdir {
@@ -654,7 +705,7 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
                                return errDone
                        }
                }
-               if path < params.marker || path < params.prefix {
+               if path < params.marker || path < params.prefix || path <= params.startAfter {
                        return nil
                }
                if fi.IsDir() && !h.Config.cluster.Collections.S3FolderObjects {
@@ -665,6 +716,13 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
                        // finding a regular file inside it.
                        return nil
                }
+               if len(resp.Contents)+len(commonPrefixes) >= params.maxKeys {
+                       resp.IsTruncated = true
+                       if params.delimiter != "" || params.v2 {
+                               nextMarker = path
+                       }
+                       return errDone
+               }
                if params.delimiter != "" {
                        idx := strings.Index(path[len(params.prefix):], params.delimiter)
                        if idx >= 0 {
@@ -676,13 +734,6 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
                                return filepath.SkipDir
                        }
                }
-               if len(resp.Contents)+len(commonPrefixes) >= params.maxKeys {
-                       resp.IsTruncated = true
-                       if params.delimiter != "" {
-                               resp.NextMarker = path
-                       }
-                       return errDone
-               }
                resp.Contents = append(resp.Contents, s3.Key{
                        Key:          path,
                        LastModified: fi.ModTime().UTC().Format("2006-01-02T15:04:05.999") + "Z",
@@ -702,9 +753,66 @@ func (h *handler) s3list(bucket string, w http.ResponseWriter, r *http.Request,
                sort.Slice(resp.CommonPrefixes, func(i, j int) bool { return resp.CommonPrefixes[i].Prefix < resp.CommonPrefixes[j].Prefix })
        }
        resp.KeyCount = len(resp.Contents)
+       var respV1orV2 interface{}
+
+       if params.encodingTypeURL {
+               // https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html
+               // "If you specify the encoding-type request
+               // parameter, Amazon S3 includes this element in the
+               // response, and returns encoded key name values in
+               // the following response elements:
+               //
+               // Delimiter, Prefix, Key, and StartAfter.
+               //
+               //      Type: String
+               //
+               // Valid Values: url"
+               //
+               // This is somewhat vague but in practice it appears
+               // to mean x-www-form-urlencoded as in RFC1866 8.2.1
+               // para 1 (encode space as "+") rather than straight
+               // percent-encoding as in RFC1738 2.2.  Presumably,
+               // the intent is to allow the client to decode XML and
+               // then paste the strings directly into another URI
+               // query or POST form like "https://host/path?foo=" +
+               // foo + "&bar=" + bar.
+               resp.EncodingType = "url"
+               resp.Delimiter = url.QueryEscape(resp.Delimiter)
+               resp.Prefix = url.QueryEscape(resp.Prefix)
+               resp.StartAfter = url.QueryEscape(resp.StartAfter)
+               for i, ent := range resp.Contents {
+                       ent.Key = url.QueryEscape(ent.Key)
+                       resp.Contents[i] = ent
+               }
+               for i, ent := range resp.CommonPrefixes {
+                       ent.Prefix = url.QueryEscape(ent.Prefix)
+                       resp.CommonPrefixes[i] = ent
+               }
+       }
+
+       if params.v2 {
+               resp.NextContinuationToken = base64.StdEncoding.EncodeToString([]byte(nextMarker))
+               respV1orV2 = resp
+       } else {
+               respV1orV2 = listV1Resp{
+                       CommonPrefixes: resp.CommonPrefixes,
+                       NextMarker:     nextMarker,
+                       KeyCount:       resp.KeyCount,
+                       ListResp: s3.ListResp{
+                               IsTruncated: resp.IsTruncated,
+                               Name:        bucket,
+                               Prefix:      params.prefix,
+                               Delimiter:   params.delimiter,
+                               Marker:      params.marker,
+                               MaxKeys:     params.maxKeys,
+                               Contents:    resp.Contents,
+                       },
+               }
+       }
+
        w.Header().Set("Content-Type", "application/xml")
        io.WriteString(w, xml.Header)
-       if err := xml.NewEncoder(w).Encode(resp); err != nil {
+       if err := xml.NewEncoder(w).Encode(respV1orV2); err != nil {
                ctxlog.FromContext(r.Context()).WithError(err).Error("error writing xml response")
        }
 }
index e60b55c935779aefeb30a2e57e2670def7c2ff83..4f70168b5629ecfbdb06193d75344cb4e27673eb 100644 (file)
@@ -6,6 +6,7 @@ package main
 
 import (
        "bytes"
+       "context"
        "crypto/rand"
        "crypto/sha256"
        "fmt"
@@ -25,6 +26,10 @@ import (
        "git.arvados.org/arvados.git/sdk/go/keepclient"
        "github.com/AdRoll/goamz/aws"
        "github.com/AdRoll/goamz/s3"
+       aws_aws "github.com/aws/aws-sdk-go/aws"
+       aws_credentials "github.com/aws/aws-sdk-go/aws/credentials"
+       aws_session "github.com/aws/aws-sdk-go/aws/session"
+       aws_s3 "github.com/aws/aws-sdk-go/service/s3"
        check "gopkg.in/check.v1"
 )
 
@@ -886,6 +891,196 @@ func (s *IntegrationSuite) testS3CollectionListRollup(c *check.C) {
        }
 }
 
+func (s *IntegrationSuite) TestS3ListObjectsV2(c *check.C) { // pages through ListObjectsV2 results using the aws-sdk-go client
+       stage := s.s3setup(c)
+       defer stage.teardown(c)
+       dirs := 2
+       filesPerDir := 40
+       stage.writeBigDirs(c, dirs, filesPerDir)
+
+       sess := aws_session.Must(aws_session.NewSession(&aws_aws.Config{
+               Region:           aws_aws.String("auto"),
+               Endpoint:         aws_aws.String("http://" + s.testServer.Addr),
+               Credentials:      aws_credentials.NewStaticCredentials(url.QueryEscape(arvadostest.ActiveTokenV2), url.QueryEscape(arvadostest.ActiveTokenV2), ""),
+               S3ForcePathStyle: aws_aws.Bool(true),
+       }))
+
+       stringOrNil := func(s string) *string { // maps "" to nil so the SDK omits the request parameter
+               if s == "" {
+                       return nil
+               } else {
+                       return &s
+               }
+       }
+
+       client := aws_s3.New(sess)
+       ctx := context.Background()
+
+       for _, trial := range []struct {
+               prefix               string
+               delimiter            string
+               startAfter           string
+               maxKeys              int
+               expectKeys           int             // total distinct keys seen across all pages
+               expectCommonPrefixes map[string]bool // distinct common prefixes seen across all pages
+       }{
+               {
+                       // Expect {filesPerDir plus the dir itself}
+                       // for each dir, plus emptydir, emptyfile, and
+                       // sailboat.txt.
+                       expectKeys: (filesPerDir+1)*dirs + 3,
+               },
+               {
+                       maxKeys:    15, // same listing, fetched 15 entries per page
+                       expectKeys: (filesPerDir+1)*dirs + 3,
+               },
+               {
+                       startAfter: "dir0/z",
+                       maxKeys:    15,
+                       // Expect {filesPerDir plus the dir itself}
+                       // for each dir except dir0, plus emptydir,
+                       // emptyfile, and sailboat.txt.
+                       expectKeys: (filesPerDir+1)*(dirs-1) + 3,
+               },
+               {
+                       maxKeys:              1,
+                       delimiter:            "/",
+                       expectKeys:           2, // emptyfile, sailboat.txt
+                       expectCommonPrefixes: map[string]bool{"dir0/": true, "dir1/": true, "emptydir/": true},
+               },
+               {
+                       startAfter:           "dir0/z",
+                       maxKeys:              15,
+                       delimiter:            "/",
+                       expectKeys:           2, // emptyfile, sailboat.txt
+                       expectCommonPrefixes: map[string]bool{"dir1/": true, "emptydir/": true},
+               },
+               {
+                       startAfter:           "dir0/file10.txt",
+                       maxKeys:              15,
+                       delimiter:            "/",
+                       expectKeys:           2,
+                       expectCommonPrefixes: map[string]bool{"dir0/": true, "dir1/": true, "emptydir/": true},
+               },
+               {
+                       startAfter:           "dir0/file10.txt",
+                       maxKeys:              15,
+                       prefix:               "d",
+                       delimiter:            "/",
+                       expectKeys:           0,
+                       expectCommonPrefixes: map[string]bool{"dir0/": true, "dir1/": true},
+               },
+       } {
+               c.Logf("[trial %+v]", trial)
+               params := aws_s3.ListObjectsV2Input{
+                       Bucket:     aws_aws.String(stage.collbucket.Name),
+                       Prefix:     stringOrNil(trial.prefix),
+                       Delimiter:  stringOrNil(trial.delimiter),
+                       StartAfter: stringOrNil(trial.startAfter),
+                       MaxKeys:    aws_aws.Int64(int64(trial.maxKeys)),
+               }
+               keySeen := map[string]bool{}
+               prefixSeen := map[string]bool{}
+               for {
+                       result, err := client.ListObjectsV2WithContext(ctx, &params)
+                       if !c.Check(err, check.IsNil) {
+                               break
+                       }
+                       c.Check(result.Name, check.DeepEquals, aws_aws.String(stage.collbucket.Name))
+                       c.Check(result.Prefix, check.DeepEquals, aws_aws.String(trial.prefix))
+                       c.Check(result.Delimiter, check.DeepEquals, aws_aws.String(trial.delimiter))
+                       // The following two fields are expected to be
+                       // nil (i.e., no tag in XML response) rather
+                       // than "" when the corresponding request
+                       // field was empty or nil.
+                       c.Check(result.StartAfter, check.DeepEquals, stringOrNil(trial.startAfter))
+                       c.Check(result.ContinuationToken, check.DeepEquals, params.ContinuationToken)
+
+                       if trial.maxKeys > 0 {
+                               c.Check(result.MaxKeys, check.DeepEquals, aws_aws.Int64(int64(trial.maxKeys)))
+                               c.Check(len(result.Contents)+len(result.CommonPrefixes) <= trial.maxKeys, check.Equals, true)
+                       } else {
+                               c.Check(result.MaxKeys, check.DeepEquals, aws_aws.Int64(int64(s3MaxKeys)))
+                       }
+
+                       for _, ent := range result.Contents {
+                               c.Assert(ent.Key, check.NotNil)
+                               c.Check(*ent.Key > trial.startAfter, check.Equals, true)
+                               c.Check(keySeen[*ent.Key], check.Equals, false, check.Commentf("dup key %q", *ent.Key))
+                               keySeen[*ent.Key] = true
+                       }
+                       for _, ent := range result.CommonPrefixes {
+                               c.Assert(ent.Prefix, check.NotNil)
+                               c.Check(strings.HasSuffix(*ent.Prefix, trial.delimiter), check.Equals, true, check.Commentf("bad CommonPrefix %q", *ent.Prefix))
+                               if strings.HasPrefix(trial.startAfter, *ent.Prefix) {
+                                       // If we asked for
+                                       // startAfter=dir0/file10.txt,
+                                       // we expect dir0/ to be
+                                       // returned as a common prefix
+                               } else {
+                                       c.Check(*ent.Prefix > trial.startAfter, check.Equals, true)
+                               }
+                               c.Check(prefixSeen[*ent.Prefix], check.Equals, false, check.Commentf("dup common prefix %q", *ent.Prefix))
+                               prefixSeen[*ent.Prefix] = true
+                       }
+                       if *result.IsTruncated && c.Check(result.NextContinuationToken, check.NotNil) && c.Check(*result.NextContinuationToken, check.Not(check.Equals), "") { // nil-check first: comparing the *string itself with "" can never be equal, so it would always pass
+                               params.ContinuationToken = aws_aws.String(*result.NextContinuationToken) // resume the next page from the server-provided token
+                       } else {
+                               break
+                       }
+               }
+               c.Check(keySeen, check.HasLen, trial.expectKeys)
+               c.Check(prefixSeen, check.HasLen, len(trial.expectCommonPrefixes))
+               if len(trial.expectCommonPrefixes) > 0 {
+                       c.Check(prefixSeen, check.DeepEquals, trial.expectCommonPrefixes)
+               }
+       }
+}
+
+func (s *IntegrationSuite) TestS3ListObjectsV2EncodingTypeURL(c *check.C) { // checks that encoding-type=url responses come back URL-encoded
+       stage := s.s3setup(c)
+       defer stage.teardown(c)
+       dirs := 2
+       filesPerDir := 40
+       stage.writeBigDirs(c, dirs, filesPerDir)
+
+       sess := aws_session.Must(aws_session.NewSession(&aws_aws.Config{
+               Region:           aws_aws.String("auto"),
+               Endpoint:         aws_aws.String("http://" + s.testServer.Addr),
+               Credentials:      aws_credentials.NewStaticCredentials(url.QueryEscape(arvadostest.ActiveTokenV2), url.QueryEscape(arvadostest.ActiveTokenV2), ""),
+               S3ForcePathStyle: aws_aws.Bool(true),
+       }))
+
+       client := aws_s3.New(sess)
+       ctx := context.Background()
+
+       result, err := client.ListObjectsV2WithContext(ctx, &aws_s3.ListObjectsV2Input{
+               Bucket:       aws_aws.String(stage.collbucket.Name),
+               Prefix:       aws_aws.String("dir0/"),
+               Delimiter:    aws_aws.String("/"),
+               StartAfter:   aws_aws.String("dir0/"),
+               EncodingType: aws_aws.String("url"),
+       })
+       c.Assert(err, check.IsNil)
+       c.Check(*result.Prefix, check.Equals, "dir0%2F") // s3list applies url.QueryEscape, so "/" is returned as "%2F"
+       c.Check(*result.Delimiter, check.Equals, "%2F")
+       c.Check(*result.StartAfter, check.Equals, "dir0%2F")
+       for _, ent := range result.Contents {
+               c.Check(*ent.Key, check.Matches, "dir0%2F.*")
+       }
+       result, err = client.ListObjectsV2WithContext(ctx, &aws_s3.ListObjectsV2Input{ // second request: no prefix, so top-level dirs roll up into CommonPrefixes
+               Bucket:       aws_aws.String(stage.collbucket.Name),
+               Delimiter:    aws_aws.String("/"),
+               EncodingType: aws_aws.String("url"),
+       })
+       c.Assert(err, check.IsNil)
+       c.Check(*result.Delimiter, check.Equals, "%2F")
+       c.Check(result.CommonPrefixes, check.HasLen, dirs+1) // presumably the writeBigDirs dirs plus emptydir — confirm against s3setup
+       for _, ent := range result.CommonPrefixes {
+               c.Check(*ent.Prefix, check.Matches, ".*%2F")
+       }
+}
+
 // TestS3cmd checks compatibility with the s3cmd command line tool, if
 // it's installed. As of Debian buster, s3cmd is only in backports, so
 // `arvados-server install` don't install it, and this test skips if