5824: Merge branch 'master' into 5824-keep-web
authorTom Clegg <tom@curoverse.com>
Thu, 29 Oct 2015 20:31:37 +0000 (16:31 -0400)
committerTom Clegg <tom@curoverse.com>
Thu, 29 Oct 2015 20:31:37 +0000 (16:31 -0400)
23 files changed:
doc/_config.yml
doc/install/install-keep-web.html.textile.liquid [new file with mode: 0644]
doc/install/install-keepproxy.html.textile.liquid
sdk/go/arvadostest/fixtures.go [new file with mode: 0644]
sdk/go/auth/auth.go
sdk/go/keepclient/collectionreader.go [new file with mode: 0644]
sdk/go/keepclient/collectionreader_test.go [new file with mode: 0644]
sdk/go/keepclient/keepclient.go
sdk/go/keepclient/support.go
sdk/go/manifest/manifest.go
sdk/go/manifest/manifest_test.go
services/api/test/fixtures/collections.yml
services/crunchstat/crunchstat_test.go
services/datamanager/keep/keep.go
services/keep-web/.gitignore [new file with mode: 0644]
services/keep-web/doc.go [new file with mode: 0644]
services/keep-web/handler.go [new file with mode: 0644]
services/keep-web/handler_test.go [new file with mode: 0644]
services/keep-web/main.go [new file with mode: 0644]
services/keep-web/server.go [new file with mode: 0644]
services/keep-web/server_test.go [new file with mode: 0644]
services/keepproxy/keepproxy.go
services/keepproxy/keepproxy_test.go

index 75cb997d78b0fe0d80d869023185a11dbfa0bfbd..2f37f5af0803c360f2c0cce2d08389d64b21249f 100644 (file)
@@ -155,6 +155,7 @@ navbar:
       - install/install-keepstore.html.textile.liquid
       - install/configure-azure-blob-storage.html.textile.liquid
       - install/install-keepproxy.html.textile.liquid
+      #- install/install-keep-web.html.textile.liquid
       - install/install-crunch-dispatch.html.textile.liquid
       - install/install-compute-node.html.textile.liquid
     - Helpful hints:
diff --git a/doc/install/install-keep-web.html.textile.liquid b/doc/install/install-keep-web.html.textile.liquid
new file mode 100644 (file)
index 0000000..08e7af2
--- /dev/null
@@ -0,0 +1,115 @@
+---
+layout: default
+navsection: installguide
+title: Install the keep-web server
+...
+
+The keep-web server provides read-only HTTP access to files stored in Keep. It serves public data to unauthenticated clients, and serves private data to clients that supply Arvados API tokens. It can be installed anywhere with access to Keep services, typically behind a web proxy that provides SSL support. See the "godoc page":http://godoc.org/github.com/curoverse/arvados/services/keep-web for more detail.
+
+By convention, we use the following hostname for the keep-web service:
+
+<notextile>
+<pre><code>collections.<span class="userinput">uuid_prefix</span>.your.domain
+</code></pre>
+</notextile>
+
+This hostname should resolve from anywhere on the internet.
+
+h2. Install keep-web
+
+On Debian-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo apt-get install keep-web</span>
+</code></pre>
+</notextile>
+
+On Red Hat-based systems:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo yum install keep-web</span>
+</code></pre>
+</notextile>
+
+Verify that @keep-web@ is functional:
+
+<notextile>
+<pre><code>~$ <span class="userinput">keep-web -h</span>
+Usage of keep-web:
+  -address string
+        Address to listen on: "host:port", or ":port" to listen on all interfaces. (default ":80")
+  -anonymous-token value
+        API token to try when none of the tokens provided in an HTTP request succeed in reading the desired collection. If this flag is used more than once, each token will be attempted in turn until one works. (default [])
+</code></pre>
+</notextile>
+
+If you intend to use Keep-web to serve public data to anonymous clients, configure it with an anonymous token. You can use the same one you used when you set up your Keepproxy server, or use the following command on the <strong>API server</strong> to create another:
+
+<notextile>
+<pre><code>/var/www/arvados-api/current/script$ <span class="userinput">RAILS_ENV=production bundle exec ./get_anonymous_user_token.rb</span>
+hoShoomoo2bai3Ju1xahg6aeng1siquuaZ1yae2gi2Uhaeng2r
+</code></pre></notextile>
+
+We recommend running @keep-web@ under "runit":https://packages.debian.org/search?keywords=runit or a similar supervisor. The basic command to start @keep-web@ is:
+
+<notextile>
+<pre><code>export ARVADOS_API_HOST=<span class="userinput">uuid_prefix</span>.your.domain
+exec sudo -u nobody keep-web -address=<span class="userinput">:9002</span> -anonymous-token=<span class="userinput">hoShoomoo2bai3Ju1xahg6aeng1siquuaZ1yae2gi2Uhaeng2r</span> 2&gt;&amp;1
+</code></pre>
+</notextile>
+
+Omit the @-anonymous-token@ arguments if you do not want to serve public data.
+
+Set @ARVADOS_API_HOST_INSECURE=1@ if your API server's SSL certificate is not signed by a recognized CA.
+
+h3. Set up a reverse proxy with SSL support
+
+The keep-web service will be accessible from anywhere on the internet, so we recommend using SSL for transport encryption.
+
+This is best achieved by putting a reverse proxy with SSL support in front of keep-web, running on port 443 and passing requests to keep-web on port 9002 (or whatever port you chose in your run script).
+
+Note: A wildcard SSL certificate is required in order to proxy keep-web effectively.
+
+For example, using Nginx:
+
+<notextile><pre>
+upstream keep-web {
+  server                127.0.0.1:<span class="userinput">9002</span>;
+}
+
+server {
+  listen                <span class="userinput">[your public IP address]</span>:443 ssl;
+  server_name           collections.<span class="userinput">uuid_prefix</span>.your.domain *.collections.<span class="userinput">uuid_prefix</span>.your.domain ~.*--collections.<span class="userinput">uuid_prefix</span>.your.domain;
+
+  proxy_connect_timeout 90s;
+  proxy_read_timeout    300s;
+
+  ssl                   on;
+  ssl_certificate       <span class="userinput"/>YOUR/PATH/TO/cert.pem</span>;
+  ssl_certificate_key   <span class="userinput"/>YOUR/PATH/TO/cert.key</span>;
+
+  location / {
+    proxy_pass          http://keep-web;
+    proxy_set_header    Host            $host;
+    proxy_set_header    X-Forwarded-For $proxy_add_x_forwarded_for;
+  }
+}
+</pre></notextile>
+
+h3. Configure DNS
+
+Configure your DNS servers so the following names resolve to your Nginx proxy's public IP address.
+* @*--collections.uuid_prefix.your.domain@, if your DNS server allows this without interfering with other DNS names; or
+* @*.collections.uuid_prefix.your.domain@, if you have a wildcard SSL certificate valid for these names; or
+* @collections.uuid_prefix.your.domain@, if neither of the above options is feasible. In this case, only unauthenticated requests will be served, i.e., public data and collection sharing links.
+
+h3. Tell Workbench about the keep-web service
+
+Add *one* of the following entries to your Workbench configuration file (@/etc/arvados/workbench/application.yml@), depending on your DNS setup:
+
+<notextile>
+<pre><code>keep_web_url: https://%{uuid_or_pdh}--collections.<span class="userinput">uuid_prefix</span>.your.domain
+keep_web_url: https://%{uuid_or_pdh}.collections.<span class="userinput">uuid_prefix</span>.your.domain
+keep_web_url: https://collections.<span class="userinput">uuid_prefix</span>.your.domain
+</code></pre>
+</notextile>
index 6a531a37848d2c2eaa5af34e20fb63c32351f4bf..e6e2b103ae286317e2ba88910d5897adeee72068 100644 (file)
@@ -4,9 +4,12 @@ navsection: installguide
 title: Install Keepproxy server
 ...
 
-The Keepproxy server is a gateway into your Keep storage. Unlike the Keepstore servers, which are only accessible on the local LAN, Keepproxy is designed to provide secure access into Keep from anywhere on the internet.
+The Keepproxy server is a gateway into your Keep storage. Unlike the Keepstore servers, which are only accessible on the local LAN, Keepproxy is suitable for clients located elsewhere on the internet. Specifically, in contrast to Keepstore:
+* A client writing through Keepproxy generates less network traffic: the client sends a single copy of a data block, and Keepproxy sends copies to the appropriate Keepstore servers.
+* A client can write through Keepproxy without precomputing content hashes. Notably, the browser-based upload feature in Workbench requires Keepproxy.
+* Keepproxy checks API token validity before processing requests. (Clients that can connect directly to Keepstore can use it as scratch space even without a valid API token.)
 
-By convention, we use the following hostname for the Keepproxy:
+By convention, we use the following hostname for the Keepproxy server:
 
 <div class="offset1">
 table(table table-bordered table-condensed).
@@ -36,12 +39,13 @@ Verify that Keepproxy is functional:
 
 <notextile>
 <pre><code>~$ <span class="userinput">keepproxy -h</span>
-Usage of default:
+Usage of keepproxy:
   -default-replicas=2: Default number of replicas to write if not specified by the client.
   -listen=":25107": Interface on which to listen for requests, in the format ipaddr:port. e.g. -listen=10.0.1.24:8000. Use -listen=:port to listen on all network interfaces.
   -no-get=false: If set, disable GET operations
   -no-put=false: If set, disable PUT operations
   -pid="": Path to write pid file
+  -timeout=15: Timeout on requests to internal Keep services (default 15 seconds)
 </code></pre>
 </notextile>
 
diff --git a/sdk/go/arvadostest/fixtures.go b/sdk/go/arvadostest/fixtures.go
new file mode 100644 (file)
index 0000000..d0270a6
--- /dev/null
@@ -0,0 +1,24 @@
+package arvadostest
+
+// IDs of API server's test fixtures
+const (
+       SpectatorToken        = "zw2f4gwx8hw8cjre7yp6v1zylhrhn3m5gvjq73rtpwhmknrybu"
+       ActiveToken           = "3kg6k6lzmp9kj5cpkcoxie963cmvjahbt2fod9zru30k1jqdmi"
+       AnonymousToken        = "4kg6k6lzmp9kj4cpkcoxie964cmvjahbt4fod9zru44k4jqdmi"
+       FooCollection         = "zzzzz-4zz18-fy296fx3hot09f7"
+       NonexistentCollection = "zzzzz-4zz18-totallynotexist"
+       HelloWorldCollection  = "zzzzz-4zz18-4en62shvi99lxd4"
+       FooBarDirCollection   = "zzzzz-4zz18-foonbarfilesdir"
+       FooPdh                = "1f4b0bc7583c2a7f9102c395f4ffc5e3+45"
+       HelloWorldPdh         = "55713e6a34081eb03609e7ad5fcad129+62"
+)
+
+// A valid manifest designed to test various edge cases and parsing
+// requirements
+const PathologicalManifest = ". acbd18db4cc2f85cedef654fccc4a4d8+3 37b51d194a7513e45b56f6524f2d51f2+3 73feffa4b7f6bb68e44cf984c85f6e88+3+Z+K@xyzzy acbd18db4cc2f85cedef654fccc4a4d8+3 0:0:zero@0 0:1:f 1:0:zero@1 1:4:ooba 4:0:zero@4 5:1:r 5:4:rbaz 9:0:zero@9\n" +
+       "./overlapReverse acbd18db4cc2f85cedef654fccc4a4d8+3 acbd18db4cc2f85cedef654fccc4a4d8+3 5:1:o 4:2:oo 2:4:ofoo\n" +
+       "./segmented acbd18db4cc2f85cedef654fccc4a4d8+3 37b51d194a7513e45b56f6524f2d51f2+3 0:1:frob 5:1:frob 1:1:frob 1:2:oof 0:1:oof 5:0:frob 3:1:frob\n" +
+       `./foo\040b\141r acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:baz` + "\n" +
+       `./foo\040b\141r acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:b\141z\040w\141z` + "\n" +
+       "./foo acbd18db4cc2f85cedef654fccc4a4d8+3 0:0:zero 0:3:foo\n" +
+       ". acbd18db4cc2f85cedef654fccc4a4d8+3 0:0:foo/zero 0:3:foo/foo\n"
index 4a719e922dd8a98bdc88445f413ea88788b01a53..ca4eb948b1220c982ef25fa03c9175f40148a797 100644 (file)
@@ -1,6 +1,7 @@
 package auth
 
 import (
+       "encoding/base64"
        "net/http"
        "net/url"
        "strings"
@@ -20,6 +21,15 @@ func NewCredentialsFromHTTPRequest(r *http.Request) *Credentials {
        return c
 }
 
+// EncodeTokenCookie accepts a token and returns a byte slice suitable
+// for use as a cookie value, such that it will be decoded correctly
+// by LoadTokensFromHTTPRequest.
+var EncodeTokenCookie func([]byte) string = base64.URLEncoding.EncodeToString
+
+// DecodeTokenCookie accepts a cookie value and returns the encoded
+// token.
+var DecodeTokenCookie func(string) ([]byte, error) = base64.URLEncoding.DecodeString
+
 // LoadTokensFromHttpRequest loads all tokens it can find in the
 // headers and query string of an http query.
 func (a *Credentials) LoadTokensFromHTTPRequest(r *http.Request) {
@@ -51,10 +61,24 @@ func (a *Credentials) LoadTokensFromHTTPRequest(r *http.Request) {
                a.Tokens = append(a.Tokens, val...)
        }
 
+       a.loadTokenFromCookie(r)
+
        // TODO: Load token from Rails session cookie (if Rails site
        // secret is known)
 }
 
+func (a *Credentials) loadTokenFromCookie(r *http.Request) {
+       cookie, err := r.Cookie("arvados_api_token")
+       if err != nil || len(cookie.Value) == 0 {
+               return
+       }
+       token, err := DecodeTokenCookie(cookie.Value)
+       if err != nil {
+               return
+       }
+       a.Tokens = append(a.Tokens, string(token))
+}
+
 // TODO: LoadTokensFromHttpRequestBody(). We can't assume in
 // LoadTokensFromHttpRequest() that [or how] we should read and parse
 // the request body. This has to be requested explicitly by the
diff --git a/sdk/go/keepclient/collectionreader.go b/sdk/go/keepclient/collectionreader.go
new file mode 100644 (file)
index 0000000..68ecc6e
--- /dev/null
@@ -0,0 +1,252 @@
+package keepclient
+
+import (
+       "errors"
+       "io"
+       "os"
+
+       "git.curoverse.com/arvados.git/sdk/go/manifest"
+)
+
+// ReadCloserWithLen extends io.ReadCloser with a Len() method that
+// returns the total number of bytes available to read.
+type ReadCloserWithLen interface {
+       io.ReadCloser
+       Len() uint64
+}
+
+const (
+       // After reading a data block from Keep, cfReader slices it up
+       // and sends the slices to a buffered channel to be consumed
+       // by the caller via Read().
+       //
+       // dataSliceSize is the maximum size of the slices, and
+       // therefore the maximum number of bytes that will be returned
+       // by a single call to Read().
+       dataSliceSize = 1 << 20
+)
+
+// ErrNoManifest indicates the given collection has no manifest
+// information (e.g., manifest_text was excluded by a "select"
+// parameter when retrieving the collection record).
+var ErrNoManifest = errors.New("Collection has no manifest")
+
+// CollectionFileReader returns a ReadCloserWithLen that reads file
+// content from a collection. The filename must be given relative to
+// the root of the collection, without a leading "./".
+func (kc *KeepClient) CollectionFileReader(collection map[string]interface{}, filename string) (ReadCloserWithLen, error) {
+       mText, ok := collection["manifest_text"].(string)
+       if !ok {
+               return nil, ErrNoManifest
+       }
+       m := manifest.Manifest{Text: mText}
+       rdrChan := make(chan *cfReader)
+       go kc.queueSegmentsToGet(m, filename, rdrChan)
+       r, ok := <-rdrChan
+       if !ok {
+               return nil, os.ErrNotExist
+       }
+       return r, nil
+}
+
+// Send segments for the specified file to r.toGet. Send a *cfReader
+// to rdrChan if the specified file is found (even if it's empty).
+// Then, close rdrChan.
+func (kc *KeepClient) queueSegmentsToGet(m manifest.Manifest, filename string, rdrChan chan *cfReader) {
+       defer close(rdrChan)
+
+       // q is a queue of FileSegments that we have received but
+       // haven't yet been able to send to toGet.
+       var q []*manifest.FileSegment
+       var r *cfReader
+       for seg := range m.FileSegmentIterByName(filename) {
+               if r == nil {
+                       // We've just discovered that the requested
+                       // filename does appear in the manifest, so we
+                       // can return a real reader (not nil) from
+                       // CollectionFileReader().
+                       r = newCFReader(kc)
+                       rdrChan <- r
+               }
+               q = append(q, seg)
+               r.totalSize += uint64(seg.Len)
+               // Send toGet as many segments as we can until it
+               // blocks.
+       Q:
+               for len(q) > 0 {
+                       select {
+                       case r.toGet <- q[0]:
+                               q = q[1:]
+                       default:
+                               break Q
+                       }
+               }
+       }
+       if r == nil {
+               // File not found.
+               return
+       }
+       close(r.countDone)
+       for _, seg := range q {
+               r.toGet <- seg
+       }
+       close(r.toGet)
+}
+
+type cfReader struct {
+       keepClient *KeepClient
+
+       // doGet() reads FileSegments from toGet, gets the data from
+       // Keep, and sends byte slices to toRead to be consumed by
+       // Read().
+       toGet chan *manifest.FileSegment
+
+       // toRead is a buffered channel, sized to fit one full Keep
+       // block. This lets us verify checksums without having a
+       // store-and-forward delay between blocks: by the time the
+       // caller starts receiving data from block N, cfReader is
+       // starting to fetch block N+1. A larger buffer would be
+       // useful for a caller whose read speed varies a lot.
+       toRead chan []byte
+
+       // bytes ready to send next time someone calls Read()
+       buf []byte
+
+       // Total size of the file being read. Not safe to read this
+       // until countDone is closed.
+       totalSize uint64
+       countDone chan struct{}
+
+       // First error encountered.
+       err error
+
+       // errNotNil is closed IFF err contains a non-nil error.
+       // Receiving from it will block until an error occurs.
+       errNotNil chan struct{}
+
+       // rdrClosed is closed IFF the reader's Close() method has
+       // been called. Any goroutines associated with the reader will
+       // stop and free up resources when they notice this channel is
+       // closed.
+       rdrClosed chan struct{}
+}
+
+func (r *cfReader) Read(outbuf []byte) (int, error) {
+       if r.Error() != nil {
+               // Short circuit: the caller might as well find out
+               // now that we hit an error, even if there's buffered
+               // data we could return.
+               return 0, r.Error()
+       }
+       for len(r.buf) == 0 {
+               // Private buffer was emptied out by the last Read()
+               // (or this is the first Read() and r.buf is nil).
+               // Read from r.toRead until we get a non-empty slice
+               // or hit an error.
+               var ok bool
+               r.buf, ok = <-r.toRead
+               if r.Error() != nil {
+                       // Error encountered while waiting for bytes
+                       return 0, r.Error()
+               } else if !ok {
+                       // No more bytes to read, no error encountered
+                       return 0, io.EOF
+               }
+       }
+       // Copy as much as possible from our private buffer to the
+       // caller's buffer
+       n := len(r.buf)
+       if len(r.buf) > len(outbuf) {
+               n = len(outbuf)
+       }
+       copy(outbuf[:n], r.buf[:n])
+
+       // Next call to Read() will continue where we left off
+       r.buf = r.buf[n:]
+
+       return n, nil
+}
+
+// Close releases resources. It returns a non-nil error if an error
+// was encountered by the reader.
+func (r *cfReader) Close() error {
+       close(r.rdrClosed)
+       return r.Error()
+}
+
+// Error returns an error if one has been encountered, otherwise
+// nil. It is safe to call from any goroutine.
+func (r *cfReader) Error() error {
+       select {
+       case <-r.errNotNil:
+               return r.err
+       default:
+               return nil
+       }
+}
+
+// Len returns the total number of bytes in the file being read. If
+// necessary, it waits for manifest parsing to finish.
+func (r *cfReader) Len() uint64 {
+       // Wait for all segments to be counted
+       <-r.countDone
+       return r.totalSize
+}
+
+func (r *cfReader) doGet() {
+       defer close(r.toRead)
+GET:
+       for fs := range r.toGet {
+               rdr, _, _, err := r.keepClient.Get(fs.Locator)
+               if err != nil {
+                       r.err = err
+                       close(r.errNotNil)
+                       return
+               }
+               var buf = make([]byte, fs.Offset+fs.Len)
+               _, err = io.ReadFull(rdr, buf)
+               if err != nil {
+                       r.err = err
+                       close(r.errNotNil)
+                       return
+               }
+               for bOff, bLen := fs.Offset, dataSliceSize; bOff <= fs.Offset+fs.Len && bLen > 0; bOff += bLen {
+                       if bOff+bLen > fs.Offset+fs.Len {
+                               bLen = fs.Offset + fs.Len - bOff
+                       }
+                       select {
+                       case r.toRead <- buf[bOff : bOff+bLen]:
+                       case <-r.rdrClosed:
+                               // Reader is closed: no point sending
+                               // anything more to toRead.
+                               break GET
+                       }
+               }
+               // It is possible that r.rdrClosed is closed but we
+               // never noticed because r.toRead was also ready in
+               // every select{} above. Here we check before wasting
+               // a keepclient.Get() call.
+               select {
+               case <-r.rdrClosed:
+                       break GET
+               default:
+               }
+       }
+       // In case we exited the above loop early: before returning,
+       // drain the toGet channel so its sender doesn't sit around
+       // blocking forever.
+       for _ = range r.toGet {
+       }
+}
+
+func newCFReader(kc *KeepClient) (r *cfReader) {
+       r = new(cfReader)
+       r.keepClient = kc
+       r.rdrClosed = make(chan struct{})
+       r.errNotNil = make(chan struct{})
+       r.toGet = make(chan *manifest.FileSegment, 2)
+       r.toRead = make(chan []byte, (BLOCKSIZE+dataSliceSize-1)/dataSliceSize)
+       r.countDone = make(chan struct{})
+       go r.doGet()
+       return
+}
diff --git a/sdk/go/keepclient/collectionreader_test.go b/sdk/go/keepclient/collectionreader_test.go
new file mode 100644 (file)
index 0000000..9fb0d86
--- /dev/null
@@ -0,0 +1,223 @@
+package keepclient
+
+import (
+       "crypto/md5"
+       "crypto/rand"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "net/http"
+       "os"
+       "strconv"
+       "strings"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+       "git.curoverse.com/arvados.git/sdk/go/arvadostest"
+       check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&CollectionReaderUnit{})
+
+type CollectionReaderUnit struct {
+       arv     arvadosclient.ArvadosClient
+       kc      *KeepClient
+       handler SuccessHandler
+}
+
+func (s *CollectionReaderUnit) SetUpTest(c *check.C) {
+       var err error
+       s.arv, err = arvadosclient.MakeArvadosClient()
+       c.Assert(err, check.IsNil)
+       s.arv.ApiToken = arvadostest.ActiveToken
+
+       s.kc, err = MakeKeepClient(&s.arv)
+       c.Assert(err, check.IsNil)
+
+       s.handler = SuccessHandler{
+               disk: make(map[string][]byte),
+               lock: make(chan struct{}, 1),
+               ops:  new(int),
+       }
+       localRoots := make(map[string]string)
+       for i, k := range RunSomeFakeKeepServers(s.handler, 4) {
+               localRoots[fmt.Sprintf("zzzzz-bi6l4-fakefakefake%03d", i)] = k.url
+       }
+       s.kc.SetServiceRoots(localRoots, localRoots, nil)
+}
+
+type SuccessHandler struct {
+       disk map[string][]byte
+       lock chan struct{} // channel with buffer==1: full when an operation is in progress.
+       ops  *int          // number of operations completed
+}
+
+func (h SuccessHandler) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
+       switch req.Method {
+       case "PUT":
+               buf, err := ioutil.ReadAll(req.Body)
+               if err != nil {
+                       resp.WriteHeader(500)
+                       return
+               }
+               pdh := fmt.Sprintf("%x+%d", md5.Sum(buf), len(buf))
+               h.lock <- struct{}{}
+               h.disk[pdh] = buf
+               if h.ops != nil {
+                       (*h.ops)++
+               }
+               <-h.lock
+               resp.Write([]byte(pdh))
+       case "GET":
+               pdh := req.URL.Path[1:]
+               h.lock <- struct{}{}
+               buf, ok := h.disk[pdh]
+               if h.ops != nil {
+                       (*h.ops)++
+               }
+               <-h.lock
+               if !ok {
+                       resp.WriteHeader(http.StatusNotFound)
+               } else {
+                       resp.Write(buf)
+               }
+       default:
+               resp.WriteHeader(http.StatusMethodNotAllowed)
+       }
+}
+
+type rdrTest struct {
+       mt   string      // manifest text
+       f    string      // filename
+       want interface{} // error or string to expect
+}
+
+func (s *CollectionReaderUnit) TestCollectionReaderContent(c *check.C) {
+       s.kc.PutB([]byte("foo"))
+       s.kc.PutB([]byte("bar"))
+       s.kc.PutB([]byte("Hello world\n"))
+       s.kc.PutB([]byte(""))
+
+       mt := arvadostest.PathologicalManifest
+
+       for _, testCase := range []rdrTest{
+               {mt: mt, f: "zzzz", want: os.ErrNotExist},
+               {mt: mt, f: "frob", want: os.ErrNotExist},
+               {mt: mt, f: "/segmented/frob", want: os.ErrNotExist},
+               {mt: mt, f: "./segmented/frob", want: os.ErrNotExist},
+               {mt: mt, f: "/f", want: os.ErrNotExist},
+               {mt: mt, f: "./f", want: os.ErrNotExist},
+               {mt: mt, f: "foo bar//baz", want: os.ErrNotExist},
+               {mt: mt, f: "foo/zero", want: ""},
+               {mt: mt, f: "zero@0", want: ""},
+               {mt: mt, f: "zero@1", want: ""},
+               {mt: mt, f: "zero@4", want: ""},
+               {mt: mt, f: "zero@9", want: ""},
+               {mt: mt, f: "f", want: "f"},
+               {mt: mt, f: "ooba", want: "ooba"},
+               {mt: mt, f: "overlapReverse/o", want: "o"},
+               {mt: mt, f: "overlapReverse/oo", want: "oo"},
+               {mt: mt, f: "overlapReverse/ofoo", want: "ofoo"},
+               {mt: mt, f: "foo bar/baz", want: "foo"},
+               {mt: mt, f: "segmented/frob", want: "frob"},
+               {mt: mt, f: "segmented/oof", want: "oof"},
+       } {
+               rdr, err := s.kc.CollectionFileReader(map[string]interface{}{"manifest_text": testCase.mt}, testCase.f)
+               switch want := testCase.want.(type) {
+               case error:
+                       c.Check(rdr, check.IsNil)
+                       c.Check(err, check.Equals, want)
+               case string:
+                       buf := make([]byte, len(want))
+                       n, err := io.ReadFull(rdr, buf)
+                       c.Check(err, check.IsNil)
+                       for i := 0; i < 4; i++ {
+                               c.Check(string(buf), check.Equals, want)
+                               n, err = rdr.Read(buf)
+                               c.Check(n, check.Equals, 0)
+                               c.Check(err, check.Equals, io.EOF)
+                       }
+                       c.Check(rdr.Close(), check.Equals, nil)
+               }
+       }
+}
+
+func (s *CollectionReaderUnit) TestCollectionReaderManyBlocks(c *check.C) {
+       h := md5.New()
+       buf := make([]byte, 4096)
+       locs := make([]string, len(buf))
+       filesize := 0
+       for i := 0; i < len(locs); i++ {
+               _, err := io.ReadFull(rand.Reader, buf[:i])
+               c.Assert(err, check.IsNil)
+               h.Write(buf[:i])
+               locs[i], _, err = s.kc.PutB(buf[:i])
+               c.Assert(err, check.IsNil)
+               filesize += i
+       }
+       manifest := "./random " + strings.Join(locs, " ") + " 0:" + strconv.Itoa(filesize) + ":bytes.bin\n"
+       dataMD5 := h.Sum(nil)
+
+       checkMD5 := md5.New()
+       rdr, err := s.kc.CollectionFileReader(map[string]interface{}{"manifest_text": manifest}, "random/bytes.bin")
+       c.Check(err, check.IsNil)
+       _, err = io.Copy(checkMD5, rdr)
+       c.Check(err, check.IsNil)
+       _, err = rdr.Read(make([]byte, 1))
+       c.Check(err, check.Equals, io.EOF)
+       c.Check(checkMD5.Sum(nil), check.DeepEquals, dataMD5)
+}
+
+func (s *CollectionReaderUnit) TestCollectionReaderCloseEarly(c *check.C) {
+       s.kc.PutB([]byte("foo"))
+
+       mt := ". "
+       for i := 0; i < 1000; i++ {
+               mt += "acbd18db4cc2f85cedef654fccc4a4d8+3 "
+       }
+       mt += "0:3000:foo1000.txt\n"
+
+       // Grab the stub server's lock, ensuring our cfReader doesn't
+       // get anything back from its first call to kc.Get() before we
+       // have a chance to call Close().
+       s.handler.lock <- struct{}{}
+       opsBeforeRead := *s.handler.ops
+
+       rdr, err := s.kc.CollectionFileReader(map[string]interface{}{"manifest_text": mt}, "foo1000.txt")
+       c.Assert(err, check.IsNil)
+
+       firstReadDone := make(chan struct{})
+       go func() {
+               rdr.Read(make([]byte, 6))
+               firstReadDone <- struct{}{}
+       }()
+       err = rdr.Close()
+       c.Assert(err, check.IsNil)
+       c.Assert(rdr.(*cfReader).Error(), check.IsNil)
+
+       // Release the stub server's lock. The first GET operation will proceed.
+       <-s.handler.lock
+
+       // Make sure our first read operation consumes the data
+       // received from the first GET.
+       <-firstReadDone
+
+       // doGet() should close toRead before sending any more bufs to it.
+       if what, ok := <-rdr.(*cfReader).toRead; ok {
+               c.Errorf("Got %q, expected toRead to be closed", string(what))
+       }
+
+       // Stub should have handled exactly one GET request.
+       c.Assert(*s.handler.ops, check.Equals, opsBeforeRead+1)
+}
+
+func (s *CollectionReaderUnit) TestCollectionReaderDataError(c *check.C) {
+       manifest := ". ffffffffffffffffffffffffffffffff+1 0:1:notfound.txt\n"
+       buf := make([]byte, 1)
+       rdr, err := s.kc.CollectionFileReader(map[string]interface{}{"manifest_text": manifest}, "notfound.txt")
+       c.Check(err, check.IsNil)
+       for i := 0; i < 2; i++ {
+               _, err = io.ReadFull(rdr, buf)
+               c.Check(err, check.NotNil)
+               c.Check(err, check.Not(check.Equals), io.EOF)
+       }
+}
index 2f809b32560b7e6072fd6242963e18b2ab9430df..0e6fadcc3548c99d2e85c30219a74ffdce42bf72 100644 (file)
@@ -200,7 +200,7 @@ func (kc *KeepClient) getOrHead(method string, locator string) (io.ReadCloser, i
                                retryList = append(retryList, host)
                        } else if resp.StatusCode != http.StatusOK {
                                var respbody []byte
-                               respbody, _ = ioutil.ReadAll(&io.LimitedReader{resp.Body, 4096})
+                               respbody, _ = ioutil.ReadAll(&io.LimitedReader{R: resp.Body, N: 4096})
                                resp.Body.Close()
                                errs = append(errs, fmt.Sprintf("%s: HTTP %d %q",
                                        url, resp.StatusCode, bytes.TrimSpace(respbody)))
index bd9b2e35f79be1fe5c6c1fc125b4449875f97f2c..4a210243defcd0bd33ea130dd17c96b2c151534f 100644 (file)
@@ -169,7 +169,7 @@ type uploadStatus struct {
        response        string
 }
 
-func (this KeepClient) uploadToKeepServer(host string, hash string, body io.ReadCloser,
+func (this *KeepClient) uploadToKeepServer(host string, hash string, body io.ReadCloser,
        upload_status chan<- uploadStatus, expectedLength int64, requestId string) {
 
        var req *http.Request
@@ -214,7 +214,7 @@ func (this KeepClient) uploadToKeepServer(host string, hash string, body io.Read
        defer resp.Body.Close()
        defer io.Copy(ioutil.Discard, resp.Body)
 
-       respbody, err2 := ioutil.ReadAll(&io.LimitedReader{resp.Body, 4096})
+       respbody, err2 := ioutil.ReadAll(&io.LimitedReader{R: resp.Body, N: 4096})
        response := strings.TrimSpace(string(respbody))
        if err2 != nil && err2 != io.EOF {
                log.Printf("[%v] Upload %v error: %v response: %v", requestId, url, err2.Error(), response)
@@ -228,7 +228,7 @@ func (this KeepClient) uploadToKeepServer(host string, hash string, body io.Read
        }
 }
 
-func (this KeepClient) putReplicas(
+func (this *KeepClient) putReplicas(
        hash string,
        tr *streamer.AsyncStream,
        expectedLength int64) (locator string, replicas int, err error) {
index 4e816cd73b30abbb7afce9f4b51d3767a897cfa8..f104d9a1035127c3f75502854e176c72a4017e1d 100644 (file)
 package manifest
 
 import (
+       "errors"
+       "fmt"
        "git.curoverse.com/arvados.git/sdk/go/blockdigest"
        "log"
+       "regexp"
+       "strconv"
        "strings"
 )
 
+var ErrInvalidToken = errors.New("Invalid token")
+
+var LocatorPattern = regexp.MustCompile(
+       "^[0-9a-fA-F]{32}\\+[0-9]+(\\+[A-Z][A-Za-z0-9@_-]+)*$")
+
 type Manifest struct {
        Text string
 }
 
+type BlockLocator struct {
+       Digest blockdigest.BlockDigest
+       Size   int
+       Hints  []string
+}
+
+type DataSegment struct {
+       BlockLocator
+       Locator      string
+       StreamOffset uint64
+}
+
+// FileSegment is a portion of a file that is contained within a
+// single block.
+type FileSegment struct {
+       Locator string
+       // Offset (within this block) of this data segment
+       Offset int
+       Len    int
+}
+
 // Represents a single line from a manifest.
 type ManifestStream struct {
        StreamName string
        Blocks     []string
-       Files      []string
+       FileTokens []string
+}
+
+var escapeSeq = regexp.MustCompile(`\\([0-9]{3}|\\)`)
+
+func unescapeSeq(seq string) string {
+       if seq == `\\` {
+               return `\`
+       }
+       i, err := strconv.ParseUint(seq[1:], 8, 8)
+       if err != nil {
+               // Invalid escape sequence: can't unescape.
+               return seq
+       }
+       return string([]byte{byte(i)})
+}
+
+func UnescapeName(s string) string {
+       return escapeSeq.ReplaceAllStringFunc(s, unescapeSeq)
+}
+
+func ParseBlockLocator(s string) (b BlockLocator, err error) {
+       if !LocatorPattern.MatchString(s) {
+               err = fmt.Errorf("String \"%s\" does not match BlockLocator pattern "+
+                       "\"%s\".",
+                       s,
+                       LocatorPattern.String())
+       } else {
+               tokens := strings.Split(s, "+")
+               var blockSize int64
+               var blockDigest blockdigest.BlockDigest
+               // We expect both of the following to succeed since LocatorPattern
+               // restricts the strings appropriately.
+               blockDigest, err = blockdigest.FromString(tokens[0])
+               if err != nil {
+                       return
+               }
+               blockSize, err = strconv.ParseInt(tokens[1], 10, 0)
+               if err != nil {
+                       return
+               }
+               b.Digest = blockDigest
+               b.Size = int(blockSize)
+               b.Hints = tokens[2:]
+       }
+       return
+}
+
+func parseFileToken(tok string) (segPos, segLen uint64, name string, err error) {
+       parts := strings.SplitN(tok, ":", 3)
+       if len(parts) != 3 {
+               err = ErrInvalidToken
+               return
+       }
+       segPos, err = strconv.ParseUint(parts[0], 10, 64)
+       if err != nil {
+               return
+       }
+       segLen, err = strconv.ParseUint(parts[1], 10, 64)
+       if err != nil {
+               return
+       }
+       name = UnescapeName(parts[2])
+       return
+}
+
+func (s *ManifestStream) FileSegmentIterByName(filepath string) <-chan *FileSegment {
+       ch := make(chan *FileSegment)
+       go func() {
+               s.sendFileSegmentIterByName(filepath, ch)
+               close(ch)
+       }()
+       return ch
+}
+
+func (s *ManifestStream) sendFileSegmentIterByName(filepath string, ch chan<- *FileSegment) {
+       blockLens := make([]int, 0, len(s.Blocks))
+       // This is what streamName+"/"+fileName will look like:
+       target := "./" + filepath
+       for _, fTok := range s.FileTokens {
+               wantPos, wantLen, name, err := parseFileToken(fTok)
+               if err != nil {
+                       // Skip (!) invalid file tokens.
+                       continue
+               }
+               if s.StreamName+"/"+name != target {
+                       continue
+               }
+               if wantLen == 0 {
+                       ch <- &FileSegment{Locator: "d41d8cd98f00b204e9800998ecf8427e+0", Offset: 0, Len: 0}
+                       continue
+               }
+               // Linear search for blocks containing data for this
+               // file
+               var blockPos uint64 = 0 // position of block in stream
+               for i, loc := range s.Blocks {
+                       if blockPos >= wantPos+wantLen {
+                               break
+                       }
+                       if len(blockLens) <= i {
+                               blockLens = blockLens[:i+1]
+                               b, err := ParseBlockLocator(loc)
+                               if err != nil {
+                                       // Unparseable locator -> unusable
+                                       // stream.
+                                       ch <- nil
+                                       return
+                               }
+                               blockLens[i] = b.Size
+                       }
+                       blockLen := uint64(blockLens[i])
+                       if blockPos+blockLen <= wantPos {
+                               blockPos += blockLen
+                               continue
+                       }
+                       fseg := FileSegment{
+                               Locator: loc,
+                               Offset:  0,
+                               Len:     blockLens[i],
+                       }
+                       if blockPos < wantPos {
+                               fseg.Offset = int(wantPos - blockPos)
+                               fseg.Len -= fseg.Offset
+                       }
+                       if blockPos+blockLen > wantPos+wantLen {
+                               fseg.Len = int(wantPos+wantLen-blockPos) - fseg.Offset
+                       }
+                       ch <- &fseg
+                       blockPos += blockLen
+               }
+       }
 }
 
 func parseManifestStream(s string) (m ManifestStream) {
        tokens := strings.Split(s, " ")
-       m.StreamName = tokens[0]
+       m.StreamName = UnescapeName(tokens[0])
        tokens = tokens[1:]
        var i int
        for i = range tokens {
@@ -32,7 +192,7 @@ func parseManifestStream(s string) (m ManifestStream) {
                }
        }
        m.Blocks = tokens[:i]
-       m.Files = tokens[i:]
+       m.FileTokens = tokens[i:]
        return
 }
 
@@ -58,6 +218,20 @@ func (m *Manifest) StreamIter() <-chan ManifestStream {
        return ch
 }
 
+func (m *Manifest) FileSegmentIterByName(filepath string) <-chan *FileSegment {
+       ch := make(chan *FileSegment)
+       go func() {
+               for stream := range m.StreamIter() {
+                       if !strings.HasPrefix("./"+filepath, stream.StreamName+"/") {
+                               continue
+                       }
+                       stream.sendFileSegmentIterByName(filepath, ch)
+               }
+               close(ch)
+       }()
+       return ch
+}
+
 // Blocks may appear mulitple times within the same manifest if they
 // are used by multiple files. In that case this Iterator will output
 // the same block multiple times.
index 8cfe3d907721e4f7d33048dfa16ef91e38dec12d..364648d643cea64dadc706eb1f5c42e227ccbb39 100644 (file)
@@ -1,10 +1,13 @@
 package manifest
 
 import (
-       "git.curoverse.com/arvados.git/sdk/go/blockdigest"
        "io/ioutil"
+       "reflect"
        "runtime"
        "testing"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvadostest"
+       "git.curoverse.com/arvados.git/sdk/go/blockdigest"
 )
 
 func getStackTrace() string {
@@ -60,7 +63,7 @@ func expectStringSlicesEqual(t *testing.T, actual []string, expected []string) {
 func expectManifestStream(t *testing.T, actual ManifestStream, expected ManifestStream) {
        expectEqual(t, actual.StreamName, expected.StreamName)
        expectStringSlicesEqual(t, actual.Blocks, expected.Blocks)
-       expectStringSlicesEqual(t, actual.Files, expected.Files)
+       expectStringSlicesEqual(t, actual.FileTokens, expected.FileTokens)
 }
 
 func expectBlockLocator(t *testing.T, actual blockdigest.BlockLocator, expected blockdigest.BlockLocator) {
@@ -72,8 +75,19 @@ func expectBlockLocator(t *testing.T, actual blockdigest.BlockLocator, expected
 func TestParseManifestStreamSimple(t *testing.T) {
        m := parseManifestStream(". 365f83f5f808896ec834c8b595288735+2310+K@qr1hi+Af0c9a66381f3b028677411926f0be1c6282fe67c@542b5ddf 0:2310:qr1hi-8i9sb-ienvmpve1a0vpoi.log.txt")
        expectManifestStream(t, m, ManifestStream{StreamName: ".",
-               Blocks: []string{"365f83f5f808896ec834c8b595288735+2310+K@qr1hi+Af0c9a66381f3b028677411926f0be1c6282fe67c@542b5ddf"},
-               Files:  []string{"0:2310:qr1hi-8i9sb-ienvmpve1a0vpoi.log.txt"}})
+               Blocks:     []string{"365f83f5f808896ec834c8b595288735+2310+K@qr1hi+Af0c9a66381f3b028677411926f0be1c6282fe67c@542b5ddf"},
+               FileTokens: []string{"0:2310:qr1hi-8i9sb-ienvmpve1a0vpoi.log.txt"}})
+}
+
+func TestParseBlockLocatorSimple(t *testing.T) {
+       b, err := ParseBlockLocator("365f83f5f808896ec834c8b595288735+2310+K@qr1hi+Af0c9a66381f3b028677411926f0be1c6282fe67c@542b5ddf")
+       if err != nil {
+               t.Fatalf("Unexpected error parsing block locator: %v", err)
+       }
+       expectBlockLocator(t, b, BlockLocator{Digest: blockdigest.AssertFromString("365f83f5f808896ec834c8b595288735"),
+               Size: 2310,
+               Hints: []string{"K@qr1hi",
+                       "Af0c9a66381f3b028677411926f0be1c6282fe67c@542b5ddf"}})
 }
 
 func TestStreamIterShortManifestWithBlankStreams(t *testing.T) {
@@ -88,8 +102,8 @@ func TestStreamIterShortManifestWithBlankStreams(t *testing.T) {
        expectManifestStream(t,
                firstStream,
                ManifestStream{StreamName: ".",
-                       Blocks: []string{"b746e3d2104645f2f64cd3cc69dd895d+15693477+E2866e643690156651c03d876e638e674dcd79475@5441920c"},
-                       Files:  []string{"0:15893477:chr10_band0_s0_e3000000.fj"}})
+                       Blocks:     []string{"b746e3d2104645f2f64cd3cc69dd895d+15693477+E2866e643690156651c03d876e638e674dcd79475@5441920c"},
+                       FileTokens: []string{"0:15893477:chr10_band0_s0_e3000000.fj"}})
 
        received, ok := <-streamIter
        if ok {
@@ -126,3 +140,58 @@ func TestBlockIterLongManifest(t *testing.T) {
                        Size:  31367794,
                        Hints: []string{"E53f903684239bcc114f7bf8ff9bd6089f33058db@5441920c"}})
 }
+
+func TestUnescape(t *testing.T) {
+       for _, testCase := range [][]string{
+               {`\040`, ` `},
+               {`\009`, `\009`},
+               {`\\\040\\`, `\ \`},
+               {`\\040\`, `\040\`},
+       } {
+               in := testCase[0]
+               expect := testCase[1]
+               got := UnescapeName(in)
+               if expect != got {
+                       t.Errorf("For '%s' got '%s' instead of '%s'", in, got, expect)
+               }
+       }
+}
+
+type fsegtest struct {
+       mt   string        // manifest text
+       f    string        // filename
+       want []FileSegment // segments should be received on channel
+}
+
+func TestFileSegmentIterByName(t *testing.T) {
+       mt := arvadostest.PathologicalManifest
+       for _, testCase := range []fsegtest{
+               {mt: mt, f: "zzzz", want: nil},
+               // This case is too sensitive: it would be acceptable
+               // (even preferable) to return only one empty segment.
+               {mt: mt, f: "foo/zero", want: []FileSegment{{"d41d8cd98f00b204e9800998ecf8427e+0", 0, 0}, {"d41d8cd98f00b204e9800998ecf8427e+0", 0, 0}}},
+               {mt: mt, f: "zero@0", want: []FileSegment{{"d41d8cd98f00b204e9800998ecf8427e+0", 0, 0}}},
+               {mt: mt, f: "zero@1", want: []FileSegment{{"d41d8cd98f00b204e9800998ecf8427e+0", 0, 0}}},
+               {mt: mt, f: "zero@4", want: []FileSegment{{"d41d8cd98f00b204e9800998ecf8427e+0", 0, 0}}},
+               {mt: mt, f: "zero@9", want: []FileSegment{{"d41d8cd98f00b204e9800998ecf8427e+0", 0, 0}}},
+               {mt: mt, f: "f", want: []FileSegment{{"acbd18db4cc2f85cedef654fccc4a4d8+3", 0, 1}}},
+               {mt: mt, f: "ooba", want: []FileSegment{{"acbd18db4cc2f85cedef654fccc4a4d8+3", 1, 2}, {"37b51d194a7513e45b56f6524f2d51f2+3", 0, 2}}},
+               {mt: mt, f: "overlapReverse/o", want: []FileSegment{{"acbd18db4cc2f85cedef654fccc4a4d8+3", 2, 1}}},
+               {mt: mt, f: "overlapReverse/oo", want: []FileSegment{{"acbd18db4cc2f85cedef654fccc4a4d8+3", 1, 2}}},
+               {mt: mt, f: "overlapReverse/ofoo", want: []FileSegment{{"acbd18db4cc2f85cedef654fccc4a4d8+3", 2, 1}, {"acbd18db4cc2f85cedef654fccc4a4d8+3", 0, 3}}},
+               {mt: mt, f: "foo bar/baz", want: []FileSegment{{"acbd18db4cc2f85cedef654fccc4a4d8+3", 0, 3}}},
+               // This case is too sensitive: it would be better to
+               // omit the empty segment.
+               {mt: mt, f: "segmented/frob", want: []FileSegment{{"acbd18db4cc2f85cedef654fccc4a4d8+3", 0, 1}, {"37b51d194a7513e45b56f6524f2d51f2+3", 2, 1}, {"acbd18db4cc2f85cedef654fccc4a4d8+3", 1, 1}, {"d41d8cd98f00b204e9800998ecf8427e+0", 0, 0}, {"37b51d194a7513e45b56f6524f2d51f2+3", 0, 1}}},
+               {mt: mt, f: "segmented/oof", want: []FileSegment{{"acbd18db4cc2f85cedef654fccc4a4d8+3", 1, 2}, {"acbd18db4cc2f85cedef654fccc4a4d8+3", 0, 1}}},
+       } {
+               m := Manifest{Text: testCase.mt}
+               var got []FileSegment
+               for fs := range m.FileSegmentIterByName(testCase.f) {
+                       got = append(got, *fs)
+               }
+               if !reflect.DeepEqual(got, testCase.want) {
+                       t.Errorf("For %#v:\n got  %#v\n want %#v", testCase.f, got, testCase.want)
+               }
+       }
+}
index d7f6f92f186341d9da550ff6b4e95f132787f04f..4ba5a3b52ddce951dad70f465e60ab7aa236969a 100644 (file)
@@ -299,7 +299,7 @@ collection_with_files_in_subdir:
   modified_by_user_uuid: zzzzz-tpzed-user1withloadab
   modified_at: 2014-02-03T17:22:54Z
   updated_at: 2014-02-03T17:22:54Z
-  manifest_text: ". 85877ca2d7e05498dd3d109baf2df106+95+A3a4e26a366ee7e4ed3e476ccf05354761be2e4ae@545a9920 0:95:file_in_subdir1\n./subdir2/subdir3 2bbc341c702df4d8f42ec31f16c10120+64+A315d7e7bad2ce937e711fc454fae2d1194d14d64@545a9920 0:32:file1_in_subdir3.txt 32:32:file2_in_subdir3.txt\n./subdir2/subdir3/subdir4 2bbc341c702df4d8f42ec31f16c10120+64+A315d7e7bad2ce937e711fc454fae2d1194d14d64@545a9920 0:32:file1_in_subdir4.txt 32:32:file2_in_subdir4.txt"
+  manifest_text: ". 85877ca2d7e05498dd3d109baf2df106+95 0:95:file_in_subdir1\n./subdir2/subdir3 2bbc341c702df4d8f42ec31f16c10120+64 0:32:file1_in_subdir3.txt 32:32:file2_in_subdir3.txt\n./subdir2/subdir3/subdir4 2bbc341c702df4d8f42ec31f16c10120+64 0:32:file1_in_subdir4.txt 32:32:file2_in_subdir4.txt"
 
 graph_test_collection1:
   uuid: zzzzz-4zz18-bv31uwvy3neko22
@@ -481,7 +481,7 @@ collection_with_repeated_filenames_and_contents_in_two_dirs_1:
   modified_at: 2014-02-03T17:22:54Z
   updated_at: 2014-02-03T17:22:54Z
   name: collection_with_repeated_filenames_and_contents_in_two_dirs_1
-  manifest_text: "./dir1 92b53930db60fe94be2a73fc771ba921+34+Af966b611a1e6a7df18e0f20ac742a255c27744b7@550a3f11 0:12:alice 12:12:alice.txt 24:10:bob.txt\n./dir2 56ac2557b1ded11ccab7293dc47d1e88+44+A1780092551dadcb9c74190a793a779cea84d632d@550a3f11 0:27:alice.txt\n"
+  manifest_text: "./dir1 92b53930db60fe94be2a73fc771ba921+34 0:12:alice 12:12:alice.txt 24:10:bob.txt\n./dir2 56ac2557b1ded11ccab7293dc47d1e88+44 0:27:alice.txt\n"
 
 collection_with_repeated_filenames_and_contents_in_two_dirs_2:
   uuid: zzzzz-4zz18-duplicatenames2
@@ -493,7 +493,7 @@ collection_with_repeated_filenames_and_contents_in_two_dirs_2:
   modified_at: 2014-02-03T17:22:54Z
   updated_at: 2014-02-03T17:22:54Z
   name: collection_with_repeated_filenames_and_contents_in_two_dirs_2
-  manifest_text: "./dir1 92b53930db60fe94be2a73fc771ba921+34+Af966b611a1e6a7df18e0f20ac742a255c27744b7@550a3f11 0:12:alice 12:12:alice.txt 24:10:carol.txt\n./dir2 56ac2557b1ded11ccab7293dc47d1e88+44+A1780092551dadcb9c74190a793a779cea84d632d@550a3f11 0:27:alice.txt\n"
+  manifest_text: "./dir1 92b53930db60fe94be2a73fc771ba921+34 0:12:alice 12:12:alice.txt 24:10:carol.txt\n./dir2 56ac2557b1ded11ccab7293dc47d1e88+44 0:27:alice.txt\n"
 
 foo_and_bar_files_in_dir:
   uuid: zzzzz-4zz18-foonbarfilesdir
@@ -505,7 +505,7 @@ foo_and_bar_files_in_dir:
   modified_at: 2014-02-03T17:22:54Z
   updated_at: 2014-02-03T17:22:54Z
   name: foo_file_in_dir
-  manifest_text: "./dir1 a84b928ebdbae3f658518c711beaec02+28+A0cff02249e70e8cd6e55dba49fef4afa3f5bfdfb@550acd28 0:3:bar 12:16:foo\n"
+  manifest_text: "./dir1 3858f62230ac3c915f300c664312c63f+6 3:3:bar 0:3:foo\n"
 
 multi_level_to_combine:
   uuid: zzzzz-4zz18-pyw8yp9g3ujh45f
index 13f4dc60f71db9567e26f534c2379a63c601289b..69f31afbc9589ce6cd6c9de2a731d5093e2c80cd 100644 (file)
@@ -101,7 +101,7 @@ func TestCopyPipeToChildLogLongLines(t *testing.T) {
        }
 
        if after, err := rcv.ReadBytes('\n'); err != nil || string(after) != "after\n" {
-               t.Fatal("\"after\n\" not received (got \"%s\", %s)", after, err)
+               t.Fatalf("\"after\n\" not received (got \"%s\", %s)", after, err)
        }
 
        select {
index 86c2b089aa13088d8da8f524ab21dcc8dafc9641..3a9c21a43855a472c4f5b43aa9b651fd85f506d4 100644 (file)
@@ -23,7 +23,7 @@ import (
 
 // ServerAddress struct
 type ServerAddress struct {
-       SSL         bool   `json:service_ssl_flag`
+       SSL         bool   `json:"service_ssl_flag"`
        Host        string `json:"service_host"`
        Port        int    `json:"service_port"`
        UUID        string `json:"uuid"`
diff --git a/services/keep-web/.gitignore b/services/keep-web/.gitignore
new file mode 100644 (file)
index 0000000..53997c2
--- /dev/null
@@ -0,0 +1 @@
+keep-web
diff --git a/services/keep-web/doc.go b/services/keep-web/doc.go
new file mode 100644 (file)
index 0000000..7e0a00f
--- /dev/null
@@ -0,0 +1,224 @@
+// Keep-web provides read-only HTTP access to files stored in Keep. It
+// serves public data to anonymous and unauthenticated clients, and
+// serves private data to clients that supply Arvados API tokens. It
+// can be installed anywhere with access to Keep services, typically
+// behind a web proxy that supports TLS.
+//
+// See http://doc.arvados.org/install/install-keep-web.html.
+//
+// Run "keep-web -help" to show all supported options.
+//
+// Starting the server
+//
+// Serve HTTP requests at port 1234 on all interfaces:
+//
+//   keep-web -address=:1234
+//
+// Serve HTTP requests at port 1234 on the interface with IP address 1.2.3.4:
+//
+//   keep-web -address=1.2.3.4:1234
+//
+// Proxy configuration
+//
+// Keep-web does not support SSL natively. Typically, it is installed
+// behind a proxy like nginx.
+//
+// Here is an example nginx configuration.
+//
+//     http {
+//       upstream keep-web {
+//         server localhost:1234;
+//       }
+//       server {
+//         listen *:443 ssl;
+//         server_name collections.example.com *.collections.example.com ~.*--collections.example.com;
+//         ssl_certificate /root/wildcard.example.com.crt;
+//         ssl_certificate_key /root/wildcard.example.com.key;
+//         location  / {
+//           proxy_pass http://keep-web;
+//           proxy_set_header Host $host;
+//           proxy_set_header X-Forwarded-For $remote_addr;
+//         }
+//       }
+//     }
+//
+// It is not necessary to run keep-web on the same host as the nginx
+// proxy. However, TLS is not used between nginx and keep-web, so
+// intervening networks must be secured by other means.
+//
+// Download URLs
+//
+// The following "same origin" URL patterns are supported for public
+// collections and collections shared anonymously via secret links
+// (i.e., collections which can be served by keep-web without making
+// use of any implicit credentials like cookies). See "Same-origin
+// URLs" below.
+//
+//   http://collections.example.com/c=uuid_or_pdh/path/file.txt
+//   http://collections.example.com/c=uuid_or_pdh/t=TOKEN/path/file.txt
+//
+// The following "multiple origin" URL patterns are supported for all
+// collections:
+//
+//   http://uuid_or_pdh--collections.example.com/path/file.txt
+//   http://uuid_or_pdh--collections.example.com/t=TOKEN/path/file.txt
+//
+// In the "multiple origin" form, the string "--" can be replaced with
+// "." with identical results (assuming the downstream proxy is
+// configured accordingly). These two are equivalent:
+//
+//   http://uuid_or_pdh--collections.example.com/path/file.txt
+//   http://uuid_or_pdh.collections.example.com/path/file.txt
+//
+// The first form (with "--" instead of ".") avoids the cost and
+// effort of deploying a wildcard TLS certificate for
+// *.collections.example.com at sites that already have a wildcard
+// certificate for *.example.com. The second form is likely to be
+// easier to configure, and more efficient to run, on a downstream
+// proxy.
+//
+// In all of the above forms, the "collections.example.com" part can
+// be anything at all: keep-web itself ignores everything after the
+// first "." or "--". (Of course, in order for clients to connect at
+// all, DNS and any relevant proxies must be configured accordingly.)
+//
+// In all of the above forms, the "uuid_or_pdh" part can be either a
+// collection UUID or a portable data hash with the "+" character
+// optionally replaced by "-". (Replacing "+" with "-" is mandatory
+// when "uuid_or_pdh" appears in the domain name only because "+" is
+// not a valid character in a domain name.)
+//
+// In all of the above forms, a top level directory called "_" is
+// skipped. In cases where the "path/file.txt" part might start with
+// "t=" or "c=" or "_/", links should be constructed with a leading
+// "_/" to ensure the top level directory is not interpreted as a
+// token or collection ID.
+//
+// Assuming there is a collection with UUID
+// zzzzz-4zz18-znfnqtbbv4spc3w and portable data hash
+// 1f4b0bc7583c2a7f9102c395f4ffc5e3+45, the following URLs are
+// interchangeable:
+//
+//   http://zzzzz-4zz18-znfnqtbbv4spc3w.collections.example.com/foo/bar.txt
+//   http://zzzzz-4zz18-znfnqtbbv4spc3w.collections.example.com/_/foo/bar.txt
+//   http://zzzzz-4zz18-znfnqtbbv4spc3w--collections.example.com/_/foo/bar.txt
+//   http://1f4b0bc7583c2a7f9102c395f4ffc5e3-45--foo.example.com/foo/bar.txt
+//   http://1f4b0bc7583c2a7f9102c395f4ffc5e3-45--.invalid/foo/bar.txt
+//
+// An additional form is supported specifically to make it more
+// convenient to maintain support for existing Workbench download
+// links:
+//
+//   http://collections.example.com/collections/download/uuid_or_pdh/TOKEN/foo/bar.txt
+//
+// A regular Workbench "download" link is also accepted, but
+// credentials passed via cookie, header, etc. are ignored. Only
+// public data can be served this way:
+//
+//   http://collections.example.com/collections/uuid_or_pdh/foo/bar.txt
+//
+// Authorization mechanisms
+//
+// A token can be provided in an Authorization header:
+//
+//   Authorization: OAuth2 o07j4px7RlJK4CuMYp7C0LDT4CzR1J1qBE5Avo7eCcUjOTikxK
+//
+// A base64-encoded token can be provided in a cookie named "api_token":
+//
+//   Cookie: api_token=bzA3ajRweDdSbEpLNEN1TVlwN0MwTERUNEN6UjFKMXFCRTVBdm83ZUNjVWpPVGlreEs=
+//
+// A token can be provided in an URL-encoded query string:
+//
+//   GET /foo/bar.txt?api_token=o07j4px7RlJK4CuMYp7C0LDT4CzR1J1qBE5Avo7eCcUjOTikxK
+//
+// A suitably encoded token can be provided in a POST body if the
+// request has a content type of application/x-www-form-urlencoded or
+// multipart/form-data:
+//
+//   POST /foo/bar.txt
+//   Content-Type: application/x-www-form-urlencoded
+//   [...]
+//   api_token=o07j4px7RlJK4CuMYp7C0LDT4CzR1J1qBE5Avo7eCcUjOTikxK
+//
+// If a token is provided in a query string or in a POST request, the
+// response is an HTTP 303 redirect to an equivalent GET request, with
+// the token stripped from the query string and added to a cookie
+// instead.
+//
+// Indexes
+//
+// Currently, keep-web does not generate HTML index listings, nor does
+// it serve a default file like "index.html" when a directory is
+// requested. These features are likely to be added in future
+// versions. Until then, keep-web responds with 404 if a directory
+// name (or any path ending with "/") is requested.
+//
+// Compatibility
+//
+// Client-provided authorization tokens are ignored if the client does
+// not provide a Host header.
+//
+// In order to use the query string or a POST form authorization
+// mechanisms, the client must follow 303 redirects; the client must
+// accept cookies with a 303 response and send those cookies when
+// performing the redirect; and either the client or an intervening
+// proxy must resolve a relative URL ("//host/path") if given in a
+// response Location header.
+//
+// Intranet mode
+//
+// Normally, Keep-web accepts requests for multiple collections using
+// the same host name, provided the client's credentials are not being
+// used. This provides insufficient XSS protection in an installation
+// where the "anonymously accessible" data is not truly public, but
+// merely protected by network topology.
+//
+// In such cases -- for example, a site which is not reachable from
+// the internet, where some data is world-readable from Arvados's
+// perspective but is intended to be available only to users within
+// the local network -- the downstream proxy should configured to
+// return 401 for all paths beginning with "/c=".
+//
+// Same-origin URLs
+//
+// Without the same-origin protection outlined above, a web page
+// stored in collection X could execute JavaScript code that uses the
+// current viewer's credentials to download additional data from
+// collection Y -- data which is accessible to the current viewer, but
+// not to the author of collection X -- from the same origin
+// (``https://collections.example.com/'') and upload it to some other
+// site chosen by the author of collection X.
+//
+// Attachment-Only host
+//
+// It is possible to serve untrusted content and accept user
+// credentials at the same origin as long as the content is only
+// downloaded, never executed by browsers. A single origin (hostname
+// and port) can be designated as an "attachment-only" origin: cookies
+// will be accepted and all responses will have a
+// "Content-Disposition: attachment" header. This behavior is invoked
+// only when the designated origin matches exactly the Host header
+// provided by the client or downstream proxy.
+//
+//   keep-web -address :9999 -attachment-only-host domain.example:9999
+//
+// Trust All Content mode
+//
+// In "trust all content" mode, Keep-web will accept credentials (API
+// tokens) and serve any collection X at
+// "https://collections.example.com/collections/X/path/file.ext".
+// This is UNSAFE except in the special case where everyone who is
+// able write ANY data to Keep, and every JavaScript and HTML file
+// written to Keep, is also trusted to read ALL of the data in Keep.
+//
+// In such cases you can enable trust-all-content mode.
+//
+//   keep-web -address :9999 -trust-all-content
+//
+// When using trust-all-content mode, the only effect of the
+// -attachment-only-host option is to add a "Content-Disposition:
+// attachment" header.
+//
+//   keep-web -address :9999 -attachment-only-host domain.example:9999 -trust-all-content
+//
+package main
diff --git a/services/keep-web/handler.go b/services/keep-web/handler.go
new file mode 100644 (file)
index 0000000..2704263
--- /dev/null
@@ -0,0 +1,319 @@
+package main
+
+import (
+       "flag"
+       "fmt"
+       "html"
+       "io"
+       "mime"
+       "net/http"
+       "net/url"
+       "os"
+       "strings"
+       "time"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+       "git.curoverse.com/arvados.git/sdk/go/auth"
+       "git.curoverse.com/arvados.git/sdk/go/httpserver"
+       "git.curoverse.com/arvados.git/sdk/go/keepclient"
+)
+
+type handler struct{}
+
+var (
+       clientPool         = arvadosclient.MakeClientPool()
+       trustAllContent    = false
+       anonymousTokens    []string
+       attachmentOnlyHost = ""
+)
+
+func init() {
+       flag.BoolVar(&trustAllContent, "trust-all-content", false,
+               "Serve non-public content from a single origin. Dangerous: read docs before using!")
+       flag.StringVar(&attachmentOnlyHost, "attachment-only-host", "",
+               "Accept credentials, and add \"Content-Disposition: attachment\" response headers, for requests at this hostname:port. Prohibiting inline display makes it possible to serve untrusted and non-public content from a single origin, i.e., without wildcard DNS or SSL.")
+}
+
+// return a UUID or PDH if s begins with a UUID or URL-encoded PDH;
+// otherwise return "".
+func parseCollectionIDFromDNSName(s string) string {
+       // Strip domain.
+       if i := strings.IndexRune(s, '.'); i >= 0 {
+               s = s[:i]
+       }
+       // Names like {uuid}--collections.example.com serve the same
+       // purpose as {uuid}.collections.example.com but can reduce
+       // cost/effort of using [additional] wildcard certificates.
+       if i := strings.Index(s, "--"); i >= 0 {
+               s = s[:i]
+       }
+       if arvadosclient.UUIDMatch(s) {
+               return s
+       }
+       if pdh := strings.Replace(s, "-", "+", 1); arvadosclient.PDHMatch(pdh) {
+               return pdh
+       }
+       return ""
+}
+
+var urlPDHDecoder = strings.NewReplacer(" ", "+", "-", "+")
+
+// return a UUID or PDH if s is a UUID or a PDH (even if it is a PDH
+// with "+" replaced by " " or "-"); otherwise return "".
+func parseCollectionIDFromURL(s string) string {
+       if arvadosclient.UUIDMatch(s) {
+               return s
+       }
+       if pdh := urlPDHDecoder.Replace(s); arvadosclient.PDHMatch(pdh) {
+               return pdh
+       }
+       return ""
+}
+
+func (h *handler) ServeHTTP(wOrig http.ResponseWriter, r *http.Request) {
+       var statusCode = 0
+       var statusText string
+
+       remoteAddr := r.RemoteAddr
+       if xff := r.Header.Get("X-Forwarded-For"); xff != "" {
+               remoteAddr = xff + "," + remoteAddr
+       }
+
+       w := httpserver.WrapResponseWriter(wOrig)
+       defer func() {
+               if statusCode == 0 {
+                       statusCode = w.WroteStatus()
+               } else if w.WroteStatus() == 0 {
+                       w.WriteHeader(statusCode)
+               } else if w.WroteStatus() != statusCode {
+                       httpserver.Log(r.RemoteAddr, "WARNING",
+                               fmt.Sprintf("Our status changed from %d to %d after we sent headers", w.WroteStatus(), statusCode))
+               }
+               if statusText == "" {
+                       statusText = http.StatusText(statusCode)
+               }
+               httpserver.Log(remoteAddr, statusCode, statusText, w.WroteBodyBytes(), r.Method, r.Host, r.URL.Path, r.URL.RawQuery)
+       }()
+
+       if r.Method != "GET" && r.Method != "POST" {
+               statusCode, statusText = http.StatusMethodNotAllowed, r.Method
+               return
+       }
+
+       arv := clientPool.Get()
+       if arv == nil {
+               statusCode, statusText = http.StatusInternalServerError, "Pool failed: "+clientPool.Err().Error()
+               return
+       }
+       defer clientPool.Put(arv)
+
+       pathParts := strings.Split(r.URL.Path[1:], "/")
+
+       var targetID string
+       var targetPath []string
+       var tokens []string
+       var reqTokens []string
+       var pathToken bool
+       var attachment bool
+       credentialsOK := trustAllContent
+
+       if r.Host != "" && r.Host == attachmentOnlyHost {
+               credentialsOK = true
+               attachment = true
+       } else if r.FormValue("disposition") == "attachment" {
+               attachment = true
+       }
+
+       if targetID = parseCollectionIDFromDNSName(r.Host); targetID != "" {
+               // http://ID.collections.example/PATH...
+               credentialsOK = true
+               targetPath = pathParts
+       } else if len(pathParts) >= 2 && strings.HasPrefix(pathParts[0], "c=") {
+               // /c=ID/PATH...
+               targetID = parseCollectionIDFromURL(pathParts[0][2:])
+               targetPath = pathParts[1:]
+       } else if len(pathParts) >= 3 && pathParts[0] == "collections" {
+               if len(pathParts) >= 5 && pathParts[1] == "download" {
+                       // /collections/download/ID/TOKEN/PATH...
+                       targetID = pathParts[2]
+                       tokens = []string{pathParts[3]}
+                       targetPath = pathParts[4:]
+                       pathToken = true
+               } else {
+                       // /collections/ID/PATH...
+                       targetID = pathParts[1]
+                       tokens = anonymousTokens
+                       targetPath = pathParts[2:]
+               }
+       } else {
+               statusCode = http.StatusNotFound
+               return
+       }
+       if t := r.FormValue("api_token"); t != "" {
+               // The client provided an explicit token in the query
+               // string, or a form in POST body. We must put the
+               // token in an HttpOnly cookie, and redirect to the
+               // same URL with the query param redacted and method =
+               // GET.
+
+               if !credentialsOK {
+                       // It is not safe to copy the provided token
+                       // into a cookie unless the current vhost
+                       // (origin) serves only a single collection or
+                       // we are in trustAllContent mode.
+                       statusCode = http.StatusBadRequest
+                       return
+               }
+
+               // The HttpOnly flag is necessary to prevent
+               // JavaScript code (included in, or loaded by, a page
+               // in the collection being served) from employing the
+               // user's token beyond reading other files in the same
+               // domain, i.e., same collection.
+               //
+               // The 303 redirect is necessary in the case of a GET
+               // request to avoid exposing the token in the Location
+               // bar, and in the case of a POST request to avoid
+               // raising warnings when the user refreshes the
+               // resulting page.
+
+               http.SetCookie(w, &http.Cookie{
+                       Name:     "arvados_api_token",
+                       Value:    auth.EncodeTokenCookie([]byte(t)),
+                       Path:     "/",
+                       Expires:  time.Now().AddDate(10, 0, 0),
+                       HttpOnly: true,
+               })
+               redir := (&url.URL{Host: r.Host, Path: r.URL.Path}).String()
+
+               w.Header().Add("Location", redir)
+               statusCode, statusText = http.StatusSeeOther, redir
+               w.WriteHeader(statusCode)
+               io.WriteString(w, `<A href="`)
+               io.WriteString(w, html.EscapeString(redir))
+               io.WriteString(w, `">Continue</A>`)
+               return
+       }
+
+       if tokens == nil && strings.HasPrefix(targetPath[0], "t=") {
+               // http://ID.example/t=TOKEN/PATH...
+               // /c=ID/t=TOKEN/PATH...
+               //
+               // This form must only be used to pass scoped tokens
+               // that give permission for a single collection. See
+               // FormValue case above.
+               tokens = []string{targetPath[0][2:]}
+               pathToken = true
+               targetPath = targetPath[1:]
+       }
+
+       if tokens == nil {
+               if credentialsOK {
+                       reqTokens = auth.NewCredentialsFromHTTPRequest(r).Tokens
+               }
+               tokens = append(reqTokens, anonymousTokens...)
+       }
+
+       if len(targetPath) > 0 && targetPath[0] == "_" {
+               // If a collection has a directory called "t=foo" or
+               // "_", it can be served at
+               // //collections.example/_/t=foo/ or
+               // //collections.example/_/_/ respectively:
+               // //collections.example/t=foo/ won't work because
+               // t=foo will be interpreted as a token "foo".
+               targetPath = targetPath[1:]
+       }
+
+       tokenResult := make(map[string]int)
+       collection := make(map[string]interface{})
+       found := false
+       for _, arv.ApiToken = range tokens {
+               err := arv.Get("collections", targetID, nil, &collection)
+               if err == nil {
+                       // Success
+                       found = true
+                       break
+               }
+               if srvErr, ok := err.(arvadosclient.APIServerError); ok {
+                       switch srvErr.HttpStatusCode {
+                       case 404, 401:
+                               // Token broken or insufficient to
+                               // retrieve collection
+                               tokenResult[arv.ApiToken] = srvErr.HttpStatusCode
+                               continue
+                       }
+               }
+               // Something more serious is wrong
+               statusCode, statusText = http.StatusInternalServerError, err.Error()
+               return
+       }
+       if !found {
+               if pathToken || !credentialsOK {
+                       // Either the URL is a "secret sharing link"
+                       // that didn't work out (and asking the client
+                       // for additional credentials would just be
+                       // confusing), or we don't even accept
+                       // credentials at this path.
+                       statusCode = http.StatusNotFound
+                       return
+               }
+               for _, t := range reqTokens {
+                       if tokenResult[t] == 404 {
+                               // The client provided valid token(s), but the
+                               // collection was not found.
+                               statusCode = http.StatusNotFound
+                               return
+                       }
+               }
+               // The client's token was invalid (e.g., expired), or
+               // the client didn't even provide one.  Propagate the
+               // 401 to encourage the client to use a [different]
+               // token.
+               //
+               // TODO(TC): This response would be confusing to
+               // someone trying (anonymously) to download public
+               // data that has been deleted.  Allow a referrer to
+               // provide this context somehow?
+               w.Header().Add("WWW-Authenticate", "Basic realm=\"collections\"")
+               statusCode = http.StatusUnauthorized
+               return
+       }
+
+       filename := strings.Join(targetPath, "/")
+       kc, err := keepclient.MakeKeepClient(arv)
+       if err != nil {
+               statusCode, statusText = http.StatusInternalServerError, err.Error()
+               return
+       }
+       rdr, err := kc.CollectionFileReader(collection, filename)
+       if os.IsNotExist(err) {
+               statusCode = http.StatusNotFound
+               return
+       } else if err != nil {
+               statusCode, statusText = http.StatusBadGateway, err.Error()
+               return
+       }
+       defer rdr.Close()
+
+       // One or both of these can be -1 if not found:
+       basenamePos := strings.LastIndex(filename, "/")
+       extPos := strings.LastIndex(filename, ".")
+       if extPos > basenamePos {
+               // Now extPos is safely >= 0.
+               if t := mime.TypeByExtension(filename[extPos:]); t != "" {
+                       w.Header().Set("Content-Type", t)
+               }
+       }
+       if rdr, ok := rdr.(keepclient.ReadCloserWithLen); ok {
+               w.Header().Set("Content-Length", fmt.Sprintf("%d", rdr.Len()))
+       }
+       if attachment {
+               w.Header().Set("Content-Disposition", "attachment")
+       }
+
+       w.WriteHeader(http.StatusOK)
+       _, err = io.Copy(w, rdr)
+       if err != nil {
+               statusCode, statusText = http.StatusBadGateway, err.Error()
+       }
+}
diff --git a/services/keep-web/handler_test.go b/services/keep-web/handler_test.go
new file mode 100644 (file)
index 0000000..392de94
--- /dev/null
@@ -0,0 +1,356 @@
+package main
+
+import (
+       "html"
+       "io/ioutil"
+       "net/http"
+       "net/http/httptest"
+       "net/url"
+       "regexp"
+       "strings"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvadostest"
+       "git.curoverse.com/arvados.git/sdk/go/auth"
+       check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&UnitSuite{})
+
+type UnitSuite struct{}
+
+func mustParseURL(s string) *url.URL {
+       r, err := url.Parse(s)
+       if err != nil {
+               panic("parse URL: " + s)
+       }
+       return r
+}
+
+func (s *IntegrationSuite) TestVhost404(c *check.C) {
+       for _, testURL := range []string{
+               arvadostest.NonexistentCollection + ".example.com/theperthcountyconspiracy",
+               arvadostest.NonexistentCollection + ".example.com/t=" + arvadostest.ActiveToken + "/theperthcountyconspiracy",
+       } {
+               resp := httptest.NewRecorder()
+               req := &http.Request{
+                       Method: "GET",
+                       URL:    mustParseURL(testURL),
+               }
+               (&handler{}).ServeHTTP(resp, req)
+               c.Check(resp.Code, check.Equals, http.StatusNotFound)
+               c.Check(resp.Body.String(), check.Equals, "")
+       }
+}
+
+// An authorizer modifies an HTTP request to make use of the given
+// token -- by adding it to a header, cookie, query param, or whatever
+// -- and returns the HTTP status code we should expect from keep-web if
+// the token is invalid.
+type authorizer func(*http.Request, string) int
+
+func (s *IntegrationSuite) TestVhostViaAuthzHeader(c *check.C) {
+       doVhostRequests(c, authzViaAuthzHeader)
+}
+func authzViaAuthzHeader(r *http.Request, tok string) int {
+       r.Header.Add("Authorization", "OAuth2 "+tok)
+       return http.StatusUnauthorized
+}
+
+func (s *IntegrationSuite) TestVhostViaCookieValue(c *check.C) {
+       doVhostRequests(c, authzViaCookieValue)
+}
+func authzViaCookieValue(r *http.Request, tok string) int {
+       r.AddCookie(&http.Cookie{
+               Name:  "arvados_api_token",
+               Value: auth.EncodeTokenCookie([]byte(tok)),
+       })
+       return http.StatusUnauthorized
+}
+
+func (s *IntegrationSuite) TestVhostViaPath(c *check.C) {
+       doVhostRequests(c, authzViaPath)
+}
+func authzViaPath(r *http.Request, tok string) int {
+       r.URL.Path = "/t=" + tok + r.URL.Path
+       return http.StatusNotFound
+}
+
+func (s *IntegrationSuite) TestVhostViaQueryString(c *check.C) {
+       doVhostRequests(c, authzViaQueryString)
+}
+func authzViaQueryString(r *http.Request, tok string) int {
+       r.URL.RawQuery = "api_token=" + tok
+       return http.StatusUnauthorized
+}
+
+func (s *IntegrationSuite) TestVhostViaPOST(c *check.C) {
+       doVhostRequests(c, authzViaPOST)
+}
+func authzViaPOST(r *http.Request, tok string) int {
+       r.Method = "POST"
+       r.Header.Add("Content-Type", "application/x-www-form-urlencoded")
+       r.Body = ioutil.NopCloser(strings.NewReader(
+               url.Values{"api_token": {tok}}.Encode()))
+       return http.StatusUnauthorized
+}
+
+// Try some combinations of {url, token} using the given authorization
+// mechanism, and verify the result is correct.
+func doVhostRequests(c *check.C, authz authorizer) {
+       for _, hostPath := range []string{
+               arvadostest.FooCollection + ".example.com/foo",
+               arvadostest.FooCollection + "--collections.example.com/foo",
+               arvadostest.FooCollection + "--collections.example.com/_/foo",
+               arvadostest.FooPdh + ".example.com/foo",
+               strings.Replace(arvadostest.FooPdh, "+", "-", -1) + "--collections.example.com/foo",
+               arvadostest.FooBarDirCollection + ".example.com/dir1/foo",
+       } {
+               c.Log("doRequests: ", hostPath)
+               doVhostRequestsWithHostPath(c, authz, hostPath)
+       }
+}
+
+func doVhostRequestsWithHostPath(c *check.C, authz authorizer, hostPath string) {
+       for _, tok := range []string{
+               arvadostest.ActiveToken,
+               arvadostest.ActiveToken[:15],
+               arvadostest.SpectatorToken,
+               "bogus",
+               "",
+       } {
+               u := mustParseURL("http://" + hostPath)
+               req := &http.Request{
+                       Method: "GET",
+                       Host:   u.Host,
+                       URL:    u,
+                       Header: http.Header{},
+               }
+               failCode := authz(req, tok)
+               resp := doReq(req)
+               code, body := resp.Code, resp.Body.String()
+               if tok == arvadostest.ActiveToken {
+                       c.Check(code, check.Equals, http.StatusOK)
+                       c.Check(body, check.Equals, "foo")
+               } else {
+                       c.Check(code >= 400, check.Equals, true)
+                       c.Check(code < 500, check.Equals, true)
+                       if tok == arvadostest.SpectatorToken {
+                               // Valid token never offers to retry
+                               // with different credentials.
+                               c.Check(code, check.Equals, http.StatusNotFound)
+                       } else {
+                               // Invalid token can ask to retry
+                               // depending on the authz method.
+                               c.Check(code, check.Equals, failCode)
+                       }
+                       c.Check(body, check.Equals, "")
+               }
+       }
+}
+
+func doReq(req *http.Request) *httptest.ResponseRecorder {
+       resp := httptest.NewRecorder()
+       (&handler{}).ServeHTTP(resp, req)
+       if resp.Code != http.StatusSeeOther {
+               return resp
+       }
+       cookies := (&http.Response{Header: resp.Header()}).Cookies()
+       u, _ := req.URL.Parse(resp.Header().Get("Location"))
+       req = &http.Request{
+               Method: "GET",
+               Host:   u.Host,
+               URL:    u,
+               Header: http.Header{},
+       }
+       for _, c := range cookies {
+               req.AddCookie(c)
+       }
+       return doReq(req)
+}
+
+func (s *IntegrationSuite) TestVhostRedirectQueryTokenToCookie(c *check.C) {
+       s.testVhostRedirectTokenToCookie(c, "GET",
+               arvadostest.FooCollection+".example.com/foo",
+               "?api_token="+arvadostest.ActiveToken,
+               "",
+               "",
+               http.StatusOK,
+               "foo",
+       )
+}
+
+func (s *IntegrationSuite) TestSingleOriginSecretLink(c *check.C) {
+       s.testVhostRedirectTokenToCookie(c, "GET",
+               "example.com/c="+arvadostest.FooCollection+"/t="+arvadostest.ActiveToken+"/foo",
+               "",
+               "",
+               "",
+               http.StatusOK,
+               "foo",
+       )
+}
+
+// Bad token in URL is 404 Not Found because it doesn't make sense to
+// retry the same URL with different authorization.
+func (s *IntegrationSuite) TestSingleOriginSecretLinkBadToken(c *check.C) {
+       s.testVhostRedirectTokenToCookie(c, "GET",
+               "example.com/c="+arvadostest.FooCollection+"/t=bogus/foo",
+               "",
+               "",
+               "",
+               http.StatusNotFound,
+               "",
+       )
+}
+
+// Bad token in a cookie (even if it got there via our own
+// query-string-to-cookie redirect) is, in principle, retryable at the
+// same URL so it's 401 Unauthorized.
+func (s *IntegrationSuite) TestVhostRedirectQueryTokenToBogusCookie(c *check.C) {
+       s.testVhostRedirectTokenToCookie(c, "GET",
+               arvadostest.FooCollection+".example.com/foo",
+               "?api_token=thisisabogustoken",
+               "",
+               "",
+               http.StatusUnauthorized,
+               "",
+       )
+}
+
+func (s *IntegrationSuite) TestVhostRedirectQueryTokenSingleOriginError(c *check.C) {
+       s.testVhostRedirectTokenToCookie(c, "GET",
+               "example.com/c="+arvadostest.FooCollection+"/foo",
+               "?api_token="+arvadostest.ActiveToken,
+               "",
+               "",
+               http.StatusBadRequest,
+               "",
+       )
+}
+
+func (s *IntegrationSuite) TestVhostRedirectQueryTokenTrustAllContent(c *check.C) {
+       defer func(orig bool) {
+               trustAllContent = orig
+       }(trustAllContent)
+       trustAllContent = true
+       s.testVhostRedirectTokenToCookie(c, "GET",
+               "example.com/c="+arvadostest.FooCollection+"/foo",
+               "?api_token="+arvadostest.ActiveToken,
+               "",
+               "",
+               http.StatusOK,
+               "foo",
+       )
+}
+
+func (s *IntegrationSuite) TestVhostRedirectQueryTokenAttachmentOnlyHost(c *check.C) {
+       defer func(orig string) {
+               attachmentOnlyHost = orig
+       }(attachmentOnlyHost)
+       attachmentOnlyHost = "example.com:1234"
+
+       s.testVhostRedirectTokenToCookie(c, "GET",
+               "example.com/c="+arvadostest.FooCollection+"/foo",
+               "?api_token="+arvadostest.ActiveToken,
+               "",
+               "",
+               http.StatusBadRequest,
+               "",
+       )
+
+       resp := s.testVhostRedirectTokenToCookie(c, "GET",
+               "example.com:1234/c="+arvadostest.FooCollection+"/foo",
+               "?api_token="+arvadostest.ActiveToken,
+               "",
+               "",
+               http.StatusOK,
+               "foo",
+       )
+       c.Check(resp.Header().Get("Content-Disposition"), check.Equals, "attachment")
+}
+
+func (s *IntegrationSuite) TestVhostRedirectPOSTFormTokenToCookie(c *check.C) {
+       s.testVhostRedirectTokenToCookie(c, "POST",
+               arvadostest.FooCollection+".example.com/foo",
+               "",
+               "application/x-www-form-urlencoded",
+               url.Values{"api_token": {arvadostest.ActiveToken}}.Encode(),
+               http.StatusOK,
+               "foo",
+       )
+}
+
+func (s *IntegrationSuite) TestVhostRedirectPOSTFormTokenToCookie404(c *check.C) {
+       s.testVhostRedirectTokenToCookie(c, "POST",
+               arvadostest.FooCollection+".example.com/foo",
+               "",
+               "application/x-www-form-urlencoded",
+               url.Values{"api_token": {arvadostest.SpectatorToken}}.Encode(),
+               http.StatusNotFound,
+               "",
+       )
+}
+
+func (s *IntegrationSuite) TestAnonymousTokenOK(c *check.C) {
+       anonymousTokens = []string{arvadostest.AnonymousToken}
+       s.testVhostRedirectTokenToCookie(c, "GET",
+               "example.com/c="+arvadostest.HelloWorldCollection+"/Hello%20world.txt",
+               "",
+               "",
+               "",
+               http.StatusOK,
+               "Hello world\n",
+       )
+}
+
+func (s *IntegrationSuite) TestAnonymousTokenError(c *check.C) {
+       anonymousTokens = []string{"anonymousTokenConfiguredButInvalid"}
+       s.testVhostRedirectTokenToCookie(c, "GET",
+               "example.com/c="+arvadostest.HelloWorldCollection+"/Hello%20world.txt",
+               "",
+               "",
+               "",
+               http.StatusNotFound,
+               "",
+       )
+}
+
+func (s *IntegrationSuite) testVhostRedirectTokenToCookie(c *check.C, method, hostPath, queryString, contentType, reqBody string, expectStatus int, expectRespBody string) *httptest.ResponseRecorder {
+       u, _ := url.Parse(`http://` + hostPath + queryString)
+       req := &http.Request{
+               Method: method,
+               Host:   u.Host,
+               URL:    u,
+               Header: http.Header{"Content-Type": {contentType}},
+               Body:   ioutil.NopCloser(strings.NewReader(reqBody)),
+       }
+
+       resp := httptest.NewRecorder()
+       defer func() {
+               c.Check(resp.Code, check.Equals, expectStatus)
+               c.Check(resp.Body.String(), check.Equals, expectRespBody)
+       }()
+
+       (&handler{}).ServeHTTP(resp, req)
+       if resp.Code != http.StatusSeeOther {
+               return resp
+       }
+       c.Check(resp.Body.String(), check.Matches, `.*href="//`+regexp.QuoteMeta(html.EscapeString(hostPath))+`".*`)
+       cookies := (&http.Response{Header: resp.Header()}).Cookies()
+
+       u, _ = u.Parse(resp.Header().Get("Location"))
+       req = &http.Request{
+               Method: "GET",
+               Host:   u.Host,
+               URL:    u,
+               Header: http.Header{},
+       }
+       for _, c := range cookies {
+               req.AddCookie(c)
+       }
+
+       resp = httptest.NewRecorder()
+       (&handler{}).ServeHTTP(resp, req)
+       c.Check(resp.Header().Get("Location"), check.Equals, "")
+       return resp
+}
diff --git a/services/keep-web/main.go b/services/keep-web/main.go
new file mode 100644 (file)
index 0000000..751543e
--- /dev/null
@@ -0,0 +1,31 @@
+package main
+
+import (
+       "flag"
+       "log"
+       "os"
+)
+
+func init() {
+       // MakeArvadosClient returns an error if this env var isn't
+       // available as a default token (even if we explicitly set a
+       // different token before doing anything with the client). We
+       // set this dummy value during init so it doesn't clobber the
+       // one used by "run test servers".
+       os.Setenv("ARVADOS_API_TOKEN", "xxx")
+}
+
+func main() {
+       flag.Parse()
+       if os.Getenv("ARVADOS_API_HOST") == "" {
+               log.Fatal("ARVADOS_API_HOST environment variable must be set.")
+       }
+       srv := &server{}
+       if err := srv.Start(); err != nil {
+               log.Fatal(err)
+       }
+       log.Println("Listening at", srv.Addr)
+       if err := srv.Wait(); err != nil {
+               log.Fatal(err)
+       }
+}
diff --git a/services/keep-web/server.go b/services/keep-web/server.go
new file mode 100644 (file)
index 0000000..2359f23
--- /dev/null
@@ -0,0 +1,27 @@
+package main
+
+import (
+       "flag"
+       "net/http"
+
+       "git.curoverse.com/arvados.git/sdk/go/httpserver"
+)
+
+var address string
+
+func init() {
+       flag.StringVar(&address, "address", ":80",
+               "Address to listen on: \"host:port\", or \":port\" to listen on all interfaces.")
+}
+
+type server struct {
+       httpserver.Server
+}
+
+func (srv *server) Start() error {
+       mux := http.NewServeMux()
+       mux.Handle("/", &handler{})
+       srv.Handler = mux
+       srv.Addr = address
+       return srv.Server.Start()
+}
diff --git a/services/keep-web/server_test.go b/services/keep-web/server_test.go
new file mode 100644 (file)
index 0000000..8e3a21a
--- /dev/null
@@ -0,0 +1,322 @@
+package main
+
+import (
+       "crypto/md5"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "net"
+       "os/exec"
+       "strings"
+       "testing"
+
+       "git.curoverse.com/arvados.git/sdk/go/arvadosclient"
+       "git.curoverse.com/arvados.git/sdk/go/arvadostest"
+       "git.curoverse.com/arvados.git/sdk/go/keepclient"
+       check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&IntegrationSuite{})
+
+// IntegrationSuite tests need an API server and a keep-web server
+type IntegrationSuite struct {
+       testServer *server
+}
+
+func (s *IntegrationSuite) TestNoToken(c *check.C) {
+       for _, token := range []string{
+               "",
+               "bogustoken",
+       } {
+               hdr, body, _ := s.runCurl(c, token, "collections.example.com", "/collections/"+arvadostest.FooCollection+"/foo")
+               c.Check(hdr, check.Matches, `(?s)HTTP/1.1 404 Not Found\r\n.*`)
+               c.Check(body, check.Equals, "")
+
+               if token != "" {
+                       hdr, body, _ = s.runCurl(c, token, "collections.example.com", "/collections/download/"+arvadostest.FooCollection+"/"+token+"/foo")
+                       c.Check(hdr, check.Matches, `(?s)HTTP/1.1 404 Not Found\r\n.*`)
+                       c.Check(body, check.Equals, "")
+               }
+
+               hdr, body, _ = s.runCurl(c, token, "collections.example.com", "/bad-route")
+               c.Check(hdr, check.Matches, `(?s)HTTP/1.1 404 Not Found\r\n.*`)
+               c.Check(body, check.Equals, "")
+       }
+}
+
+// TODO: Move most cases to functional tests -- at least use Go's own
+// http client instead of forking curl. Just leave enough of an
+// integration test to assure that the documented way of invoking curl
+// really works against the server.
+func (s *IntegrationSuite) Test404(c *check.C) {
+       for _, uri := range []string{
+               // Routing errors (always 404 regardless of what's stored in Keep)
+               "/",
+               "/foo",
+               "/download",
+               "/collections",
+               "/collections/",
+               // Implicit/generated index is not implemented yet;
+               // until then, return 404.
+               "/collections/" + arvadostest.FooCollection,
+               "/collections/" + arvadostest.FooCollection + "/",
+               "/collections/" + arvadostest.FooBarDirCollection + "/dir1",
+               "/collections/" + arvadostest.FooBarDirCollection + "/dir1/",
+               // Non-existent file in collection
+               "/collections/" + arvadostest.FooCollection + "/theperthcountyconspiracy",
+               "/collections/download/" + arvadostest.FooCollection + "/" + arvadostest.ActiveToken + "/theperthcountyconspiracy",
+               // Non-existent collection
+               "/collections/" + arvadostest.NonexistentCollection,
+               "/collections/" + arvadostest.NonexistentCollection + "/",
+               "/collections/" + arvadostest.NonexistentCollection + "/theperthcountyconspiracy",
+               "/collections/download/" + arvadostest.NonexistentCollection + "/" + arvadostest.ActiveToken + "/theperthcountyconspiracy",
+       } {
+               hdr, body, _ := s.runCurl(c, arvadostest.ActiveToken, "collections.example.com", uri)
+               c.Check(hdr, check.Matches, "(?s)HTTP/1.1 404 Not Found\r\n.*")
+               c.Check(body, check.Equals, "")
+       }
+}
+
+func (s *IntegrationSuite) Test1GBFile(c *check.C) {
+       if testing.Short() {
+               c.Skip("skipping 1GB integration test in short mode")
+       }
+       s.test100BlockFile(c, 10000000)
+}
+
+func (s *IntegrationSuite) Test300MBFile(c *check.C) {
+       s.test100BlockFile(c, 3000000)
+}
+
+func (s *IntegrationSuite) test100BlockFile(c *check.C, blocksize int) {
+       testdata := make([]byte, blocksize)
+       for i := 0; i < blocksize; i++ {
+               testdata[i] = byte(' ')
+       }
+       arv, err := arvadosclient.MakeArvadosClient()
+       c.Assert(err, check.Equals, nil)
+       arv.ApiToken = arvadostest.ActiveToken
+       kc, err := keepclient.MakeKeepClient(&arv)
+       c.Assert(err, check.Equals, nil)
+       loc, _, err := kc.PutB(testdata[:])
+       c.Assert(err, check.Equals, nil)
+       mtext := "."
+       for i := 0; i < 100; i++ {
+               mtext = mtext + " " + loc
+       }
+       mtext = mtext + fmt.Sprintf(" 0:%d00:testdata.bin\n", blocksize)
+       coll := map[string]interface{}{}
+       err = arv.Create("collections",
+               map[string]interface{}{
+                       "collection": map[string]interface{}{
+                               "name":          fmt.Sprintf("testdata blocksize=%d", blocksize),
+                               "manifest_text": mtext,
+                       },
+               }, &coll)
+       c.Assert(err, check.Equals, nil)
+       uuid := coll["uuid"].(string)
+
+       hdr, body, size := s.runCurl(c, arv.ApiToken, uuid+".collections.example.com", "/testdata.bin")
+       c.Check(hdr, check.Matches, `(?s)HTTP/1.1 200 OK\r\n.*`)
+       c.Check(hdr, check.Matches, `(?si).*Content-length: `+fmt.Sprintf("%d00", blocksize)+`\r\n.*`)
+       c.Check([]byte(body)[:1234], check.DeepEquals, testdata[:1234])
+       c.Check(size, check.Equals, int64(blocksize)*100)
+}
+
+type curlCase struct {
+       auth    string
+       host    string
+       path    string
+       dataMD5 string
+}
+
+func (s *IntegrationSuite) Test200(c *check.C) {
+       anonymousTokens = []string{arvadostest.AnonymousToken}
+       for _, spec := range []curlCase{
+               // My collection
+               {
+                       auth:    arvadostest.ActiveToken,
+                       host:    arvadostest.FooCollection + "--collections.example.com",
+                       path:    "/foo",
+                       dataMD5: "acbd18db4cc2f85cedef654fccc4a4d8",
+               },
+               {
+                       auth:    arvadostest.ActiveToken,
+                       host:    arvadostest.FooCollection + ".collections.example.com",
+                       path:    "/foo",
+                       dataMD5: "acbd18db4cc2f85cedef654fccc4a4d8",
+               },
+               {
+                       host:    strings.Replace(arvadostest.FooPdh, "+", "-", 1) + ".collections.example.com",
+                       path:    "/t=" + arvadostest.ActiveToken + "/foo",
+                       dataMD5: "acbd18db4cc2f85cedef654fccc4a4d8",
+               },
+               {
+                       path:    "/c=" + arvadostest.FooPdh + "/t=" + arvadostest.ActiveToken + "/foo",
+                       dataMD5: "acbd18db4cc2f85cedef654fccc4a4d8",
+               },
+               {
+                       path:    "/c=" + strings.Replace(arvadostest.FooPdh, "+", "-", 1) + "/t=" + arvadostest.ActiveToken + "/_/foo",
+                       dataMD5: "acbd18db4cc2f85cedef654fccc4a4d8",
+               },
+               {
+                       path:    "/collections/download/" + arvadostest.FooCollection + "/" + arvadostest.ActiveToken + "/foo",
+                       dataMD5: "acbd18db4cc2f85cedef654fccc4a4d8",
+               },
+               {
+                       auth:    "tokensobogus",
+                       path:    "/collections/download/" + arvadostest.FooCollection + "/" + arvadostest.ActiveToken + "/foo",
+                       dataMD5: "acbd18db4cc2f85cedef654fccc4a4d8",
+               },
+               {
+                       auth:    arvadostest.ActiveToken,
+                       path:    "/collections/download/" + arvadostest.FooCollection + "/" + arvadostest.ActiveToken + "/foo",
+                       dataMD5: "acbd18db4cc2f85cedef654fccc4a4d8",
+               },
+               {
+                       auth:    arvadostest.AnonymousToken,
+                       path:    "/collections/download/" + arvadostest.FooCollection + "/" + arvadostest.ActiveToken + "/foo",
+                       dataMD5: "acbd18db4cc2f85cedef654fccc4a4d8",
+               },
+
+               // Anonymously accessible data
+               {
+                       path:    "/c=" + arvadostest.HelloWorldCollection + "/Hello%20world.txt",
+                       dataMD5: "f0ef7081e1539ac00ef5b761b4fb01b3",
+               },
+               {
+                       host:    arvadostest.HelloWorldCollection + ".collections.example.com",
+                       path:    "/Hello%20world.txt",
+                       dataMD5: "f0ef7081e1539ac00ef5b761b4fb01b3",
+               },
+               {
+                       host:    arvadostest.HelloWorldCollection + ".collections.example.com",
+                       path:    "/_/Hello%20world.txt",
+                       dataMD5: "f0ef7081e1539ac00ef5b761b4fb01b3",
+               },
+               {
+                       path:    "/collections/" + arvadostest.HelloWorldCollection + "/Hello%20world.txt",
+                       dataMD5: "f0ef7081e1539ac00ef5b761b4fb01b3",
+               },
+               {
+                       auth:    arvadostest.ActiveToken,
+                       path:    "/collections/" + arvadostest.HelloWorldCollection + "/Hello%20world.txt",
+                       dataMD5: "f0ef7081e1539ac00ef5b761b4fb01b3",
+               },
+               {
+                       auth:    arvadostest.SpectatorToken,
+                       path:    "/collections/" + arvadostest.HelloWorldCollection + "/Hello%20world.txt",
+                       dataMD5: "f0ef7081e1539ac00ef5b761b4fb01b3",
+               },
+               {
+                       auth:    arvadostest.SpectatorToken,
+                       host:    arvadostest.HelloWorldCollection + "--collections.example.com",
+                       path:    "/Hello%20world.txt",
+                       dataMD5: "f0ef7081e1539ac00ef5b761b4fb01b3",
+               },
+               {
+                       auth:    arvadostest.SpectatorToken,
+                       path:    "/collections/download/" + arvadostest.HelloWorldCollection + "/" + arvadostest.SpectatorToken + "/Hello%20world.txt",
+                       dataMD5: "f0ef7081e1539ac00ef5b761b4fb01b3",
+               },
+       } {
+               host := spec.host
+               if host == "" {
+                       host = "collections.example.com"
+               }
+               hdr, body, _ := s.runCurl(c, spec.auth, host, spec.path)
+               c.Check(hdr, check.Matches, `(?s)HTTP/1.1 200 OK\r\n.*`)
+               if strings.HasSuffix(spec.path, ".txt") {
+                       c.Check(hdr, check.Matches, `(?s).*\r\nContent-Type: text/plain.*`)
+                       // TODO: Check some types that aren't
+                       // automatically detected by Go's http server
+                       // by sniffing the content.
+               }
+               c.Check(fmt.Sprintf("%x", md5.Sum([]byte(body))), check.Equals, spec.dataMD5)
+       }
+}
+
+// Return header block and body.
+func (s *IntegrationSuite) runCurl(c *check.C, token, host, uri string, args ...string) (hdr, bodyPart string, bodySize int64) {
+       curlArgs := []string{"--silent", "--show-error", "--include"}
+       testHost, testPort, _ := net.SplitHostPort(s.testServer.Addr)
+       curlArgs = append(curlArgs, "--resolve", host+":"+testPort+":"+testHost)
+       if token != "" {
+               curlArgs = append(curlArgs, "-H", "Authorization: OAuth2 "+token)
+       }
+       curlArgs = append(curlArgs, args...)
+       curlArgs = append(curlArgs, "http://"+host+":"+testPort+uri)
+       c.Log(fmt.Sprintf("curlArgs == %#v", curlArgs))
+       cmd := exec.Command("curl", curlArgs...)
+       stdout, err := cmd.StdoutPipe()
+       c.Assert(err, check.Equals, nil)
+       cmd.Stderr = cmd.Stdout
+       go cmd.Start()
+       buf := make([]byte, 2<<27)
+       n, err := io.ReadFull(stdout, buf)
+       // Discard (but measure size of) anything past 128 MiB.
+       var discarded int64
+       if err == io.ErrUnexpectedEOF {
+               err = nil
+               buf = buf[:n]
+       } else {
+               c.Assert(err, check.Equals, nil)
+               discarded, err = io.Copy(ioutil.Discard, stdout)
+               c.Assert(err, check.Equals, nil)
+       }
+       err = cmd.Wait()
+       // Without "-f", curl exits 0 as long as it gets a valid HTTP
+       // response from the server, even if the response status
+       // indicates that the request failed. In our test suite, we
+       // always expect a valid HTTP response, and we parse the
+       // headers ourselves. If curl exits non-zero, our testing
+       // environment is broken.
+       c.Assert(err, check.Equals, nil)
+       hdrsAndBody := strings.SplitN(string(buf), "\r\n\r\n", 2)
+       c.Assert(len(hdrsAndBody), check.Equals, 2)
+       hdr = hdrsAndBody[0]
+       bodyPart = hdrsAndBody[1]
+       bodySize = int64(len(bodyPart)) + discarded
+       return
+}
+
+func (s *IntegrationSuite) SetUpSuite(c *check.C) {
+       arvadostest.StartAPI()
+       arvadostest.StartKeep()
+
+       arv, err := arvadosclient.MakeArvadosClient()
+       c.Assert(err, check.Equals, nil)
+       arv.ApiToken = arvadostest.ActiveToken
+       kc, err := keepclient.MakeKeepClient(&arv)
+       c.Assert(err, check.Equals, nil)
+       kc.PutB([]byte("Hello world\n"))
+       kc.PutB([]byte("foo"))
+       kc.PutB([]byte("foobar"))
+}
+
+func (s *IntegrationSuite) TearDownSuite(c *check.C) {
+       arvadostest.StopKeep()
+       arvadostest.StopAPI()
+}
+
+func (s *IntegrationSuite) SetUpTest(c *check.C) {
+       arvadostest.ResetEnv()
+       s.testServer = &server{}
+       var err error
+       address = "127.0.0.1:0"
+       err = s.testServer.Start()
+       c.Assert(err, check.Equals, nil)
+}
+
+func (s *IntegrationSuite) TearDownTest(c *check.C) {
+       var err error
+       if s.testServer != nil {
+               err = s.testServer.Close()
+       }
+       c.Check(err, check.Equals, nil)
+}
+
+// Gocheck boilerplate
+func Test(t *testing.T) {
+       check.TestingT(t)
+}
index d3dbeaf89e420d4546d0fedab643d9b15e9849c9..1a1189658d7ba1473aa33036cce60276932cb9b8 100644 (file)
@@ -37,7 +37,7 @@ func main() {
                pidfile          string
        )
 
-       flagset := flag.NewFlagSet("default", flag.ExitOnError)
+       flagset := flag.NewFlagSet("keepproxy", flag.ExitOnError)
 
        flagset.StringVar(
                &listen,
@@ -201,7 +201,7 @@ func GetRemoteAddress(req *http.Request) string {
        return req.RemoteAddr
 }
 
-func CheckAuthorizationHeader(kc keepclient.KeepClient, cache *ApiTokenCache, req *http.Request) (pass bool, tok string) {
+func CheckAuthorizationHeader(kc *keepclient.KeepClient, cache *ApiTokenCache, req *http.Request) (pass bool, tok string) {
        var auth string
        if auth = req.Header.Get("Authorization"); auth == "" {
                return false, ""
@@ -331,7 +331,7 @@ func (this GetBlockHandler) ServeHTTP(resp http.ResponseWriter, req *http.Reques
 
        var pass bool
        var tok string
-       if pass, tok = CheckAuthorizationHeader(kc, this.ApiTokenCache, req); !pass {
+       if pass, tok = CheckAuthorizationHeader(&kc, this.ApiTokenCache, req); !pass {
                status, err = http.StatusForbidden, BadAuthorizationHeader
                return
        }
@@ -438,7 +438,7 @@ func (this PutBlockHandler) ServeHTTP(resp http.ResponseWriter, req *http.Reques
 
        var pass bool
        var tok string
-       if pass, tok = CheckAuthorizationHeader(kc, this.ApiTokenCache, req); !pass {
+       if pass, tok = CheckAuthorizationHeader(&kc, this.ApiTokenCache, req); !pass {
                err = BadAuthorizationHeader
                status = http.StatusForbidden
                return
@@ -521,7 +521,7 @@ func (handler IndexHandler) ServeHTTP(resp http.ResponseWriter, req *http.Reques
 
        kc := *handler.KeepClient
 
-       ok, token := CheckAuthorizationHeader(kc, handler.ApiTokenCache, req)
+       ok, token := CheckAuthorizationHeader(&kc, handler.ApiTokenCache, req)
        if !ok {
                status, err = http.StatusForbidden, BadAuthorizationHeader
                return
index 997163eca42c965f41109af3ad51de22e65c80d3..2c75ec1616e3b404a9547b6e2f969ce9fc1fe9f2 100644 (file)
@@ -121,7 +121,7 @@ func setupProxyService() {
        }
 }
 
-func runProxy(c *C, args []string, port int, bogusClientToken bool) keepclient.KeepClient {
+func runProxy(c *C, args []string, port int, bogusClientToken bool) *keepclient.KeepClient {
        if bogusClientToken {
                os.Setenv("ARVADOS_API_TOKEN", "bogus-token")
        }
@@ -156,7 +156,7 @@ func runProxy(c *C, args []string, port int, bogusClientToken bool) keepclient.K
                go main()
        }
 
-       return kc
+       return &kc
 }
 
 func (s *ServerRequiredSuite) TestPutAskGet(c *C) {