1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
5 // Keep-web provides read/write HTTP (WebDAV) access to files stored
6 // in Keep. It serves public data to anonymous and unauthenticated
7 // clients, and serves private data to clients that supply Arvados API
8 // tokens. It can be installed anywhere with access to Keep services,
9 // typically behind a web proxy that supports TLS.
11 // See http://doc.arvados.org/install/install-keep-web.html.
15 // The default configuration file location is
16 // /etc/arvados/keep-web/keep-web.yml.
18 // Example configuration file
21 // APIHost: "zzzzz.arvadosapi.com:443"
26 // - xxxxxxxxxxxxxxxxxxxx
27 // AttachmentOnlyHost: ""
28 // TrustAllContent: false
30 // Starting the server
32 // Start a server using the default config file
33 // /etc/arvados/keep-web/keep-web.yml:
37 // Start a server using the config file /path/to/keep-web.yml:
39 // keep-web -config /path/to/keep-web.yml
41 // Proxy configuration
43 // Keep-web does not support TLS natively. Typically, it is installed
44 // behind a proxy like nginx.
46 // Here is an example nginx configuration.
49 // upstream keep-web {
50 // server localhost:1234;
54 // server_name collections.example.com *.collections.example.com ~.*--collections.example.com;
55 // ssl_certificate /root/wildcard.example.com.crt;
56 // ssl_certificate_key /root/wildcard.example.com.key;
58 // proxy_pass http://keep-web;
59 // proxy_set_header Host $host;
60 // proxy_set_header X-Forwarded-For $remote_addr;
65 // It is not necessary to run keep-web on the same host as the nginx
66 // proxy. However, TLS is not used between nginx and keep-web, so
67 // intervening networks must be secured by other means.
69 // Anonymous downloads
71 // The "AnonymousTokens" configuration entry is an array of tokens to
72 // use when processing anonymous requests, i.e., whenever a web client
73 // does not supply its own Arvados API token via path, query string,
74 // cookie, or request header.
76 // "AnonymousTokens":["xxxxxxxxxxxxxxxxxxxxxxx"]
78 // See http://doc.arvados.org/install/install-keep-web.html for examples.
82 // The following "same origin" URL patterns are supported for public
83 // collections and collections shared anonymously via secret links
84 // (i.e., collections which can be served by keep-web without making
85 // use of any implicit credentials like cookies). See "Same-origin
88 // http://collections.example.com/c=uuid_or_pdh/path/file.txt
89 // http://collections.example.com/c=uuid_or_pdh/t=TOKEN/path/file.txt
91 // The following "multiple origin" URL patterns are supported for all
94 // http://uuid_or_pdh--collections.example.com/path/file.txt
95 // http://uuid_or_pdh--collections.example.com/t=TOKEN/path/file.txt
97 // In the "multiple origin" form, the string "--" can be replaced with
98 // "." with identical results (assuming the downstream proxy is
99 // configured accordingly). These two are equivalent:
101 // http://uuid_or_pdh--collections.example.com/path/file.txt
102 // http://uuid_or_pdh.collections.example.com/path/file.txt
104 // The first form (with "--" instead of ".") avoids the cost and
105 // effort of deploying a wildcard TLS certificate for
106 // *.collections.example.com at sites that already have a wildcard
107 // certificate for *.example.com. The second form is likely to be
108 // easier to configure, and more efficient to run, on a downstream
111 // In all of the above forms, the "collections.example.com" part can
112 // be anything at all: keep-web itself ignores everything after the
113 // first "." or "--". (Of course, in order for clients to connect at
114 // all, DNS and any relevant proxies must be configured accordingly.)
116 // In all of the above forms, the "uuid_or_pdh" part can be either a
117 // collection UUID or a portable data hash with the "+" character
118 // optionally replaced by "-". (When "uuid_or_pdh" appears in the
119 // domain name, replacing "+" with "-" is mandatory, because "+" is
120 // not a valid character in a domain name.)
122 // In all of the above forms, a top level directory called "_" is
123 // skipped. In cases where the "path/file.txt" part might start with
124 // "t=" or "c=" or "_/", links should be constructed with a leading
125 // "_/" to ensure the top level directory is not interpreted as a
126 // token or collection ID.
128 // Assuming there is a collection with UUID
129 // zzzzz-4zz18-znfnqtbbv4spc3w and portable data hash
130 // 1f4b0bc7583c2a7f9102c395f4ffc5e3+45, the following URLs are
133 // http://zzzzz-4zz18-znfnqtbbv4spc3w.collections.example.com/foo/bar.txt
134 // http://zzzzz-4zz18-znfnqtbbv4spc3w.collections.example.com/_/foo/bar.txt
135 // http://zzzzz-4zz18-znfnqtbbv4spc3w--collections.example.com/_/foo/bar.txt
137 // The following URLs are read-only, but otherwise interchangeable
140 // http://1f4b0bc7583c2a7f9102c395f4ffc5e3-45--foo.example.com/foo/bar.txt
141 // http://1f4b0bc7583c2a7f9102c395f4ffc5e3-45--.invalid/foo/bar.txt
142 // http://collections.example.com/by_id/1f4b0bc7583c2a7f9102c395f4ffc5e3%2B45/foo/bar.txt
143 // http://collections.example.com/by_id/zzzzz-4zz18-znfnqtbbv4spc3w/foo/bar.txt
145 // If the collection is named "MyCollection" and located in a project
146 // called "MyProject" which is in the home project of a user with
147 // username "bob", the following read-only URL is also available
148 // when authenticating as bob:
150 // http://collections.example.com/users/bob/MyProject/MyCollection/foo/bar.txt
152 // An additional form is supported specifically to make it more
153 // convenient to maintain support for existing Workbench download
156 // http://collections.example.com/collections/download/uuid_or_pdh/TOKEN/foo/bar.txt
158 // A regular Workbench "download" link is also accepted, but
159 // credentials passed via cookie, header, etc. are ignored. Only
160 // public data can be served this way:
162 // http://collections.example.com/collections/uuid_or_pdh/foo/bar.txt
164 // Collections can also be accessed (read-only) via "/by_id/X" where X
165 // is a UUID or portable data hash.
167 // Authorization mechanisms
169 // A token can be provided in an Authorization header:
171 // Authorization: OAuth2 o07j4px7RlJK4CuMYp7C0LDT4CzR1J1qBE5Avo7eCcUjOTikxK
173 // A base64-encoded token can be provided in a cookie named "api_token":
175 // Cookie: api_token=bzA3ajRweDdSbEpLNEN1TVlwN0MwTERUNEN6UjFKMXFCRTVBdm83ZUNjVWpPVGlreEs=
177 // A token can be provided in a URL-encoded query string:
179 // GET /foo/bar.txt?api_token=o07j4px7RlJK4CuMYp7C0LDT4CzR1J1qBE5Avo7eCcUjOTikxK
181 // A suitably encoded token can be provided in a POST body if the
182 // request has a content type of application/x-www-form-urlencoded or
183 // multipart/form-data:
186 // Content-Type: application/x-www-form-urlencoded
188 // api_token=o07j4px7RlJK4CuMYp7C0LDT4CzR1J1qBE5Avo7eCcUjOTikxK
190 // If a token is provided in a query string or in a POST request, the
191 // response is an HTTP 303 redirect to an equivalent GET request, with
192 // the token stripped from the query string and added to a cookie
197 // Keep-web returns a generic HTML index listing when a directory is
198 // requested with the GET method. It does not serve a default file
199 // like "index.html". Directory listings are also returned for WebDAV
200 // PROPFIND requests.
204 // Client-provided authorization tokens are ignored if the client does
205 // not provide a Host header.
207 // In order to use the query string or POST form authorization
208 // mechanisms, the client must follow 303 redirects; the client must
209 // accept cookies with a 303 response and send those cookies when
210 // performing the redirect; and either the client or an intervening
211 // proxy must resolve a relative URL ("//host/path") if given in a
212 // response Location header.
216 // Normally, Keep-web accepts requests for multiple collections using
217 // the same host name, provided the client's credentials are not being
218 // used. This provides insufficient XSS protection in an installation
219 // where the "anonymously accessible" data is not truly public, but
220 // merely protected by network topology.
222 // In such cases -- for example, a site which is not reachable from
223 // the internet, where some data is world-readable from Arvados's
224 // perspective but is intended to be available only to users within
225 // the local network -- the downstream proxy should be configured to
226 // return 401 for all paths beginning with "/c=".
230 // Without the same-origin protection outlined above, a web page
231 // stored in collection X could execute JavaScript code that uses the
232 // current viewer's credentials to download additional data from
233 // collection Y -- data which is accessible to the current viewer, but
234 // not to the author of collection X -- from the same origin
235 // (``https://collections.example.com/'') and upload it to some other
236 // site chosen by the author of collection X.
238 // Attachment-Only host
240 // It is possible to serve untrusted content and accept user
241 // credentials at the same origin as long as the content is only
242 // downloaded, never executed by browsers. A single origin (hostname
243 // and port) can be designated as an "attachment-only" origin: cookies
244 // will be accepted and all responses will have a
245 // "Content-Disposition: attachment" header. This behavior is invoked
246 // only when the designated origin matches exactly the Host header
247 // provided by the client or downstream proxy.
249 // "AttachmentOnlyHost":"domain.example:9999"
251 // Trust All Content mode
253 // In TrustAllContent mode, Keep-web will accept credentials (API
254 // tokens) and serve any collection X at
255 // "https://collections.example.com/c=X/path/file.ext". This is
256 // UNSAFE except in the special case where everyone who is able to write
257 // ANY data to Keep, and every JavaScript and HTML file written to
258 // Keep, is also trusted to read ALL of the data in Keep.
260 // In such cases you can enable trust-all-content mode.
262 // "TrustAllContent":true
264 // When TrustAllContent is enabled, the only effect of the
265 // AttachmentOnlyHost flag is to add a "Content-Disposition:
266 // attachment" header.
268 // "AttachmentOnlyHost":"domain.example:9999",
269 // "TrustAllContent":true
271 // Depending on your site configuration, you might also want to enable
272 // the "trust all content" setting in Workbench. Normally, Workbench
273 // avoids redirecting requests to keep-web if they depend on
274 // TrustAllContent being enabled.
278 // Keep-web exposes request metrics in Prometheus text-based format at
279 // /metrics. The same information is also available as JSON at