SPDX-License-Identifier: CC-BY-SA-3.0
{% endcomment %}
-Note that each volume has a UUID, like @zzzzz-nyw5e-0123456789abcde@. You assign these manually: replace @zzzzz@ with your cluster ID, and replace @0123456789abcde@ with an arbitrary string of 15 alphanumerics. Once assigned, UUIDs should not be changed.
+Note that each volume has a UUID, like @zzzzz-nyw5e-0123456789abcde@. You assign these manually: replace @zzzzz@ with your Cluster ID, and replace @0123456789abcde@ with an arbitrary unique string of 15 alphanumerics. Once assigned, UUIDs should not be changed.
+
+Essential configuration values are highlighted in <span class="userinput">red</span>. Remaining parameters are provided for documentation, with their default values.
\ No newline at end of file
{% include 'assign_volume_uuid' %}
-<notextile><pre><code>Clusters:
- <span class="userinput">uuid_prefix</span>:
- Volumes:
- <span class="userinput">uuid_prefix</span>-nyw5e-<span class="userinput">000000000000000</span>:
+<notextile><pre><code> Volumes:
+ <span class="userinput">ClusterID</span>-nyw5e-<span class="userinput">000000000000000</span>:
AccessViaHosts:
# This section determines which keepstore servers access the
# volume. In this example, keep0 has read/write access, and
"http://<span class="userinput">keep0.ClusterID.example.com</span>:25107/": {}
"http://<span class="userinput">keep1.ClusterID.example.com</span>:25107/": {ReadOnly: true}
- Driver: Azure
+ Driver: <span class="userinput">Azure</span>
DriverParameters:
# Storage account name and secret key, used for
# authentication.
- StorageAccountName: exampleStorageAccountName
- StorageAccountKey: zzzzzzzzzzzzzzzzzzzzzzzzzz
+ StorageAccountName: <span class="userinput">exampleStorageAccountName</span>
+ StorageAccountKey: <span class="userinput">zzzzzzzzzzzzzzzzzzzzzzzzzz</span>
+
+ # Storage container name.
+ ContainerName: <span class="userinput">exampleContainerName</span>
# The cloud environment to use,
# e.g. "core.chinacloudapi.cn". Defaults to
# "core.windows.net" if blank or omitted.
StorageBaseURL: ""
- # Storage container name.
- ContainerName: exampleContainerName
-
# Time to wait for an upstream response before failing the
# request.
RequestTimeout: 10m
{% include 'assign_volume_uuid' %}
-Note that each volume has an AccessViaHosts section indicating that (for example) keep0's /mnt/local-disk directory is volume 0, while keep1's /mnt/local-disk directory is volume 1.
+Note that each volume entry has an @AccessViaHosts@ section indicating which Keepstore instance(s) will serve that volume. In this example, keep0 and keep1 each have their own data disk. The @/mnt/local-disk@ directory on keep0 is volume @ClusterID-nyw5e-000000000000000@, and the @/mnt/local-disk@ directory on keep1 is volume @ClusterID-nyw5e-000000000000001@.
<notextile>
-<pre><code>Clusters:
- <span class="userinput">uuid_prefix</span>:
- Volumes:
- <span class="userinput">uuid_prefix</span>-nyw5e-<span class="userinput">000000000000000</span>:
+<pre><code> Volumes:
+ <span class="userinput">ClusterID</span>-nyw5e-<span class="userinput">000000000000000</span>:
AccessViaHosts:
- "http://<span class="userinput">keep0.uuid_prefix.example.com</span>:25107": {}
- Driver: Directory
+ "http://<span class="userinput">keep0.ClusterID.example.com</span>:25107": {}
+ Driver: <span class="userinput">Directory</span>
DriverParameters:
# The directory that will be used as the backing store.
- Root: /mnt/local-disk
-
- # When true, read and write operations (for whole 64MiB
- # blocks) on an individual volume will queued and issued
- # serially. When false, read and write operations will be
- # issued concurrently.
- #
- # May improve throughput if you experience contention when
- # there are multiple requests to the same volume.
- #
- # When using SSDs, RAID, or a shared network filesystem, you
- # probably don't want this.
- Serialize: false
+ Root: <span class="userinput">/mnt/local-disk</span>
# How much replication is performed by the underlying
# filesystem. (for example, a network filesystem may provide
# reads.
ReadOnly: false
- # Storage classes to associate with this volume. See "Storage
- # classes" in the "Admin" section of doc.arvados.org.
+ # <a href="{{site.baseurl}}/admin/storage-classes.html">Storage classes</a> to associate with this volume.
StorageClasses: null
- <span class="userinput">uuid_prefix</span>-nyw5e-<span class="userinput">000000000000001</span>:
+ <span class="userinput">ClusterID</span>-nyw5e-<span class="userinput">000000000000001</span>:
AccessViaHosts:
- "http://keep1.<span class="userinput">uuid_prefix</span>.example.com:25107": {}
- Driver: Directory
+ "http://keep1.<span class="userinput">ClusterID</span>.example.com:25107": {}
+ Driver: <span class="userinput">Directory</span>
DriverParameters:
- Root: /mnt/local-disk
+ Root: <span class="userinput">/mnt/local-disk</span>
</code></pre></notextile>
-In the case of a network-attached filesystem, the AccessViaHosts section can have multiple entries. If the filesystem is accessible by all keepstore servers, the AccessViaHosts section can be empty, or omitted entirely.
+In the case of a network-attached filesystem, the @AccessViaHosts@ section can have multiple entries. If the filesystem is accessible by all keepstore servers, the @AccessViaHosts@ section can be empty, or omitted entirely. In this example, the underlying storage system performs replication, so specifying @Replication: 2@ means a block is considered to be stored twice for the purposes of data integrity, while only stored on a single volume from the perspective of Keep.
<notextile>
-<pre><code>Clusters:
- <span class="userinput">uuid_prefix</span>:
- Volumes:
- <span class="userinput">uuid_prefix</span>-nyw5e-<span class="userinput">000000000000002</span>:
+<pre><code> Volumes:
+ <span class="userinput">ClusterID</span>-nyw5e-<span class="userinput">000000000000002</span>:
AccessViaHosts:
# This section determines which keepstore servers access the
# volume. In this example, keep0 has read/write access, and
# If the AccessViaHosts section is empty or omitted, all
# keepstore servers will have read/write access to the
# volume.
- "http://<span class="userinput">keep0.uuid_prefix.example.com</span>:25107/": {}
- "http://<span class="userinput">keep1.uuid_prefix.example.com</span>:25107/": {ReadOnly: true}
- Driver: Directory
+ "http://<span class="userinput">keep0.ClusterID.example.com</span>:25107/": {}
+ "http://<span class="userinput">keep1.ClusterID.example.com</span>:25107/": {ReadOnly: true}
+ Driver: <span class="userinput">Directory</span>
DriverParameters:
- Root: /mnt/network-attached-filesystem
+ Root: <span class="userinput">/mnt/network-attached-filesystem</span>
Replication: 2
</code></pre></notextile>
{% include 'assign_volume_uuid' %}
-<notextile><pre><code>Clusters:
- <span class="userinput">uuid_prefix</span>:
- Volumes:
- <span class="userinput">uuid_prefix</span>-nyw5e-<span class="userinput">000000000000000</span>:
+<notextile><pre><code> Volumes:
+ <span class="userinput">ClusterID</span>-nyw5e-<span class="userinput">000000000000000</span>:
AccessViaHosts:
# This section determines which keepstore servers access the
# volume. In this example, keep0 has read/write access, and
"http://<span class="userinput">keep0.ClusterID.example.com</span>:25107/": {}
"http://<span class="userinput">keep1.ClusterID.example.com</span>:25107/": {ReadOnly: true}
- Driver: S3
+ Driver: <span class="userinput">S3</span>
DriverParameters:
+ # Bucket name.
+ Bucket: <span class="userinput">example-bucket-name</span>
+
# IAM role name to use when retrieving credentials from
# instance metadata. It can be omitted, in which case the
# role name itself will be retrieved from instance metadata
# -- but setting it explicitly may protect you from using
# the wrong credentials in the event of an
# installation/configuration error.
- IAMRole: ""
+ IAMRole: <span class="userinput">""</span>
# If you are not using an IAM role for authentication,
# specify access credentials here instead.
- AccessKey: ""
- SecretKey: ""
+ AccessKey: <span class="userinput">""</span>
+ SecretKey: <span class="userinput">""</span>
+
+ # Storage provider region. For Google Cloud Storage, use ""
+ # or omit.
+ Region: <span class="userinput">us-east-1a</span>
# Storage provider endpoint. For Amazon S3, use "" or
# omit. For Google Cloud Storage, use
# "https://storage.googleapis.com".
Endpoint: ""
- # Storage provider region. For Google Cloud Storage, use ""
- # or omit.
- Region: us-east-1a
-
# Change to true if the region requires a LocationConstraint
# declaration.
LocationConstraint: false
- # Bucket name.
- Bucket: example-bucket-name
-
# Requested page size for "list bucket contents" requests.
IndexPageSize: 1000
# Maximum eventual consistency latency
RaceWindow: 24h
- # Enable deletion (garbage collection) even when the
- # configured BlobTrashLifetime is zero. WARNING: eventual
- # consistency may result in race conditions that can cause
- # data loss. Do not enable this unless you understand and
- # accept the risk.
- UnsafeDelete: false
-
# How much replication is provided by the underlying bucket.
# This is used to inform replication decisions at the Keep
# layer.
SPDX-License-Identifier: CC-BY-SA-3.0
{% endcomment %}
+# "Introduction":#introduction
+# "Configure DNS":#dns
+# "Update config.yml":#update-config
+# "Update nginx configuration":#update-nginx
+# "Install keep-web package":#install-packages
+# "Start the service":#start-service
+# "Restart the API server and controller":#restart-api
+# "Confirm working installation":#confirm-working
+
+h2(#introduction). Introduction
+
The Keep-web server provides read/write HTTP (WebDAV) access to files stored in Keep. It serves public data to unauthenticated clients, and serves private data to clients that supply Arvados API tokens. It can be installed anywhere with access to Keep services, typically behind a web proxy that provides TLS support. See the "godoc page":http://godoc.org/github.com/curoverse/arvados/services/keep-web for more detail.
By convention, we use the following hostnames for the Keep-web service:
The above hostnames should resolve from anywhere on the internet.
-h2. Install Keep-web
+h2(#dns). Configure DNS
-Typically Keep-web runs on the same host as Keepproxy.
+It is important to properly configure the download service to mitigate cross-site-scripting (XSS) attacks. An HTML page can be stored in a collection. If an attacker causes a victim to visit that page through Workbench, it will be rendered by the browser. If all collections are served at the same domain, the browser will consider collections as coming from the same origin and having access to the same browsing data, enabling malicious JavaScript on that page to access Arvados on behalf of the victim.
-On Debian-based systems:
+Browser security is based on domain names. We recommend having a separate domain for each collection, so that the browser treats each collection as a distinct origin.
-<notextile>
-<pre><code>~$ <span class="userinput">sudo apt-get install keep-web</span>
-</code></pre>
-</notextile>
-On Red Hat-based systems:
+
+ # Serve preview links using uuid or pdh in subdomain
+ # (requires wildcard DNS and TLS certificate)
+ # https://*.collections.ClusterID.example.com
+ #
+ # Serve preview links using uuid or pdh in main domain
+ # (requires wildcard DNS and TLS certificate)
+ # https://*--collections.ClusterID.example.com
+
+
+If wildcard domains are not available, all collections are served from a single domain, limiting preview to circumstances where the collection is not accessed with the user's regular full-access token.
+
+ # Serve preview links by setting uuid or pdh in the path.
+ # This configuration only allows previews of public data or
+ # collection-sharing links, because these use the anonymous
+ # user token or the token is already embedded in the URL.
+ # Other data must be handled as downloads via WebDAVDownload:
+ # https://collections.ClusterID.example.com
+
+The configuration option @Services.WebDAV.ExternalURL@ is the base URL for Workbench inline preview. If blank, use WebDAVDownload instead, and disable inline preview. If both are empty, downloading collections from workbench will be impossible.
+
+
+
+
+ # Base URL for download links. If blank, serve links to WebDAV
+ # with disposition=attachment query param. Unlike preview links,
+ # browsers do not render attachments, so there is no risk of XSS.
+ #
+ # If WebDAVDownload is blank, and WebDAV uses a
+ # single-origin form, then Workbench will show an error page
+ #
+ # Serve download links by setting uuid or pdh in the path:
+ # https://download.ClusterID.example.com
+
+
+Configure your DNS servers so the following names resolve to your keep-web server's public IP address.
+* @download.ClusterID.example.com@
+* @collections.ClusterID.example.com@
+* @*--collections.ClusterID.example.com@, if you have a wildcard TLS certificate valid for @*.ClusterID.example.com@ and your DNS server allows this without interfering with other DNS names.
+* @*.collections.ClusterID.example.com@, if you have a wildcard TLS certificate valid for these names.
+
+If neither of the above wildcard options is feasible, you have two choices:
+# Serve web content at @collections.ClusterID.example.com@, but only for unauthenticated requests (public data and collection sharing links). Authenticated requests will always result in file downloads, using the @download@ name. For example, the Workbench "preview" button and the "view entire log file" link will invoke file downloads instead of displaying content in the browser window.
+# In the special case where you know you are immune to XSS exploits, you can enable the "trust all content" mode in Keep-web and Workbench (setting @Collections.TrustAllContent: true@ on the config file). With this enabled, inline web content can be served from a single @collections@ host name; no wildcard DNS or certificate is needed. Do not do this without understanding the security implications described in the "Keep-web documentation":http://godoc.org/github.com/curoverse/arvados/services/keep-web.
+
+h2(#update-config). Update config.yml
+
+{% assign railscmd = "bundle exec ./script/get_anonymous_user_token.rb --get" %}
+{% assign railsout = "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz" %}
+If you intend to use Keep-web to serve public data to anonymous clients, configure it with an anonymous token. Use the following command on the <strong>API server</strong> to create an anonymous user token. {% include 'install_rails_command' %}
+
+Update @Services.WebDAV@ and @Services.WebDAVDownload@ in config.yml:
<notextile>
-<pre><code>~$ <span class="userinput">sudo yum install keep-web</span>
+<pre><code> Services:
+ WebDAV:
+ InternalURLs:
+ <span class="userinput">"http://collections.<span class="userinput">ClusterID</span>.example.com:9002/": {}</span>
+ ExternalURL: "https://collections.<span class="userinput">ClusterID</span>.example.com"
+ WebDAVDownload:
+ InternalURLs:
+ <span class="userinput">"http://download.<span class="userinput">ClusterID</span>.example.com:9002/": {}</span>
+ ExternalURL: "https://download.<span class="userinput">ClusterID</span>.example.com"
+ Users:
+ AnonymousUserToken: "{{railsout}}"
+ Collections:
+ TrustAllContent: false
</code></pre>
</notextile>
-Verify that @Keep-web@ is functional:
+Set @Users.AnonymousUserToken: ""@ (empty string) or leave it out if you do not want to serve public data.
+
+Workbench has features like "download file from collection" and "show image" which work better if the content is served by Keep-web rather than Workbench itself. We recommend using the two different hostnames ("download" and "collections" above) for file downloads and inline content respectively.
+
+Additionally, one of the following entries on your cluster configuration file (depending on your DNS setup) tells Workbench which URL will be used to serve user content that can be displayed in the browser, like image previews and static HTML pages.
<notextile>
-<pre><code>~$ <span class="userinput">keep-web -h</span>
-Usage of keep-web:
- -config file
- Site configuration file (default may be overridden by setting an ARVADOS_CONFIG environment variable) (default "/etc/arvados/config.yml")
- -dump-config
- write current configuration to stdout and exit
-[...]
- -version
- print version information and exit.
+<pre><code>Clusters:
+ <span class="userinput">ClusterID</span>:
+ Services:
+ WebDAV:
+ ExternalURL: "https://*--collections.<span class="userinput">ClusterID</span>.example.com"
+ ExternalURL: "https://*.collections.<span class="userinput">ClusterID</span>.example.com"
+ ExternalURL: "https://collections.<span class="userinput">ClusterID</span>.example.com"
</code></pre>
</notextile>
-h3. Set up a reverse proxy with TLS support
+h2(#update-nginx). Update nginx configuration
The Keep-web service will be accessible from anywhere on the internet, so we recommend using TLS for transport encryption.
Note: A wildcard TLS certificate is required in order to support a full-featured secure Keep-web service. Without it, Keep-web can offer file downloads for all Keep data; however, in order to avoid cross-site scripting vulnerabilities, Keep-web refuses to serve private data as web content except when it is accessed using a "secret link" share. With a wildcard TLS certificate and DNS configured appropriately, all data can be served as web content.
-For example, using Nginx:
+Use a text editor to create a new file @/etc/nginx/conf.d/keep-web.conf@ with the following configuration. Options that need attention are marked with “TODO”.
<notextile><pre>
upstream keep-web {
}
server {
- listen <span class="userinput">[your public IP address]</span>:443 ssl;
- server_name download.<span class="userinput">uuid_prefix</span>.your.domain
- collections.<span class="userinput">uuid_prefix</span>.your.domain
- *.collections.<span class="userinput">uuid_prefix</span>.your.domain
- ~.*--collections.<span class="userinput">uuid_prefix</span>.your.domain;
+ listen <span class="userinput">[TODO: your public IP address]</span>:443 ssl;
+ server_name download.<span class="userinput">ClusterID</span>.example.com
+ collections.<span class="userinput">ClusterID</span>.example.com
+ *.collections.<span class="userinput">ClusterID</span>.example.com
+ ~.*--collections.<span class="userinput">ClusterID</span>.example.com;
proxy_connect_timeout 90s;
proxy_read_timeout 300s;
If you restrict access to your Arvados services based on network topology -- for example, your proxy server is not reachable from the public internet -- additional proxy configuration might be needed to thwart cross-site scripting attacks that would circumvent your restrictions. Read the "'Intranet mode' section of the Keep-web documentation":https://godoc.org/github.com/curoverse/arvados/services/keep-web#hdr-Intranet_mode now.
{% include 'notebox_end' %}
-h3(#dns). Configure DNS
-
-Configure your DNS servers so the following names resolve to your Nginx proxy's public IP address.
-* @download.uuid_prefix.your.domain@
-* @collections.uuid_prefix.your.domain@
-* @*--collections.uuid_prefix.your.domain@, if you have a wildcard TLS certificate valid for @*.uuid_prefix.your.domain@ and your DNS server allows this without interfering with other DNS names.
-* @*.collections.uuid_prefix.your.domain@, if you have a wildcard TLS certificate valid for these names.
-
-If neither of the above wildcard options is feasible, you have two choices:
-# Serve web content at @collections.uuid_prefix.your.domain@, but only for unauthenticated requests (public data and collection sharing links). Authenticated requests will always result in file downloads, using the @download@ name. For example, the Workbench "preview" button and the "view entire log file" link will invoke file downloads instead of displaying content in the browser window.
-# In the special case where you know you are immune to XSS exploits, you can enable the "trust all content" mode in Keep-web and Workbench (setting @Collections.TrustAllContent: true@ on the config file). With this enabled, inline web content can be served from a single @collections@ host name; no wildcard DNS or certificate is needed. Do not do this without understanding the security implications described in the "Keep-web documentation":http://godoc.org/github.com/curoverse/arvados/services/keep-web.
-
-h2. Configure Keep-web
-
-{% assign railscmd = "bundle exec ./script/get_anonymous_user_token.rb --get" %}
-{% assign railsout = "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz" %}
-If you intend to use Keep-web to serve public data to anonymous clients, configure it with an anonymous token. You can use the same one you used when you set up your Keepproxy server, or use the following command on the <strong>API server</strong> to create another. {% include 'install_rails_command' %}
-
-Set the cluster config file like the following:
-
-<notextile>
-<pre><code>Clusters:
- <span class="userinput">uuid_prefix</span>:
- Services:
- Controller:
- ExternalURL: "https://<span class="userinput">uuid_prefix</span>.your.domain"
- WebDAV:
- InternalURLs:
- "http://keep_web_hostname_goes_here:9002/": {}
- ExternalURL: "https://collections.<span class="userinput">uuid_prefix</span>.your.domain"
- WebDAVDownload:
- InternalURLs:
- "http://keep_web_hostname_goes_here:9002/": {}
- ExternalURL: "https://download.<span class="userinput">uuid_prefix</span>.your.domain"
- Users:
- AnonymousUserToken: "{{railsout}}"
- Collections:
- TrustAllContent: false
- TLS:
- Insecure: false
-</code></pre>
-</notextile>
+h2(#install-packages). Install Keep-web package
-Set @Users.AnonymousUserToken: ""@ (empty string) if you do not want to serve public data.
-
-Set @TLS.Insecure: true@ if your API server's TLS certificate is not signed by a recognized CA.
-
-Workbench has features like "download file from collection" and "show image" which work better if the content is served by Keep-web rather than Workbench itself. We recommend using the two different hostnames ("download" and "collections" above) for file downloads and inline content respectively.
+Typically Keep-web runs on the same host as Keepproxy.
-The following entry on your cluster configuration file (@/etc/arvados/config.yml@) details the URL that will be used for file downloads.
+h3. CentOS 7
<notextile>
-<pre><code>Clusters:
- <span class="userinput">uuid_prefix</span>:
- Services:
- WebDAVDownload:
- ExternalURL: "https://download.<span class="userinput">uuid_prefix</span>.your.domain"
+<pre><code># <span class="userinput">yum install keep-web</span>
</code></pre>
</notextile>
-Additionally, one of the following entries on your cluster configuration file (depending on your DNS setup) tells Workbench which URL will be used to serve user content that can be displayed in the browser, like image previews and static HTML pages.
+h3. Debian and Ubuntu
<notextile>
-<pre><code>Clusters:
- <span class="userinput">uuid_prefix</span>:
- Services:
- WebDAV:
- ExternalURL: "https://*--collections.<span class="userinput">uuid_prefix</span>.your.domain"
- ExternalURL: "https://*.collections.<span class="userinput">uuid_prefix</span>.your.domain"
- ExternalURL: "https://collections.<span class="userinput">uuid_prefix</span>.your.domain"
+<pre><code># <span class="userinput">apt-get install keep-web</span>
</code></pre>
</notextile>
-h2. Run Keep-web
-
-h3. Start the service (option 1: systemd)
+h2(#start-service). Start the service
If your system does not use systemd, skip this section and follow the "runit instructions":#runit instead.
[...]
</code></pre>
</notextile>
-
-h3(#runit). Start the service (option 2: runit)
-
-Install runit to supervise the Keep-web daemon. {% include 'install_runit' %}
-
-The basic command to start Keep-web in the service run script is:
-
-<notextile>
-<pre><code>exec keep-web
-</code></pre>
-</notextile>
-
SPDX-License-Identifier: CC-BY-SA-3.0
{% endcomment %}
+# "Introduction":#introduction
+# "Update config.yml":#update-config
+# "Update nginx configuration":#update-nginx
+# "Install keepproxy package":#install-packages
+# "Start the service":#start-service
+# "Restart the API server and controller":#restart-api
+# "Confirm working installation":#confirm-working
+
+h2(#introduction). Introduction
+
The Keepproxy server is a gateway into your Keep storage. Unlike the Keepstore servers, which are only accessible on the local LAN, Keepproxy is suitable for clients located elsewhere on the internet. Specifically, in contrast to Keepstore:
-* A client writing through Keepproxy generates less network traffic: the client sends a single copy of a data block, and Keepproxy sends copies to the appropriate Keepstore servers.
+* A client writing through Keepproxy sends a single copy of a data block, and Keepproxy distributes copies to the appropriate Keepstore servers.
* A client can write through Keepproxy without precomputing content hashes. Notably, the browser-based upload feature in Workbench requires Keepproxy.
* Keepproxy checks API token validity before processing requests. (Clients that can connect directly to Keepstore can use it as scratch space even without a valid API token.)
<div class="offset1">
table(table table-bordered table-condensed).
-|_Hostname_|
+|_. Hostname|
|keep.@ClusterID@.example.com|
</div>
This hostname should resolve from anywhere on the internet.
-h2. Install Keepproxy
-
-On Debian-based systems:
-
-<notextile>
-<pre><code>~$ <span class="userinput">sudo apt-get install keepproxy</span>
-</code></pre>
-</notextile>
-
-On Red Hat-based systems:
-
-<notextile>
-<pre><code>~$ <span class="userinput">sudo yum install keepproxy</span>
-</code></pre>
-</notextile>
-
-Verify that Keepproxy is functional:
-
-<notextile>
-<pre><code>~$ <span class="userinput">keepproxy -h</span>
-Usage of keepproxy:
- -config file
- Site configuration file (default may be overridden by setting an ARVADOS_CONFIG environment variable) (default "/etc/arvados/config.yml")
- -dump-config
- write current configuration to stdout and exit
-[...]
- -version
- print version information and exit.
-</code></pre>
-</notextile>
-
-h3. Update the cluster config
+h2(#update-config). Update config.yml
-Edit the cluster config at @/etc/arvados/config.yml@ and set @Services.Keepproxy.ExternalURL@ and @Services.Keepproxy.InternalURLs@. Replace @uuid_prefix@ with your cluster id.
+Edit the cluster config at @/etc/arvados/config.yml@ and set @Services.Keepproxy.ExternalURL@ and @Services.Keepproxy.InternalURLs@.
<notextile>
-<pre><code>Clusters:
- <span class="userinput">uuid_prefix</span>:
- Services:
+<pre><code> Services:
Keepproxy:
- ExternalURL: <span class="userinput">https://keep.uuid_prefix.your.domain</span>
+ ExternalURL: <span class="userinput">https://keep.ClusterID.example.com</span>
InternalURLs:
- <span class="userinput">"http://localhost:25107": {}</span>
+ <span class="userinput">"http://keep.ClusterID.example.com:25107": {}</span>
</code></pre>
</notextile>
-h3. Set up a reverse proxy with SSL support
+h2(#update-nginx). Update Nginx configuration
-Because the Keepproxy is intended for access from anywhere on the internet, it is recommended to use SSL for transport encryption.
+Put a reverse proxy with SSL support in front of Keepproxy. Keepproxy itself runs on port 25107 by default; the reverse proxy runs on port 443 and forwards requests to Keepproxy on port 25107.
-This is best achieved by putting a reverse proxy with SSL support in front of Keepproxy. Keepproxy itself runs on port 25107 by default; your reverse proxy can run on port 443 and pass requests to Keepproxy on port 25107.
+Use a text editor to create a new file @/etc/nginx/conf.d/keepproxy.conf@ with the following configuration. Options that need attention are marked with “TODO”.
-<notextile><pre>
-upstream keepproxy {
+<notextile><pre><code>upstream keepproxy {
server 127.0.0.1:<span class="userinput">25107</span>;
}
server {
- listen <span class="userinput">[your public IP address]</span>:443 ssl;
- server_name keep.<span class="userinput">uuid_prefix</span>.your.domain;
+ listen <span class="userinput">[TODO: your public IP address]</span>:443 ssl;
+ server_name keep.<span class="userinput">ClusterID</span>.example.com;
proxy_connect_timeout 90s;
proxy_read_timeout 300s;
proxy_http_version 1.1;
proxy_request_buffering off;
- ssl on;
- ssl_certificate /etc/nginx/keep.<span class="userinput">uuid_prefix</span>.your.domain-ssl.crt;
- ssl_certificate_key /etc/nginx/keep.<span class="userinput">uuid_prefix</span>.your.domain-ssl.key;
+ ssl on;
+ ssl_certificate <span class="userinput">/TODO/YOUR/PATH/TO/cert.pem</span>;
+ ssl_certificate_key <span class="userinput">/TODO/YOUR/PATH/TO/cert.key</span>;
# Clients need to be able to upload blocks of data up to 64MiB in size.
client_max_body_size 64m;
proxy_pass http://keepproxy;
}
}
-</pre></notextile>
+</code></pre></notextile>
Note: if the Web uploader is failing to upload data and there are no logs from keepproxy, be sure to check the nginx proxy logs. In addition to "GET" and "PUT", the nginx proxy must pass "OPTIONS" requests to keepproxy, which should respond with appropriate Cross-origin resource sharing headers. If the CORS headers are not present, browser security policy will cause the upload request to silently fail. The CORS headers are generated by keepproxy and should not be set in nginx.
-h3. Tell the API server about the Keepproxy server
-
-The API server needs to be informed about the presence of your Keepproxy server.
-
-First, if you don't already have an admin token, create a superuser token.
+h2(#install-packages). Install Keepproxy package
-{% include 'create_superuser_token' %}
+h3. CentOS 7
-Configure your environment to run @arv@ using the output of create_superuser_token.rb:
+<notextile>
+<pre><code># <span class="userinput">yum install keepproxy</span>
+</code></pre>
+</notextile>
-<pre>
-export ARVADOS_API_HOST=zzzzz.example.com
-export ARVADOS_API_TOKEN=zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
-</pre>
+h3. Debian and Ubuntu
<notextile>
-<pre><code>~$ <span class="userinput">uuid_prefix=`arv --format=uuid user current | cut -d- -f1`</span>
-~$ <span class="userinput">echo "Site prefix is '$uuid_prefix'"</span>
-~$ <span class="userinput">read -rd $'\000' keepservice <<EOF; arv keep_service create --keep-service "$keepservice"</span>
-<span class="userinput">{
- "service_host":"<strong>keep.$uuid_prefix.your.domain</strong>",
- "service_port":443,
- "service_ssl_flag":true,
- "service_type":"proxy"
-}
-EOF</span>
-</code></pre></notextile>
-
-h2. Run Keepproxy
+<pre><code># <span class="userinput">apt-get install keepproxy</span>
+</code></pre>
+</notextile>
-h3. Start the service (option 1: systemd)
+h2(#start-service). Start the service
If your system does not use systemd, skip this section and follow the "runit instructions":#runit instead.
If your system uses systemd, the keepproxy service should already be set up. Start it and check its status:
<notextile>
-<pre><code>~$ <span class="userinput">sudo systemctl restart keepproxy</span>
-~$ <span class="userinput">sudo systemctl status keepproxy</span>
+<pre><code># <span class="userinput">systemctl restart keepproxy</span>
+# <span class="userinput">systemctl status keepproxy</span>
● keepproxy.service - Arvados Keep Proxy
Loaded: loaded (/lib/systemd/system/keepproxy.service; enabled)
Active: active (running) since Tue 2019-07-23 09:33:47 EDT; 3 weeks 1 days ago
</code></pre>
</notextile>
-h3(#runit). Start the service (option 2: runit)
+h2(#restart-api). Restart the API server and controller
-Install runit to supervise the Keep-web daemon. {% include 'install_runit' %}
+After adding keepproxy to the Services section, make sure the cluster config file is up to date on the API server host, and restart the API server and controller processes to ensure the changes are applied.
+
+<notextile>
+<pre><code># <span class="userinput">systemctl restart nginx arvados-controller</span>
+</code></pre>
+</notextile>
-h3. Testing keepproxy
+h2(#confirm-working). Confirm working installation
-Log into a host that is on an external network from your private Arvados network. The host should be able to contact your keepproxy server (eg keep.$uuid_prefix.arvadosapi.com), but not your keepstore servers (eg keep[0-9].$uuid_prefix.arvadosapi.com).
+Log into a host that is on a network external to your private Arvados network. The host should be able to contact your keepproxy server (e.g. @keep.ClusterID.example.com@), but not your keepstore servers (e.g. keep[0-9].ClusterID.example.com).
Install the "Python SDK":{{site.baseurl}}/sdk/python/sdk-python.html
SPDX-License-Identifier: CC-BY-SA-3.0
{% endcomment %}
+# "Introduction":#introduction
+# "Update config.yml":#update-config
+# "Install keepstore package":#install-packages
+# "Restart the API server and controller":#restart-api
+# "Confirm working installation":#confirm-working
+
+h2. Introduction
+
Keepstore provides access to underlying storage for reading and writing content-addressed blocks, with enforcement of Arvados permissions. Keepstore supports a variety of cloud object storage and POSIX filesystems for its backing store.
-h2. Plan your storage layout
+h3. Plan your storage layout
In the steps below, you will configure a number of backend storage volumes (like local filesystems and S3 buckets) and specify which keepstore servers have read-only and read-write access to which volumes.
<div class="offset1">
table(table table-bordered table-condensed).
-|_Hostname_|
+|_. Hostname|
|keep0.@ClusterID@.example.com|
|keep1.@ClusterID@.example.com|
</div>
Keepstore servers should not be directly accessible from the Internet (they are accessed via "keepproxy":install-keepproxy.html), so the hostnames only need to resolve on the private network.
-h2. Update cluster config
+h2(#update-config). Update cluster config
+
+h3. Configure storage volumes
+
+Fill in the @Volumes@ section of @config.yml@ for each storage volume. Available storage volume types include POSIX filesystems and cloud object storage. It is possible to have different volume types in the same cluster.
+
+* To use a POSIX filesystem, including both local filesystems (ext4, xfs) and network file systems such as GPFS or Lustre, follow the setup instructions on "Filesystem storage":configure-fs-storage.html
+* If you are using S3-compatible object storage (including Amazon S3, Google Cloud Storage, and Ceph RADOS), follow the setup instructions on "S3 Object Storage":configure-s3-object-storage.html
+* If you are using Azure Blob Storage, follow the setup instructions on "Azure Blob Storage":configure-azure-blob-storage.html
h3. List services
-Add each keepstore server to @/etc/arvados/config.yml@ .
+Add each keepstore server to the @Services.Keepstore@ section of @/etc/arvados/config.yml@ .
<notextile>
<pre><code> Services:
</code></pre>
</notextile>
-h3. Configure storage volumes
-
-Available storage volume types include POSIX filesystems and cloud object storage.
-
-* To use a POSIX filesystem, including both local filesystems (ext4, xfs) and network file system such as GPFS or Lustre, follow the setup instructions on "Filesystem storage":configure-fs-storage.html
-* If you are using S3-compatible object storage (including Amazon S3, Google Cloud Storage, and Ceph RADOS), follow the setup instructions on "S3 Object Storage":configure-s3-object-storage.html
-* If you are using Azure Blob Storage, follow the setup instructions on "Azure Blob Storage":configure-azure-blob-storage.html
-
-h2. Install keepstore package
+h2(#install-packages). Install keepstore package
On each host that will run keepstore, install the @keepstore@ package.
</code></pre>
</notextile>
-h2. Restart the API server and controller
+h2(#restart-api). Restart the API server and controller
After adding all of your keepstore servers to the Services section, make sure the cluster config file is up to date on the API server host, and restart the API server and controller processes to ensure the changes are applied.
</code></pre>
</notextile>
-h2(#testing). Testing keep
+h2(#confirm-working). Confirm working installation
Install the "Python SDK":{{site.baseurl}}/sdk/python/sdk-python.html
|\3=. *Keep (storage)*|
|"Keepstore":install-keepstore.html |Stores content-addressed blocks in a variety of backends (local filesystem, cloud object storage).|Required.|
|"Keepproxy":install-keepproxy.html |Gateway service to access keep servers from external networks.|Required to be able to use arv-put, arv-get, or arv-mount outside the private Arvados network.|
-|"Keep-web":install-keep-web.html |Gateway service providing read/write HTTP and WebDAV support on top of Keep.|Required to be able to download files from Keep over plain HTTP in Workbench.|
+|"Keep-web":install-keep-web.html |Gateway service providing read/write HTTP and WebDAV support on top of Keep.|Required to access files from Workbench.|
|"Keep-balance":install-keep-balance.html |Storage cluster maintenance daemon responsible for moving blocks to their optimal server location, adjusting block replication levels, and trashing unreferenced blocks.|Required to free deleted data from underlying storage, and to ensure proper replication and block distribution (including support for storage classes).|
|\3=. *User interface*|
|"Single Sign On server":install-sso.html |Web based login to Workbench.|Depends on identity provider. Not required for Google. Required for LDAP or standalone database.|
</code></pre>
</notextile>
-You may also use a different method to pick the cluster identifier. The cluster identifier will be part of the hostname of the services in your Arvados cluster. The rest of this documentation will refer to it as your @ClusterID@.
+You may also use a different method to pick the cluster identifier. The cluster identifier will be part of the hostname of the services in your Arvados cluster. The rest of this documentation will refer to it as your @ClusterID@. Whenever @ClusterID@ appears in a configuration example, replace it with your five-character cluster identifier.
h2(#dnstls). DNS entries and TLS certificates
SAMPLE: true
Driver: s3
DriverParameters:
-
# for s3 driver -- see
# https://doc.arvados.org/install/configure-s3-object-storage.html
IAMRole: aaaaa
ConnectTimeout: 1m
ReadTimeout: 10m
RaceWindow: 24h
+
+ # For S3 driver, potentially unsafe tuning parameter,
+ # intentionally excluded from main documentation.
+ #
+ # Enable deletion (garbage collection) even when the
+ # configured BlobTrashLifetime is zero. WARNING: eventual
+ # consistency may result in race conditions that can cause
+ # data loss. Do not enable this unless you understand and
+ # accept the risk.
UnsafeDelete: false
# for azure driver -- see
# for local directory driver -- see
# https://doc.arvados.org/install/configure-fs-storage.html
Root: /var/lib/arvados/keep-data
+
+ # For local directory driver, potentially confusing tuning
+ # parameter, intentionally excluded from main documentation.
+ #
+ # When true, read and write operations (for whole 64MiB
+ # blocks) on an individual volume will be queued and issued
+ # serially. When false, read and write operations will be
+ # issued concurrently.
+ #
+ # May improve throughput if you have physical spinning disks
+ # and experience contention when there are multiple requests
+ # to the same volume.
+ #
+ # Otherwise, when using SSDs, RAID, or a shared network filesystem, you
+ # should leave this alone.
Serialize: false
Mail: