mini_portile2 (~> 2.3.0)
npm-rails (0.2.1)
rails (>= 3.2)
- oj (3.5.0)
+ oj (3.6.4)
os (0.9.6)
passenger (5.2.1)
rack
simplecov-html (0.10.2)
simplecov-rcov (0.2.3)
simplecov (>= 0.4.1)
- sprockets (3.7.1)
+ sprockets (3.7.2)
concurrent-ruby (~> 1.0)
rack (> 1, < 3)
sprockets-rails (3.2.1)
wiselinks
BUNDLED WITH
- 1.16.1
+ 1.16.2
attrs['cwd'] = "/var/spool/cwl"
attrs['output_path'] = "/var/spool/cwl"
+ # runtime constraints
+ runtime_constraints = {
+ "vcpus" => 1,
+ "ram" => 1024 * 1024 * 1024,
+ "API" => true
+ }
+
input_defaults = {}
if wf_json
- inputs = get_cwl_inputs(wf_json)
- inputs.each do |input|
+ main = get_cwl_main(wf_json)
+ main[:inputs].each do |input|
if input[:default]
input_defaults[cwl_shortname(input[:id])] = input[:default]
end
end
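+ # Apply the arv:WorkflowRunnerResources hint from the main workflow,
+ # if present, overriding the default runtime constraints above.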
+ if main[:hints]
+ main[:hints].each do |hint|
+ if hint[:class] == "http://arvados.org/cwl#WorkflowRunnerResources"
+ if hint[:coresMin]
+ runtime_constraints["vcpus"] = hint[:coresMin]
+ end
+ if hint[:ramMin]
+ runtime_constraints["ram"] = hint[:ramMin] * 1024 * 1024
+ end
+ end
+ end
+ end
end
# mounts
end
attrs['mounts'] = mounts
- # runtime constriants
- runtime_constraints = {
- "vcpus" => 1,
- "ram" => 256000000,
- "API" => true
- }
attrs['runtime_constraints'] = runtime_constraints
else
raise ArgumentError, "Unsupported template uuid: #{template_uuid}"
end
- def get_cwl_inputs(workflow)
- if workflow[:inputs]
- return workflow[:inputs]
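+ # Return the "main" workflow object: the document itself for a single
+ # workflow, or the "#main" entry in a packed workflow's $graph.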
+ def get_cwl_main(workflow)
+ if workflow[:"$graph"].nil?
+ return workflow
else
workflow[:"$graph"].each do |tool|
if tool[:id] == "#main"
- return tool[:inputs]
+ return tool
end
end
end
end
+ def get_cwl_inputs(workflow)
+ get_cwl_main(workflow)[:inputs]
+ end
+
def cwl_shortname(id)
if id[0] == "#"
id = id[1..-1]
arvados_api_client.discovery[:source_version]
end
+ # Get the packageVersion given in the API server's discovery
+ # document.
+ def api_package_version
+ arvados_api_client.discovery[:packageVersion]
+ end
+
# URL for browsing source code for the given version.
def version_link_target version
"https://arvados.org/projects/arvados/repository/changes?rev=#{version.sub(/-.*/, "")}"
additional_info_str = additional_info.map {|k,v| "#{k}=#{v}"}.join("\n")
additional_info['api_source_version'] = api_source_version
+ additional_info['api_package_version'] = api_package_version
additional_info['generated_at'] = generated_at
additional_info['workbench_version'] = AppVersion.hash
+ additional_info['workbench_package_version'] = AppVersion.package_version
additional_info['arvados_base'] = arvados_base
additional_info['support_email'] = support_email
additional_info['error_message'] = params[:error_message] if params[:error_message]
<label for="wb_version" class="col-sm-4 control-label"> Workbench version </label>
<div class="col-sm-8">
<p class="form-control-static" name="wb_version">
- <%= link_to AppVersion.hash, version_link_target(AppVersion.hash) %>
+ <%= AppVersion.package_version %> (<%= link_to AppVersion.hash, version_link_target(AppVersion.hash) %>)
</p>
</div>
</div>
<label for="server_version" class="col-sm-4 control-label"> API version </label>
<div class="col-sm-8">
<p class="form-control-static" name="server_version">
- <%= link_to api_source_version, version_link_target(api_source_version) %>
+ <%= api_package_version %> (<%= link_to api_source_version, version_link_target(api_source_version) %>)
</p>
</div>
</div>
i18n.fallbacks: true
active_support.deprecation: :notify
profiling_enabled: false
+ log_level: info
arvados_insecure_https: false
# "git log".
source_version: false
+ # Override the automatic package string. With the default value of
+ # false, the package string is read from package-build.version in
+ # Rails.root (included in vendor packages).
+ package_version: false
+
# report notification to and from addresses
issue_reporter_email_from: arvados@example.com
issue_reporter_email_to: arvados@example.com
def self.forget
@hash = nil
+ @package_version = nil
end
# Return abbrev commit hash for current code version: "abc1234", or
@hash || "unknown"
end
+
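+ # Return the package version string: the configured override if set,
+ # otherwise the memoized contents of package-build.version in
+ # Rails.root, or "unknown" if that file is absent.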
+ def self.package_version
+ if (cached = Rails.configuration.package_version || @package_version)
+ return cached
+ end
+
+ begin
+ @package_version = IO.read(Rails.root.join("package-build.version")).strip
+ rescue Errno::ENOENT
+ @package_version = "unknown"
+ end
+
+ @package_version
+ end
end
-#!/bin/bash
+#!/bin/bash -xe
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: AGPL-3.0
cd "$srcdir"
local license_path="$1"; shift
local version="$(version_from_git)"
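+  # Record the version inside the package so that
+  # AppVersion.package_version can report it at runtime.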
+ echo "$version" >package-build.version
local scripts_dir="$(mktemp --tmpdir -d "$pkgname-XXXXXXXX.scripts")" && \
(
set -e
# Jenkins config requires that glob tmp/*.log match something. Ensure
# that happens even if we don't end up running services that set up
# logging.
-touch "${WORKSPACE}/tmp/controller.log"
+mkdir -p "${WORKSPACE}/tmp/" || fatal "could not mkdir ${WORKSPACE}/tmp"
+touch "${WORKSPACE}/tmp/controller.log" || fatal "could not touch ${WORKSPACE}/tmp/controller.log"
retry() {
remain="${repeat}"
- install/cheat_sheet.html.textile.liquid
- user/topics/arvados-sync-groups.html.textile.liquid
- admin/storage-classes.html.textile.liquid
+ - admin/activation.html.textile.liquid
- admin/migrating-providers.html.textile.liquid
- admin/merge-remote-account.html.textile.liquid
+ - admin/spot-instances.html.textile.liquid
- install/migrate-docker19.html.textile.liquid
installguide:
- Overview:
- install/index.html.textile.liquid
- Docker quick start:
- install/arvbox.html.textile.liquid
+ - Arvados on Kubernetes:
+ - install/arvados-on-kubernetes.html.textile.liquid
- Manual installation:
- install/install-manual-prerequisites.html.textile.liquid
+ - install/install-components.html.textile.liquid
+ - Core:
- install/install-postgresql.html.textile.liquid
- - install/install-sso.html.textile.liquid
- install/install-api-server.html.textile.liquid
- - install/install-ws.html.textile.liquid
- - install/install-arv-git-httpd.html.textile.liquid
- - install/install-workbench-app.html.textile.liquid
- - install/install-shell-server.html.textile.liquid
- - install/create-standard-objects.html.textile.liquid
+ - Keep:
- install/install-keepstore.html.textile.liquid
- install/configure-fs-storage.html.textile.liquid
- install/configure-s3-object-storage.html.textile.liquid
- install/install-keepproxy.html.textile.liquid
- install/install-keep-web.html.textile.liquid
- install/install-keep-balance.html.textile.liquid
+ - User interface:
+ - install/install-sso.html.textile.liquid
+ - install/install-workbench-app.html.textile.liquid
+ - install/install-composer.html.textile.liquid
+ - Additional services:
+ - install/install-ws.html.textile.liquid
+ - install/install-shell-server.html.textile.liquid
+ - install/install-arv-git-httpd.html.textile.liquid
- Containers API support on SLURM:
- install/crunch2-slurm/install-prerequisites.html.textile.liquid
- install/crunch2-slurm/install-slurm.html.textile.liquid
table(table table-bordered table-condensed).
|_. Key|_. Type|_. Description|_. Notes|
|partitions|array of strings|The names of one or more compute partitions that may run this container. If not provided, the system will choose where to run the container.|Optional.|
+|preemptible|boolean|If true, the dispatcher will ask for a preemptible cloud node instance (e.g. an AWS spot instance) to run this container.|Optional. Default is false.|
+|max_run_time|integer|Maximum running time (in seconds) that this container will be allowed to run before being cancelled.|Optional. Default is 0 (no limit).|
</notextile>
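+
+For example, a container request asking for a preemptible instance with a one-hour time limit might include the following fragment (illustrative values):
+
+<pre>
+"scheduling_parameters": {
+  "preemptible": true,
+  "max_run_time": 3600
+}
+</pre>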
Finally, reboot the system to make these changes effective.
+
+h2. Create a project for Docker images
+
+Here we create a default project for the standard Arvados Docker images, and give all users read access to it. The project is owned by the system user.
+
+<notextile>
+<pre><code>~$ <span class="userinput">project_uuid=`arv --format=uuid group create --group "{\"owner_uuid\":\"$prefix-tpzed-000000000000000\", \"name\":\"Arvados Standard Docker Images\"}"`</span>
+~$ <span class="userinput">echo "Arvados project uuid is '$project_uuid'"</span>
+~$ <span class="userinput">read -rd $'\000' newlink <<EOF; arv link create --link "$newlink"</span>
+<span class="userinput">{
+ "tail_uuid":"$all_users_group_uuid",
+ "head_uuid":"$project_uuid",
+ "link_class":"permission",
+ "name":"can_read"
+}
+EOF</span>
+</code></pre></notextile>
+
+h2. Download and tag the latest arvados/jobs docker image
+
+In order to start workflows from workbench, there needs to be a Docker image tagged @arvados/jobs:latest@. The following command downloads the latest arvados/jobs image from Docker Hub, loads it into Keep, and tags it as 'latest'. In this example, @$project_uuid@ should be the UUID of the "Arvados Standard Docker Images" project.
+
+<notextile>
+<pre><code>~$ <span class="userinput">arv-keepdocker --project-uuid $project_uuid --pull arvados/jobs latest</span>
+</code></pre></notextile>
+
+If the image needs to be downloaded from Docker Hub, the command can take a few minutes to complete, depending on available network bandwidth.
--- /dev/null
+---
+layout: default
+navsection: admin
+title: User activation
+...
+
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+This page describes how new users are created and activated.
+
+"Browser login and management of API tokens is described here.":{{site.baseurl}}/api/tokens.html
+
+h3. Authentication
+
+After completing the authentication process, a callback is made from the SSO server to the API server, providing a user record and @identity_url@ (despite the name, this is actually an Arvados user uuid).
+
+The API server searches for a user record with the @identity_url@ supplied by the SSO. If found, that user account will be used, unless the account has @redirect_to_user_uuid@ set, in which case it will use the user in @redirect_to_user_uuid@ instead (this is used for the "link account":{{site.baseurl}}/user/topics/link-accounts.html feature).
+
+Next, it searches by email address for a "pre-activated account.":#pre-activated
+
+If no existing user record is found, a new user object will be created.
+
+A federated user follows a slightly different flow: a special token is presented, and the API server verifies the user's identity with the home cluster. This also results in a user object (representing the remote user) being created.
+
+h3. User setup
+
+If @auto_setup_new_users@ is true, as part of creating the new user object, the user is immediately set up with:
+
+* A @can_login@ permission link from the user's email address to the user uuid, which records the @identity_url_prefix@
+* Membership in the "All users" group (can read all users, all users can see new user)
+* A new git repo and @can_manage@ permission if @auto_setup_new_users_with_repository@ is true
+* @can_login@ permission to a shell node if @auto_setup_new_users_with_vm_uuid@ is set to the uuid of a vm
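+
+A minimal @application.yml@ fragment enabling this behavior might look like the following (the VM uuid is hypothetical):
+
+<pre>
+auto_setup_new_users: true
+auto_setup_new_users_with_repository: true
+auto_setup_new_users_with_vm_uuid: zzzzz-2x53u-zzzzzzzzzzzzzzz
+</pre>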
+
+Otherwise, an admin must explicitly invoke "setup" on the user via workbench or the API.
+
+h3. User activation
+
+A newly created user is inactive (@is_active@ is false) by default, unless @new_users_are_active@ is true.
+
+An inactive user cannot create or update any object, but can read Arvados objects that the user account has permission to read. This implies that if @auto_setup_new_users@ is true, an "inactive" user who has been set up may still be able to do some things, such as read objects shared with "All users", clone and push to the git repository, or log in to a VM.
+
+{% comment %}
+Maybe these services should check is_active.
+
+I believe that when this was originally designed, being able to access git and VM required an ssh key, and an inactive user could not register an ssh key because that required creating a record. However, it is now possible to authenticate to shell VMs and http+git with just an API token.
+{% endcomment %}
+
+At this point, there are two ways a user can be activated.
+
+# An admin can set the @is_active@ field directly. This runs @setup_on_activate@, which sets up oid_login_perm and group membership, but does not set up a repository or VM (even if @auto_setup_new_users_with_repository@ and/or @auto_setup_new_users_with_vm_uuid@ are set).
+# Self-activation using the @activate@ method of the users controller.
+
+h3. User agreements
+
+The @activate@ method of the users controller checks if the user @is_invited@ and whether the user has "signed" all the user agreements.
+
+@is_invited@ is true if any of these are true:
+* @is_active@ is true
+* @new_users_are_active@ is true
+* the user account has a permission link to read the system "all users" group.
+
+User agreements are accessed by getting a listing on the @user_agreements@ endpoint. This returns a list of collection uuids. This is executed as a system user, so it bypasses normal read permission checks.
+
+The available user agreements are represented in the Links table as:
+
+<pre>
+{
+ "link_class": "signature",
+ "name": "require",
+ "tail_uuid": "*system user uuid*",
+ "head_uuid: "*collection uuid*"
+}
+</pre>
+
+The collection contains the user agreement text file.
+
+Workbench checks @is_invited@. If true, it displays the clickthrough agreements, which the user can "sign". If @is_invited@ is false, the user ends up at the "inactive user" page.
+
+The @user_agreements/sign@ endpoint creates a Link object:
+
+<pre>
+{
+ "link_class": "signature"
+ "name": "click",
+ "tail_uuid": "*user uuid*",
+ "head_uuid: "*collection uuid*"
+}
+</pre>
+
+This is executed as a system user, so it bypasses the restriction that inactive users cannot create objects.
+
+The @user_agreements/signatures@ endpoint returns the list of Link objects that represent signatures by the current user (created by @sign@).
+
+h3. User profile
+
+The user profile is checked by workbench after checking if user agreements need to be signed. The requirement to fill out the user profile is not enforced by the API server.
+
+h3(#pre-activated). Pre-activate user by email address
+
+You may create a user account for a user that has not yet logged in, and identify the user by email address.
+
+1. As an admin, create a user object:
+
+<pre>
+{
+ "email": "foo@example.com",
+ "username": "barney",
+ "is_active": true
+}
+</pre>
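+
+Using the @arv@ CLI, this step might look like the following (assuming an admin token in the environment):
+
+<pre>
+$ arv user create --user '{"email": "foo@example.com", "username": "barney", "is_active": true}'
+</pre>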
+
+2. Create a link object, where @tail_uuid@ is the user's email address, @head_uuid@ is the user object created in the previous step, and @xxxxx@ is the value of @uuid_prefix@ of the SSO server.
+
+<pre>
+{
+ "link_class": "permission",
+ "name": "can_login",
+ "tail_uuid": "email address",
+ "head_uuid: "user uuid",
+ "properties": {
+ "identity_url_prefix": "xxxxx-tpzed-"
+ }
+}
+</pre>
+
+3. When the user logs in for the first time, the email address will be recognized and the user will be associated with the linked user object.
+
+h3. Pre-activate federated user
+
+1. As admin, create a user object with the @uuid@ of the federated user (this is the user's uuid on their home cluster):
+
+<pre>
+{
+ "uuid": "home1-tpzed-000000000000000",
+ "email": "foo@example.com",
+ "username": "barney",
+ "is_active": true
+}
+</pre>
+
+2. When the user logs in, they will be associated with the existing user object.
+
+h3. Auto-activate federated users from trusted clusters
+
+In the API server config, configure @auto_activate_users_from@ with a list of one or more five-character cluster ids. A federated user from one of the listed clusters who is active (@is_active@) on the home cluster will be automatically set up and activated on this cluster.
+
+h3(#deactivating_users). Deactivating users
+
+Setting @is_active@ to false is not sufficient to lock out a user, because the user can call @activate@ to become active again. Instead, use @unsetup@, which does the following:
+
+* Delete oid_login_perms
+* Delete git repository permission links
+* Delete VM login permission links
+* Remove from "All users" group
+* Delete any "signatures"
+* Clear preferences / profile
+* Mark as inactive
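+
+For example, the discovery-generated @arv@ CLI should expose this as something like (hypothetical uuid):
+
+<pre>
+$ arv user unsetup --uuid zzzzz-tpzed-xxxxxxxxxxxxxxx
+</pre>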
+
+{% comment %}
+Does not revoke @is_admin@, so you can't unsetup an admin unless you turn admin off first.
+
+"inactive" does not prevent user from reading things they previously had access to.
+
+Does not revoke API tokens.
+{% endcomment %}
+
+h3. Activation flows
+
+h4. Private instance
+
+Policy: users must be manually approved.
+
+<pre>
+auto_setup_new_users: false
+new_users_are_active: false
+</pre>
+
+# User is created. Not set up. @is_active@ is false.
+# Workbench checks @is_invited@ and finds it is false. User gets "inactive user" page.
+# Admin goes to the user page and either clicks "setup user" or manually sets @is_active@ to true.
+# Clicking "setup user" sets up the user. This includes adding the user to "All users" which qualifies the user as @is_invited@.
+# On refreshing workbench, the user is still inactive, but is able to self-activate after signing clickthrough agreements (if any).
+# Alternatively, directly setting @is_active@ to true also sets up the user, but workbench won't display clickthrough agreements (because the user is already active).
+
+h4. Federated instance
+
+Policy: users from other clusters in the federation are activated automatically; users from outside the federation must be manually approved.
+
+<pre>
+auto_setup_new_users: false
+new_users_are_active: false
+auto_activate_users_from: [home1]
+</pre>
+
+# Federated user arrives claiming to be from cluster 'home1'
+# API server authenticates user as being from cluster 'home1'
+# Because 'home1' is in @auto_activate_users_from@, the user is set up and activated.
+# User can immediately start using workbench.
+
+h4. Open instance
+
+Policy: anybody who shows up and signs the agreements is activated.
+
+<pre>
+auto_setup_new_users: true
+new_users_are_active: false
+</pre>
+
+# User is created and auto-setup. At this point, @is_active@ is false, but user has been added to "All users" group.
+# Workbench checks @is_invited@ and finds it is true, because the user is a member of "All users" group.
+# Workbench presents user with list of user agreements, user reads and clicks "sign" for each one.
+# Workbench tries to activate user.
+# User is activated.
+
+h4. Developer instance
+
+Policy: avoid wasting developers' time during development/testing.
+
+<pre>
+auto_setup_new_users: true
+new_users_are_active: true
+</pre>
+
+# User is created, immediately auto-setup, and auto-activated.
+# User can immediately start using workbench.
--- /dev/null
+---
+layout: default
+navsection: admin
+title: Using AWS Spot instances
+...
+
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+This page describes how to set up the system to take advantage of "Amazon's EC2 spot instances":https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-instances.html.
+
+h3. Nodemanager
+
+Nodemanager's configured cloud sizes should include the @preemptible@ boolean parameter. For example, for every on-demand cloud node size, you could create a @.spot@ variant, like this:
+
+<pre>
+[Size m4.large]
+cores = 2
+scratch = 32000
+
+[Size m4.large.spot]
+cores = 2
+instance_type = m4.large
+preemptible = true
+scratch = 32000
+</pre>
+
+h3. Slurm dispatcher
+
+The @crunch-dispatch-slurm@ service needs a matching instance type configuration in @/etc/arvados/config.yml@, following the previous example:
+
+<pre>
+Clusters:
+ uuid_prefix:
+ InstanceTypes:
+ - Name: m4.large
+ VCPUs: 2
+ RAM: 7782000000
+ Scratch: 32000000000
+ Price: 0.1
+ - Name: m4.large.spot
+ Preemptible: true
+ VCPUs: 2
+ RAM: 7782000000
+ Scratch: 32000000000
+ Price: 0.1
+</pre>
+
+@InstanceType@ names should match those defined in nodemanager's config file, because it's @crunch-dispatch-slurm@'s job to select the instance type and communicate the decision to @nodemanager@ via Slurm.
+
+h3. API Server
+
+Container requests need to include the @preemptible@ scheduling parameter to make the dispatcher request a spot instance. The API server configuration includes an option that, when active, automatically assigns the @preemptible@ parameter to any new child container request that doesn't already have it. To activate this feature, add the following to the @application.yml@ file:
+
+<pre>
+preemptible_instances: true
+</pre>
+
+With this configuration active, a child container request must explicitly include @preemptible = false@ at creation time to avoid being scheduled for spot instance usage.
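+
+For example, a child container request pinned to a regular on-demand node would be created with a fragment like this (illustrative):
+
+<pre>
+"scheduling_parameters": {
+  "preemptible": false
+}
+</pre>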
+
+h3. AWS Permissions
+
+When requesting spot instances, Amazon's API may return an authorization error depending on how users and permissions are set on the account. If this is the case, check nodemanager's log for:
+
+<pre>
+BaseHTTPError: AuthFailure.ServiceLinkedRoleCreationNotPermitted: The provided credentials do not have permission to create the service-linked role for EC2 Spot Instances.
+</pre>
+
+The account needs to have the @AWSServiceRoleForEC2Spot@ service-linked role created. To create it, log into the AWS account, go to _IAM Management_ → _Roles_, click the @Create@ button, select the @EC2@ service, and choose the @EC2 - Spot Instances@ use case.
+
+h3. Cost Tracking
+
+Amazon's spot instance prices are declared at instance request time and bounded by the maximum price per hour that the user is willing to pay. By default, this maximum is the same as the on-demand price for each instance type, and this is the setting nodemanager uses for now, as it doesn't attach any pricing data to the spot instance request.
+
+The actual price of a spot instance at any point in time is discovered at the end of each usage hour, depending on instance demand. For this reason, AWS provides a data feed subscription to get hourly logs, as described in "Amazon's User Guide":https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-data-feeds.html.
\ No newline at end of file
|_. Argument |_. Type |_. Description |_. Location |_. Example |
{background:#ccffcc}.|uuid|string|The UUID of the group in question.|path||
|limit|integer (default 100)|Maximum number of items to return.|query||
-|order|string|Order in which to return matching items. Sort within a resource type by prefixing the attribute with the resource name and a dot.|query|@"collections.modified_at desc"@|
+|order|array|Attributes to use as sort keys to determine the order in which resources are returned, each optionally followed by @asc@ or @desc@ to indicate ascending or descending order. Sort within a resource type by prefixing the attribute with the resource name and a period.|query|@["collections.modified_at desc"]@|
|filters|array|Conditions for filtering items.|query|@[["uuid", "is_a", "arvados#job"]]@|
|recursive|boolean (default false)|Include items owned by subprojects.|query|@true@|
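+
+For example, listing the contents of a project sorted by most recently modified collection first might look like this with the discovery-generated @arv@ CLI (hypothetical project uuid):
+
+<pre>
+$ arv group contents --uuid zzzzz-j7d0g-xxxxxxxxxxxxxxx --order '["collections.modified_at desc"]'
+</pre>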
The "browser authentication process is documented in detail on the Arvados wiki.":https://dev.arvados.org/projects/arvados/wiki/Workbench_authentication_process
+h2. User activation
+
+"Creation and activation of new users is described here.":{{site.baseurl}}/admin/activation.html
+
h2. Creating tokens via the API
The browser login method above issues a new token. Using that token, it is possible to make API calls to create additional tokens. To do so, use the @create@ method of the "API client authorizations":{{site.baseurl}}/api/methods/api_client_authorizations.html resource.
--- /dev/null
+---
+layout: default
+navsection: installguide
+title: Arvados on Kubernetes - Google Kubernetes Engine
+...
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+This page documents the setup of the prerequisites to run the "Arvados on Kubernetes":/install/arvados-on-kubernetes.html @Helm@ chart on @Google Kubernetes Engine@ (GKE).
+
+h3. Install tooling
+
+Install @gcloud@:
+
+* Follow the instructions at "https://cloud.google.com/sdk/downloads":https://cloud.google.com/sdk/downloads
+
+Install @kubectl@:
+
+<pre>
+$ gcloud components install kubectl
+</pre>
+
+Install @helm@:
+
+* Follow the instructions at "https://docs.helm.sh/using_helm/#installing-helm":https://docs.helm.sh/using_helm/#installing-helm
+
+h3. Boot the GKE cluster
+
+This can be done via the "cloud console":https://console.cloud.google.com/kubernetes/ or via the command line:
+
+<pre>
+$ gcloud container clusters create <CLUSTERNAME> --zone us-central1-a --machine-type n1-standard-2 --cluster-version 1.10
+</pre>
+
+It takes a few minutes for the cluster to be initialized.
+
+h3. Reserve a static IP
+
+Reserve a "static IP":https://console.cloud.google.com/networking/addresses in GCE. Make sure the IP is in the same region as your GKE cluster, and is of the "Regional" type.
+
+h3. Connect to the GKE cluster
+
+Via the web:
+* Click the "Connect" button next to your "GKE cluster"https://console.cloud.google.com/kubernetes/.
+* Execute the "Command-line access" command on your development machine.
+
+Alternatively, use this command:
+
+<pre>
+$ gcloud container clusters get-credentials <CLUSTERNAME> --zone us-central1-a --project <YOUR-PROJECT>
+</pre>
+
+Test the connection:
+
+<pre>
+$ kubectl get nodes
+</pre>
+
+Now proceed to the "Initialize helm on the Kubernetes cluster":/install/arvados-on-kubernetes.html#helm section.
--- /dev/null
+---
+layout: default
+navsection: installguide
+title: Arvados on Kubernetes - Minikube
+...
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+This page documents the setup of the prerequisites to run the "Arvados on Kubernetes":/install/arvados-on-kubernetes.html @Helm@ chart on @Minikube@.
+
+h3. Install tooling
+
+Install @kubectl@:
+
+* Follow the instructions at "https://kubernetes.io/docs/tasks/tools/install-kubectl/":https://kubernetes.io/docs/tasks/tools/install-kubectl/
+
+Install @helm@:
+
+* Follow the instructions at "https://docs.helm.sh/using_helm/#installing-helm":https://docs.helm.sh/using_helm/#installing-helm
+
+h3. Install Minikube
+
+Follow the instructions at "https://kubernetes.io/docs/setup/minikube/":https://kubernetes.io/docs/setup/minikube/
+
+Test the connection:
+
+<pre>
+$ kubectl get nodes
+</pre>
+
+Now proceed to the "Initialize helm on the Kubernetes cluster":/install/arvados-on-kubernetes.html#helm section.
--- /dev/null
+---
+layout: default
+navsection: installguide
+title: Arvados on Kubernetes
+...
+{% comment %}
+Copyright (C) The Arvados Authors. All rights reserved.
+
+SPDX-License-Identifier: CC-BY-SA-3.0
+{% endcomment %}
+
+Arvados on Kubernetes is implemented as a Helm Chart.
+
+{% include 'notebox_begin_warning' %}
+This Helm Chart does not retain any state after it is deleted. An Arvados cluster created with this Helm Chart is entirely ephemeral, and all data stored on the cluster will be deleted when it is shut down. This will be fixed in a future version.
+{% include 'notebox_end' %}
+
+h2(#overview). Overview
+
+This Helm Chart provides a basic, small Arvados cluster.
+
+Current limitations, to be addressed in the future:
+
+* An Arvados cluster created with this Helm Chart is entirely ephemeral, and all data stored on the cluster will be deleted when it is shut down.
+* No dynamic scaling of compute nodes (but you can adjust @values.yaml@ and "reload the Helm Chart":#reload)
+* All compute nodes are the same size
+* Compute nodes have no cpu/memory/disk constraints yet
+* No git server
+
+h2. Requirements
+
+* Kubernetes 1.10+ cluster with at least 3 nodes, 2 or more cores per node
+* @kubectl@ and @helm@ installed locally, and able to connect to your Kubernetes cluster
+
+If you do not have a Kubernetes cluster already set up, you can use "Google Kubernetes Engine":/install/arvados-on-kubernetes-GKE.html for multi-node development and testing or "another Kubernetes solution":https://kubernetes.io/docs/setup/pick-right-solution/. Minikube is not supported yet.
+
+h2(#helm). Initialize helm on the Kubernetes cluster
+
+If you already have helm running on the Kubernetes cluster, proceed directly to "Start the Arvados cluster":#Start below.
+
+<pre>
+$ helm init
+$ kubectl create serviceaccount --namespace kube-system tiller
+$ kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
+$ kubectl patch deploy --namespace kube-system tiller-deploy -p '{"spec":{"template":{"spec":{"serviceAccount":"tiller"}}}}'
+</pre>
+
+Test @helm@ by running
+
+<pre>
+$ helm ls
+</pre>
+
+There should be no errors. The command will return nothing.
+
+h2(#git). Clone the repository
+
+Clone the repository and navigate to the @arvados-kubernetes/charts/arvados@ directory:
+
+<pre>
+$ git clone https://github.com/curoverse/arvados-kubernetes.git
+$ cd arvados-kubernetes/charts/arvados
+</pre>
+
+h2(#Start). Start the Arvados cluster
+
+Next, determine the IP address that the Arvados cluster will use to expose its API, Workbench, etc. If you want this Arvados cluster to be reachable from places other than the local machine, the IP address will need to be routable as appropriate.
+
+<pre>
+$ ./cert-gen.sh <IP ADDRESS>
+</pre>
+
+The @values.yaml@ file contains a number of variables that can be modified. At a minimum, review and/or modify the values for
+
+<pre>
+ adminUserEmail
+ adminUserPassword
+ superUserSecret
+ anonymousUserSecret
+</pre>
+
+Now start the Arvados cluster:
+
+<pre>
+$ helm install --name arvados . --set externalIP=<IP ADDRESS>
+</pre>
+
+At this point, you can use kubectl to see the Arvados cluster boot:
+
+<pre>
+$ kubectl get pods
+$ kubectl get svc
+</pre>
+
+After a few minutes, you can access Arvados Workbench at the IP address specified
+
+* https://<IP ADDRESS>
+
+with the username and password specified in the @values.yaml@ file.
+
+Alternatively, use the Arvados CLI tools or SDKs:
+
+Set the environment variables:
+
+<pre>
+$ export ARVADOS_API_TOKEN=<superUserSecret from values.yaml>
+$ export ARVADOS_API_HOST=<STATIC IP>:444
+$ export ARVADOS_API_HOST_INSECURE=true
+</pre>
+
+Test access with:
+
+<pre>
+$ arv user current
+</pre>
+
+h2(#reload). Reload
+
+If you make changes to the Helm Chart (e.g. to @values.yaml@), you can reload Arvados with
+
+<pre>
+$ helm upgrade arvados .
+</pre>
+
+h2. Shut down
+
+{% include 'notebox_begin_warning' %}
+This Helm Chart does not retain any state after it is deleted. An Arvados cluster created with this Helm Chart is entirely ephemeral, and <strong>all data stored on the Arvados cluster will be deleted</strong> when it is shut down. This will be fixed in a future version.
+{% include 'notebox_end' %}
+
+<pre>
+$ helm del arvados --purge
+</pre>
+++ /dev/null
----
-layout: default
-navsection: installguide
-title: Create standard objects
-
-...
-{% comment %}
-Copyright (C) The Arvados Authors. All rights reserved.
-
-SPDX-License-Identifier: CC-BY-SA-3.0
-{% endcomment %}
-
-In these steps we use the Arvados CLI tools on the <strong>shell server</strong> to create a few Arvados objects. The CLI tools require an ARVADOS_API_TOKEN environment variable with a valid admin token. If you haven't already done so, set that up as shown in the "API token guide":../user/reference/api-tokens.html.
-
-h3. Arvados repository
-
-Here we create a repository object which will be used to set up a hosted clone of the arvados repository on this cluster.
-
-<notextile>
-<pre><code>~$ <span class="userinput">prefix=`arv --format=uuid user current | cut -d- -f1`</span>
-~$ <span class="userinput">echo "Site prefix is '$prefix'"</span>
-~$ <span class="userinput">all_users_group_uuid="$prefix-j7d0g-fffffffffffffff"</span>
-~$ <span class="userinput">repo_uuid=`arv --format=uuid repository create --repository "{\"owner_uuid\":\"$prefix-tpzed-000000000000000\", \"name\":\"arvados\"}"`</span>
-~$ <span class="userinput">echo "Arvados repository uuid is '$repo_uuid'"</span>
-</code></pre></notextile>
-
-Create a link object to make the repository object readable by the "All users" group, and therefore by every active user. This makes it possible for users to run the bundled Crunch scripts by specifying @"script_version":"master","repository":"arvados"@ rather than pulling the Arvados source tree into their own repositories.
-
-<notextile>
-<pre><code>~$ <span class="userinput">read -rd $'\000' newlink <<EOF; arv link create --link "$newlink"</span>
-<span class="userinput">{
- "tail_uuid":"$all_users_group_uuid",
- "head_uuid":"$repo_uuid",
- "link_class":"permission",
- "name":"can_read"
-}
-EOF</span>
-</code></pre></notextile>
-
-In a couple of minutes, your arvados-git-sync cron job will create an empty repository on your git server. Seed it with the real arvados repository. If your git credential helpers were configured correctly when you "set up your shell server":install-shell-server.html, the "git push" command will use your API token instead of prompting you for a username and password.
-
-<notextile>
-<pre><code>~$ <span class="userinput">cd /tmp</span>
-/tmp$ <span class="userinput">git clone --bare https://github.com/curoverse/arvados.git</span>
-/tmp <span class="userinput">git --git-dir arvados.git push https://git.<b>uuid_prefix.your.domain</b>/arvados.git '*:*'</span>
-</code></pre>
-</notextile>
-
-If you did not set up a HTTPS service, you can push to <code>git@git.uuid_prefix.your.domain:arvados.git</code> using your SSH key, or by logging in to your git server and using sudo.
-
-<notextile>
-<pre><code>gitserver:~$ <span class="userinput">sudo -u git -i bash</span>
-git@gitserver:~$ <span class="userinput">git clone --bare https://github.com/curoverse/arvados.git /tmp/arvados.git</span>
-git@gitserver:~$ <span class="userinput">cd /tmp/arvados.git</span>
-git@gitserver:/tmp/arvados.git$ <span class="userinput">gitolite push /var/lib/arvados/git/repositories/<b>your_arvados_repo_uuid</b>.git '*:*'</span>
-</code></pre>
-</notextile>
-
-h3. Default project for docker images
-
-Here we create a default project for the standard Arvados Docker images, and give all users read access to it. The project is owned by the system user.
-
-<notextile>
-<pre><code>~$ <span class="userinput">project_uuid=`arv --format=uuid group create --group "{\"owner_uuid\":\"$prefix-tpzed-000000000000000\", \"name\":\"Arvados Standard Docker Images\"}"`</span>
-~$ <span class="userinput">echo "Arvados project uuid is '$project_uuid'"</span>
-~$ <span class="userinput">read -rd $'\000' newlink <<EOF; arv link create --link "$newlink"</span>
-<span class="userinput">{
- "tail_uuid":"$all_users_group_uuid",
- "head_uuid":"$project_uuid",
- "link_class":"permission",
- "name":"can_read"
-}
-EOF</span>
-</code></pre></notextile>
-
-h3. Download and tag the latest arvados/jobs docker image
-
-The @arvados-cwl-runner@ needs access to an arvados/jobs image that is tagged as 'latest'. The following command downloads the latest arvados/jobs image from Docker Hub, loads it into Keep, and tags it as 'latest'.
-
-<notextile>
-<pre><code>~$ <span class="userinput">arv-keepdocker --pull arvados/jobs latest</span>
-</code></pre></notextile>
-
-If the image needs to be downloaded from Docker Hub, the command can take a few minutes to complete, depending on available network bandwidth.
This is the only configuration required by crunch-dispatch-slurm. The subsections below describe optional configuration flags you can set inside the main configuration object.
-h3. Client::KeepServiceURIs
+h3(#KeepServiceURIs). Client::KeepServiceURIs
Override Keep service discovery with a predefined list of Keep URIs. This can be useful if the compute nodes run a local keepstore that should handle all Keep traffic. Example:
</code></pre>
</notextile>
-h3. PollPeriod
+h3(#PollPeriod). PollPeriod
crunch-dispatch-slurm polls the API server periodically for new containers to run. The @PollPeriod@ option controls how often this poll happens. Set this to a string of numbers suffixed with one of the time units @ns@, @us@, @ms@, @s@, @m@, or @h@. For example:
</code></pre>
</notextile>
-h3. PrioritySpread
+h3(#PrioritySpread). PrioritySpread
crunch-dispatch-slurm adjusts the "nice" values of its SLURM jobs to ensure containers are prioritized correctly relative to one another. This option tunes the adjustment mechanism.
* If non-Arvados jobs run on your SLURM cluster, and your Arvados containers are waiting too long in the SLURM queue because their "nice" values are too high for them to compete with other SLURM jobs, you should use a smaller PrioritySpread value.
</code></pre>
</notextile>
+h3(#SbatchArguments). SbatchArguments
-
-h3. SbatchArguments
-
-When crunch-dispatch-slurm invokes @sbatch@, you can add switches to the command by specifying @SbatchArguments@. You can use this to send the jobs to specific cluster partitions or add resource requests. Set @SbatchArguments@ to an array of strings. For example:
+When crunch-dispatch-slurm invokes @sbatch@, you can add arguments to the command by specifying @SbatchArguments@. You can use this to send the jobs to specific cluster partitions or add resource requests. Set @SbatchArguments@ to an array of strings. For example:
<notextile>
<pre><code class="userinput">SbatchArguments:
</code></pre>
</notextile>
-h3. CrunchRunCommand: Dispatch to SLURM cgroups
+Note: If an argument is supplied multiple times, @slurm@ uses the value of the last occurrence of the argument on the command line. Arguments specified through Arvados are added after the arguments listed in @SbatchArguments@. This means, for example, that an Arvados container that specifies @partitions@ in @scheduling_parameters@ will override an occurrence of @--partition@ in @SbatchArguments@. As a result, for container parameters that can be specified through Arvados, @SbatchArguments@ can be used to specify defaults but not to enforce specific policy.
+
+h3(#CrunchRunCommand-cgroups). CrunchRunCommand: Dispatch to SLURM cgroups
If your SLURM cluster uses the @task/cgroup@ TaskPlugin, you can configure Crunch's Docker containers to be dispatched inside SLURM's cgroups. This provides consistent enforcement of resource constraints. To do this, use a crunch-dispatch-slurm configuration like the following:
</code></pre>
</notextile>
-The choice of subsystem ("memory" in this example) must correspond to one of the resource types enabled in SLURM's @cgroup.conf@. Limits for other resource types will also be respected. The specified subsystem is singled out only to let Crunch determine the name of the cgroup provided by SLURM.
+The choice of subsystem ("memory" in this example) must correspond to one of the resource types enabled in SLURM's @cgroup.conf@. Limits for other resource types will also be respected. The specified subsystem is singled out only to let Crunch determine the name of the cgroup provided by SLURM. When doing this, you should also set "ReserveExtraRAM":#ReserveExtraRAM .
{% include 'notebox_begin' %}
{% include 'notebox_end' %}
-h3. CrunchRunCommand: Using host networking for containers
+h3(#CrunchRunCommand-network). CrunchRunCommand: Using host networking for containers
Older Linux kernels (prior to 3.18) have bugs in network namespace handling which can lead to compute node lockups. This is indicated by blocked kernel tasks in "Workqueue: netns cleanup_net". If you are experiencing this problem, as a workaround you can disable use of network namespaces by Docker across the cluster. Be aware this reduces container isolation, which may be a security risk.
</code></pre>
</notextile>
-h3. MinRetryPeriod: Rate-limit repeated attempts to start containers
+h3(#MinRetryPeriod). MinRetryPeriod: Rate-limit repeated attempts to start containers
If SLURM is unable to run a container, the dispatcher will submit it again after the next PollPeriod. If PollPeriod is very short, this can be excessive. If MinRetryPeriod is set, the dispatcher will avoid submitting the same container to SLURM more than once in the given time span.
</code></pre>
</notextile>
+h3(#ReserveExtraRAM). ReserveExtraRAM: Extra RAM for jobs
+
+Extra RAM to reserve (in bytes) on each SLURM job submitted by Arvados, which is added to the amount specified in the container's @runtime_constraints@. If not provided, the default value is zero. Helpful when using @-cgroup-parent-subsystem@, where @crunch-run@ and @arv-mount@ share the control group memory limit with the user process. In this situation, at least 256MiB is recommended to accommodate each container's @crunch-run@ and @arv-mount@ processes.
+
+<notextile>
+<pre><code class="userinput">ReserveExtraRAM: <b>268435456</b>
+</code></pre>
+</notextile>
+
h2. Restart the dispatcher
{% include 'notebox_begin' %}
SPDX-License-Identifier: CC-BY-SA-3.0
{% endcomment %}
-h2(#slurm). Set up SLURM
-
On the API server, install SLURM and munge, and generate a munge key.
On Debian-based systems:
---
layout: default
navsection: installguide
-title: Installation overview
+title: Installation options
...
{% comment %}
Copyright (C) The Arvados Authors. All rights reserved.
SPDX-License-Identifier: CC-BY-SA-3.0
{% endcomment %}
-Arvados components run on GNU/Linux systems, and do not depend on any particular cloud operating stack. Arvados supports Debian and derivatives such as Ubuntu, as well as Red Hat and derivatives such as CentOS.
+Arvados components run on GNU/Linux systems, and support multiple cloud operating stacks. Arvados supports Debian and derivatives such as Ubuntu, as well as Red Hat and derivatives such as CentOS.
-Arvados components can be installed and configured in a number of different ways. Step-by-step instructions are available to perform a production installation from packages with manual configuration. This method assumes you have several (virtual) machines at your disposal for running the various Arvados components.
+Arvados components can be installed and configured in a number of different ways.
-* "Docker quick start":arvbox.html
-* "Manual installation":install-manual-prerequisites.html
+<div class="offset1">
+table(table table-bordered table-condensed).
+|||\5=. Appropriate for|
+||_. Ease of setup|_. Multiuser/networked access|_. Workflow Development and Testing|_. Large Scale Production|_. Development of Arvados|_. Arvados System Testing|
+|"Arvados-in-a-box":arvbox.html (arvbox)|Easy|no|yes|no|yes|yes|
+|"Arvados on Kubernetes":arvados-on-kubernetes.html|Easy ^1^|yes|yes ^2^|no ^2^|no|yes|
+|"Manual installation":install-manual-prerequisites.html|Complicated|yes|yes|yes|no|no|
+|"Cloud demo":https://cloud.curoverse.com by Veritas Genetics|N/A ^3^|yes|yes|no|no|no|
+|"Cluster Operation Subscription":https://curoverse.com/products by Veritas Genetics|N/A ^3^|yes|yes|yes|yes|yes|
+</div>
+
+* ^1^ Assumes a Kubernetes cluster is available
+* ^2^ Arvados on Kubernetes is under development and not yet ready for production use
+* ^3^ No installation necessary; run and managed by Veritas Genetics
It is not strictly necessary to deploy _both_ SSH and HTTPS access, but we recommend deploying both:
* SSH is a more appropriate way to authenticate from a user's workstation because it does not require managing tokens on the client side;
* HTTPS is a more appropriate way to authenticate from a shell VM because it does not depend on SSH agent forwarding (SSH clients' agent forwarding features tend to behave as if the remote machine is fully trusted).
+* HTTPS is also used by Arvados Composer to access git repositories from the browser.
The HTTPS instructions given below will not work if you skip the SSH setup steps.
<pre><code>gitserver:~$ <span class="userinput">sudo nginx -s reload</span>
</code></pre>
</notextile>
+
+h2. Clone Arvados repository
+
+Here we create a repository object which will be used to set up a hosted clone of the arvados repository on this cluster.
+
+<notextile>
+<pre><code>~$ <span class="userinput">prefix=`arv --format=uuid user current | cut -d- -f1`</span>
+~$ <span class="userinput">echo "Site prefix is '$prefix'"</span>
+~$ <span class="userinput">all_users_group_uuid="$prefix-j7d0g-fffffffffffffff"</span>
+~$ <span class="userinput">repo_uuid=`arv --format=uuid repository create --repository "{\"owner_uuid\":\"$prefix-tpzed-000000000000000\", \"name\":\"arvados\"}"`</span>
+~$ <span class="userinput">echo "Arvados repository uuid is '$repo_uuid'"</span>
+</code></pre></notextile>
+
+Create a link object to make the repository object readable by the "All users" group, and therefore by every active user. This makes it possible for users to run the bundled Crunch scripts by specifying @"script_version":"master","repository":"arvados"@ rather than pulling the Arvados source tree into their own repositories.
+
+<notextile>
+<pre><code>~$ <span class="userinput">read -rd $'\000' newlink <<EOF; arv link create --link "$newlink"</span>
+<span class="userinput">{
+ "tail_uuid":"$all_users_group_uuid",
+ "head_uuid":"$repo_uuid",
+ "link_class":"permission",
+ "name":"can_read"
+}
+EOF</span>
+</code></pre></notextile>
+
+In a couple of minutes, your arvados-git-sync cron job will create an empty repository on your git server. Seed it with the real arvados repository. If your git credential helpers were configured correctly when you "set up your shell server":install-shell-server.html, the "git push" command will use your API token instead of prompting you for a username and password.
+
+<notextile>
+<pre><code>~$ <span class="userinput">cd /tmp</span>
+/tmp$ <span class="userinput">git clone --bare https://github.com/curoverse/arvados.git</span>
+/tmp <span class="userinput">git --git-dir arvados.git push https://git.<b>uuid_prefix.your.domain</b>/arvados.git '*:*'</span>
+</code></pre>
+</notextile>
+
+If you did not set up a HTTPS service, you can push to <code>git@git.uuid_prefix.your.domain:arvados.git</code> using your SSH key, or by logging in to your git server and using sudo.
+
+<notextile>
+<pre><code>gitserver:~$ <span class="userinput">sudo -u git -i bash</span>
+git@gitserver:~$ <span class="userinput">git clone --bare https://github.com/curoverse/arvados.git /tmp/arvados.git</span>
+git@gitserver:~$ <span class="userinput">cd /tmp/arvados.git</span>
+git@gitserver:/tmp/arvados.git$ <span class="userinput">gitolite push /var/lib/arvados/git/repositories/<b>your_arvados_repo_uuid</b>.git '*:*'</span>
+</code></pre>
+</notextile>
--- /dev/null
+---
+layout: default
+navsection: installguide
+title: Choosing which components to install
+...
+
+Arvados consists of many components, some of which may be omitted (at the cost of reduced functionality). It may also be helpful to review the "Arvados Architecture":{{site.baseurl}}/architecture to understand how these components interact.
+
+table(table table-bordered table-condensed).
+|_. Component|_. Description|_. Notes|
+|\3=. *Core*|
+|"Postgres database":install-postgresql.html |Stores data for the API server.|Required.|
+|"API server":install-api-server.html |Core Arvados logic for managing users, groups, collections, containers, and enforcing permissions.|Required.|
+|\3=. *Keep (storage)*|
+|"Keepstore":install-keepstore.html |Stores content-addressed blocks in a variety of backends (local filesystem, cloud object storage).|Required.|
+|"Keepproxy":install-keepproxy.html |Gateway service to access keep servers from external networks.|Required to be able to use arv-put, arv-get, or arv-mount outside the private Arvados network.|
+|"Keep-web":install-keep-web.html |Gateway service providing read/write HTTP and WebDAV support on top of Keep.|Required to be able to download files from Keep over plain HTTP in Workbench.|
+|"Keep-balance":install-keep-balance.html |Storage cluster maintenance daemon responsible for moving blocks to their optimal server location, adjusting block replication levels, and trashing unreferenced blocks.|Required to free deleted data from underlying storage, and to ensure proper replication and block distribution (including support for storage classes).|
+|\3=. *User interface*|
+|"Single Sign On server":install-sso.html |Login server.|Required for web based login to Workbench.|
+|"Workbench":install-workbench-app.html |Primary graphical user interface for working with file collections and running containers.|Optional. Depends on API server, SSO server, keep-web, websockets server.|
+|"Workflow Composer":install-composer.html |Graphical user interface for editing Common Workflow Language workflows.|Optional. Depends on git server (arv-git-httpd).|
+|\3=. *Additional services*|
+|"Websockets server":install-ws.html |Event distribution server.|Required to view streaming container logs in Workbench.|
+|"Shell server":install-shell-server.html |Synchronize (create/delete/configure) Unix shell accounts with Arvados users.|Optional.|
+|"Git server":install-arv-git-httpd.html |Arvados-hosted git repositories, with Arvados-token based authentication.|Optional, but required by Workflow Composer.|
+|\3=. *Crunch (running containers)*|
+|"crunch-dispatch-slurm":crunch2-slurm/install-prerequisites.html |Run analysis workflows using Docker containers distributed across a SLURM cluster.|Optional if you wish to use Arvados for data management only.|
+|"Node Manager":install-nodemanager.html |Allocate and free cloud VM instances on demand based on workload.|Optional, not needed for a static SLURM cluster (such as on-premise HPC).|
--- /dev/null
+---
+layout: default
+navsection: installguide
+title: Install Composer
+...
+
+Arvados Composer is a single-page JavaScript application for building Common Workflow Language (CWL) Workflows.
+
+h2. Prerequisites
+
+In addition to Arvados core services, Composer requires "Arvados hosted git repositories":install-arv-git-httpd.html which are used for storing workflow files.
+
+h2. Install
+
+Composer may be installed on the same host as Workbench, or on a different host. Composer communicates directly with the Arvados API server. It does not require its own backend and should be served as a static file.
+
+On a Debian-based system, install the following package:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo apt-get install arvados-composer</span>
+</code></pre>
+</notextile>
+
+On a Red Hat-based system, install the following package:
+
+<notextile>
+<pre><code>~$ <span class="userinput">sudo yum install arvados-composer</span>
+</code></pre>
+</notextile>
+
+h2. Configure
+
+h3. composer.yml
+
+Edit @/etc/arvados/composer/composer.yml@ and set @apiEndPoint@ to your API server:
+
+<pre>
+apiEndPoint: https://zzzzz.arvadosapi.com
+</pre>
+
+h3. Nginx
+
+Add Composer to your Nginx configuration. This example will host Composer at @/composer@.
+
+<pre>
+location /composer {
+  root /var/www/arvados-composer;
+  index index.html;
+}
+</pre>
+
+h3. Workbench link to composer
+
+Edit the workbench @application.yml@ and set @composer_url@ to the location from which it is served.
+
+<pre>
+production:
+ composer_url: 'https://workbench.zzzzz.arvadosapi.com/composer'
+</pre>
Create an Arvados superuser token for use by keep-balance. *On the API server*, run:
-<notextile>
-<pre><code>apiserver:~$ <span class="userinput">cd /var/www/arvados-api/current</span>
-apiserver:/var/www/arvados-api/current$ <span class="userinput">sudo -u <b>webserver-user</b> RAILS_ENV=production bundle exec script/create_superuser_token.rb</span>
-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
-</code></pre>
-</notextile>
+{% include 'create_superuser_token' %}
h3. Update keepstore configuration files
h3. Tell the API server about the Keepproxy server
-The API server needs to be informed about the presence of your Keepproxy server. Please execute the following commands on your <strong>shell server</strong>.
+The API server needs to be informed about the presence of your Keepproxy server.
+
+First, if you don't already have an admin token, create a superuser token:
+
+{% include 'create_superuser_token' %}
+
+Configure your environment to run @arv@ using the output of create_superuser_token.rb:
+
+<pre>
+export ARVADOS_API_HOST=zzzzz.example.com
+export ARVADOS_API_TOKEN=zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
+</pre>
<notextile>
<pre><code>~$ <span class="userinput">uuid_prefix=`arv --format=uuid user current | cut -d- -f1`</span>
}
EOF</span>
</code></pre></notextile>
+
+h3. Testing keepproxy
+
+Log into a host that is on an external network from your private Arvados network. The host should be able to contact your keepproxy server (e.g. keep.$uuid_prefix.arvadosapi.com), but not your keepstore servers (e.g. keep[0-9].$uuid_prefix.arvadosapi.com).
+
+Install the "Python SDK":{{site.baseurl}}/sdk/python/sdk-python.html
+
+@ARVADOS_API_HOST@ and @ARVADOS_API_TOKEN@ must be set in the environment.
+
+You should now be able to use @arv-put@ to upload collections and @arv-get@ to fetch collections, for an example see "Testing keep.":install-keepstore.html#testing on the keepstore install page.
h3. Tell the API server about the Keepstore servers
-The API server needs to be informed about the presence of your Keepstore servers. For each of the Keepstore servers you have created, please execute the following commands on your <strong>shell server</strong>.
+The API server needs to be informed about the presence of your Keepstore servers.
-Make sure to update the @service_host@ value to match each of your Keepstore servers.
+First, if you don't already have an admin token, create a superuser token:
+
+{% include 'create_superuser_token' %}
+
+Configure your environment to run @arv@ using the output of create_superuser_token.rb:
+
+<pre>
+export ARVADOS_API_HOST=zzzzz.example.com
+export ARVADOS_API_TOKEN=zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
+</pre>
+
+Use this command to register each keepstore server you have installed. Make sure to update the @service_host@ value.
<notextile>
<pre><code>~$ <span class="userinput">uuid_prefix=`arv --format=uuid user current | cut -d- -f1`</span>
~$ <span class="userinput">echo "Site prefix is '$uuid_prefix'"</span>
~$ <span class="userinput">read -rd $'\000' keepservice <<EOF; arv keep_service create --keep-service "$keepservice"</span>
<span class="userinput">{
- "service_host":"<strong>keep0.$prefix.your.domain</strong>",
+ "service_host":"<strong>keep0.$uuid_prefix.your.domain</strong>",
"service_port":25107,
"service_ssl_flag":false,
"service_type":"disk"
}
EOF</span>
</code></pre></notextile>
+
+h3(#testing). Testing keep
+
+Install the "Python SDK":{{site.baseurl}}/sdk/python/sdk-python.html
+
+@ARVADOS_API_HOST@ and @ARVADOS_API_TOKEN@ must be set in the environment.
+
+You should now be able to use @arv-put@ to upload collections and @arv-get@ to fetch collections:
+
+<pre>
+$ echo "hello world!" > hello.txt
+
+$ arv-put --portable-data-hash hello.txt
+2018-07-12 13:35:25 arvados.arv_put[28702] INFO: Creating new cache file at /home/example/.cache/arvados/arv-put/1571ec0adb397c6a18d5c74cc95b3a2a
+0M / 0M 100.0% 2018-07-12 13:35:27 arvados.arv_put[28702] INFO:
+
+2018-07-12 13:35:27 arvados.arv_put[28702] INFO: Collection saved as 'Saved at 2018-07-12 17:35:25 UTC by example@example'
+59389a8f9ee9d399be35462a0f92541c+53
+
+$ arv-get 59389a8f9ee9d399be35462a0f92541c+53/hello.txt
+hello world!
+</pre>
<div class="offset1">
table(table table-bordered table-condensed).
-|_Function_|_Number of nodes_|
+|_. Function|_. Number of nodes|
|Arvados API, Crunch dispatcher, Git, Websockets and Workbench|1|
|Arvados Compute node|1|
|Arvados Keepproxy and Keep-web server|1|
h2. Supported GNU/Linux distributions
table(table table-bordered table-condensed).
-|_Distribution_|_State_|_Last supported version_|
+|_. Distribution|_. State|_. Last supported version|
|CentOS 7|Supported|Latest|
|Debian 8 ("jessie")|Supported|Latest|
|Debian 9 ("stretch")|Supported|Latest|
Configure apt to retrieve packages from the Arvados package repository. This command depends on your OS vendor and version:
table(table table-bordered table-condensed).
-|OS version|Command|
+|_. OS version|_. Command|
|Debian 8 ("jessie")|<notextile><code><span class="userinput">echo "deb http://apt.arvados.org/ jessie main" | sudo tee /etc/apt/sources.list.d/arvados.list</span></code></notextile>|
|Debian 9 ("stretch")|<notextile><code><span class="userinput">echo "deb http://apt.arvados.org/ stretch main" | sudo tee /etc/apt/sources.list.d/arvados.list</span></code></notextile>|
|Ubuntu 14.04 ("trusty")[1]|<notextile><code><span class="userinput">echo "deb http://apt.arvados.org/ trusty main" | sudo tee /etc/apt/sources.list.d/arvados.list</span></code></notextile>|
<div class="offset1">
table(table table-bordered table-condensed).
-|_Function_|_Hostname_|
+|_. Function|_. Hostname|
|Arvados API|@uuid_prefix@.your.domain|
|Arvados Git server|git.@uuid_prefix@.your.domain|
|Arvados Keepproxy server|keep.@uuid_prefix@.your.domain|
Arvados Node Manager provides elastic computing for Arvados and SLURM by creating and destroying virtual machines on demand. Node Manager currently supports Amazon Web Services (AWS), Google Cloud Platform (GCP) and Microsoft Azure.
-Note: node manager is only required for elastic computing cloud environments. Fixed size clusters do not require node manager.
+Note: Node Manager is only required for elastic cloud environments. Fixed-size clusters (such as on-premises HPC) do not require Node Manager.
h2. Install
# an Arvados node that hasn't been updated for this long.
node_stale_after = 14400
+# Number of consecutive times a node must report as "idle" before it
+# will be considered eligible for shutdown. Node status is checked
+# each poll period, and a node can go idle at any point during a poll
+# period (meaning a node could be reported as idle when it has only
+# been idle for 1 second). With a 60 second poll period, three
+# consecutive "idle" status updates indicate the node has been idle
+# for at least 121 seconds.
+consecutive_idle_count = 3
+
# Scaling factor to be applied to nodes' available RAM size. Usually there's a
# variable discrepancy between the advertised RAM value on cloud nodes and the
# actual amount available.
# an Arvados node that hasn't been updated for this long.
node_stale_after = 14400
+# Number of consecutive times a node must report as "idle" before it
+# will be considered eligible for shutdown. Node status is checked
+# each poll period, and a node can go idle at any point during a poll
+# period (meaning a node could be reported as idle when it has only
+# been idle for 1 second). With a 60 second poll period, three
+# consecutive "idle" status updates indicate the node has been idle
+# for at least 121 seconds.
+consecutive_idle_count = 3
+
# Scaling factor to be applied to nodes' available RAM size. Usually there's a
# variable discrepancy between the advertised RAM value on cloud nodes and the
# actual amount available.
# an Arvados node that hasn't been updated for this long.
node_stale_after = 14400
+# Number of consecutive times a node must report as "idle" before it
+# will be considered eligible for shutdown. Node status is checked
+# each poll period, and a node can go idle at any point during a poll
+# period (meaning a node could be reported as idle when it has only
+# been idle for 1 second). With a 60 second poll period, three
+# consecutive "idle" status updates indicate the node has been idle
+# for at least 121 seconds.
+consecutive_idle_count = 3
+
# Scaling factor to be applied to nodes' available RAM size. Usually there's a
# variable discrepancy between the advertised RAM value on cloud nodes and the
# actual amount available.
enableReuse: false
cwltool:Secrets:
secrets: [input1, input2]
+ cwltool:TimeLimit:
+ timelimit: 14400
+ arv:WorkflowRunnerResources:
+ ramMin: 2048
+ coresMin: 2
</pre>
The one exception to this is @arv:APIRequirement@; see the note below.
table(table table-bordered table-condensed).
|_. Field |_. Type |_. Description |
|secrets|array<string>|Input parameters which are considered "secret". Must be strings.|
+
+h2. cwltool:TimeLimit
+
+Set an upper limit on the execution time of a CommandLineTool or ExpressionTool. A tool execution which exceeds the time limit may be preemptively terminated and considered failed. The limit may also be used by batch systems to make scheduling decisions.
+
+table(table table-bordered table-condensed).
+|_. Field |_. Type |_. Description |
+|timelimit|int|Execution time limit in seconds. If set to zero, no limit is enforced.|
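+
+For example, to terminate any tool invocation that runs longer than four hours (a sketch; the @cwltool@ namespace declaration is assumed at the top level of the document):
+
+<pre>
+$namespaces:
+  cwltool: "http://commonwl.org/cwltool#"
+hints:
+  cwltool:TimeLimit:
+    timelimit: 14400
+</pre>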
+
+h2. arv:WorkflowRunnerResources
+
+Specify resource requirements for the workflow runner process (arvados-cwl-runner) that manages a workflow run. Must be applied to the top level workflow. Will also be set implicitly when using @--submit-runner-ram@ on the command line along with @--create-workflow@ or @--update-workflow@. Use this to adjust the runner's allocation if the workflow runner is getting "out of memory" exceptions or being killed by the out-of-memory (OOM) killer.
+
+table(table table-bordered table-condensed).
+|_. Field |_. Type |_. Description |
+|ramMin|int|RAM, in mebibytes, to reserve for the arvados-cwl-runner process. Default 1024 MiB (1 GiB).|
+|coresMin|int|Number of cores to reserve for the arvados-cwl-runner process. Default 1 core.|
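+
+For example, to reserve 2 GiB of RAM and two cores for the runner (a sketch; the hint must appear on the top level workflow):
+
+<pre>
+$namespaces:
+  arv: "http://arvados.org/cwl#"
+hints:
+  arv:WorkflowRunnerResources:
+    ramMin: 2048
+    coresMin: 2
+</pre>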
type Multi map[string]Handler
func (m Multi) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
- if len(args) < 1 {
- fmt.Fprintf(stderr, "usage: %s command [args]\n", prog)
- m.Usage(stderr)
- return 2
- }
_, basename := filepath.Split(prog)
- if strings.HasPrefix(basename, "arvados-") {
- basename = basename[8:]
- } else if strings.HasPrefix(basename, "crunch-") {
- basename = basename[7:]
- }
+ basename = strings.TrimPrefix(basename, "arvados-")
+ basename = strings.TrimPrefix(basename, "crunch-")
if cmd, ok := m[basename]; ok {
return cmd.RunCommand(prog, args, stdin, stdout, stderr)
+ } else if len(args) < 1 {
+ fmt.Fprintf(stderr, "usage: %s command [args]\n", prog)
+ m.Usage(stderr)
+ return 2
} else if cmd, ok = m[args[0]]; ok {
return cmd.RunCommand(prog+" "+args[0], args[1:], stdin, stdout, stderr)
} else {
func (h *Handler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
h.setupOnce.Do(h.setup)
+ if req.Method != "GET" && req.Method != "HEAD" {
+ // http.ServeMux returns 301 with a cleaned path if
+ // the incoming request has a double slash. Some
+ // clients (including the Go standard library) change
+ // the request method to GET when following a 301
+ // redirect if the original method was not HEAD
+ // (RFC7231 6.4.2 specifically allows this in the case
+ // of POST). Thus "POST //foo" gets misdirected to
+ // "GET /foo". To avoid this, eliminate double slashes
+ // before passing the request to ServeMux.
+ for strings.Contains(req.URL.Path, "//") {
+ req.URL.Path = strings.Replace(req.URL.Path, "//", "/", -1)
+ }
+ }
h.handlerStack.ServeHTTP(w, req)
}
})
mux.Handle("/", http.HandlerFunc(h.proxyRailsAPI))
h.handlerStack = mux
+
+ // Changing the global isn't the right way to do this, but a
+ // proper solution would conflict with an impending 13493
+ // merge anyway, so this will do for now.
+ arvados.InsecureHTTPClient.CheckRedirect = func(*http.Request, []*http.Request) error { return http.ErrUseLastResponse }
}
// headers that shouldn't be forwarded when proxying. See
xff = xffIn + "," + xff
}
hdrOut.Set("X-Forwarded-For", xff)
+ if hdrOut.Get("X-Forwarded-Proto") == "" {
+ hdrOut.Set("X-Forwarded-Proto", reqIn.URL.Scheme)
+ }
hdrOut.Add("Via", reqIn.Proto+" arvados-controller")
ctx := reqIn.Context()
reqOut := (&http.Request{
Method: reqIn.Method,
URL: urlOut,
+ Host: reqIn.Host,
Header: hdrOut,
Body: reqIn.Body,
}).WithContext(ctx)
c.Check(err, check.IsNil)
c.Check(jresp["errors"], check.FitsTypeOf, []interface{}{})
}
+
+func (s *HandlerSuite) TestProxyRedirect(c *check.C) {
+ req := httptest.NewRequest("GET", "https://example.org:1234/login?return_to=foo", nil)
+ resp := httptest.NewRecorder()
+ s.handler.ServeHTTP(resp, req)
+ c.Check(resp.Code, check.Equals, http.StatusFound)
+ c.Check(resp.Header().Get("Location"), check.Matches, `https://example\.org:1234/auth/joshid\?return_to=foo&?`)
+}
needRAM := ctr.RuntimeConstraints.RAM + ctr.RuntimeConstraints.KeepCacheRAM
needRAM = (needRAM * 100) / int64(100-discountConfiguredRAMPercent)
- availableTypes := make([]arvados.InstanceType, len(cc.InstanceTypes))
- copy(availableTypes, cc.InstanceTypes)
- sort.Slice(availableTypes, func(a, b int) bool {
- return availableTypes[a].Price < availableTypes[b].Price
- })
- err = ConstraintsNotSatisfiableError{
- errors.New("constraints not satisfiable by any configured instance type"),
- availableTypes,
- }
+ ok := false
for _, it := range cc.InstanceTypes {
switch {
- case err == nil && it.Price > best.Price:
- case it.Scratch < needScratch:
- case it.RAM < needRAM:
+ case ok && it.Price > best.Price:
+ case int64(it.Scratch) < needScratch:
+ case int64(it.RAM) < needRAM:
case it.VCPUs < needVCPUs:
case it.Preemptible != ctr.SchedulingParameters.Preemptible:
case it.Price == best.Price && (it.RAM < best.RAM || it.VCPUs < best.VCPUs):
default:
// Lower price || (same price && better specs)
best = it
- err = nil
+ ok = true
+ }
+ }
+ if !ok {
+ availableTypes := make([]arvados.InstanceType, 0, len(cc.InstanceTypes))
+ for _, t := range cc.InstanceTypes {
+ availableTypes = append(availableTypes, t)
}
+ sort.Slice(availableTypes, func(a, b int) bool {
+ return availableTypes[a].Price < availableTypes[b].Price
+ })
+ err = ConstraintsNotSatisfiableError{
+ errors.New("constraints not satisfiable by any configured instance type"),
+ availableTypes,
+ }
+ return
}
return
}
var _ = check.Suite(&NodeSizeSuite{})
-const GiB = int64(1 << 30)
+const GiB = arvados.ByteSize(1 << 30)
type NodeSizeSuite struct{}
func (*NodeSizeSuite) TestChooseUnsatisfiable(c *check.C) {
checkUnsatisfiable := func(ctr *arvados.Container) {
- _, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: []arvados.InstanceType{
- {Price: 1.1, RAM: 1000000000, VCPUs: 2, Name: "small1"},
- {Price: 2.2, RAM: 2000000000, VCPUs: 4, Name: "small2"},
- {Price: 4.4, RAM: 4000000000, VCPUs: 8, Name: "small4", Scratch: GiB},
+ _, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: map[string]arvados.InstanceType{
+ "small1": {Price: 1.1, RAM: 1000000000, VCPUs: 2, Name: "small1"},
+ "small2": {Price: 2.2, RAM: 2000000000, VCPUs: 4, Name: "small2"},
+ "small4": {Price: 4.4, RAM: 4000000000, VCPUs: 8, Name: "small4", Scratch: GiB},
}}, ctr)
c.Check(err, check.FitsTypeOf, ConstraintsNotSatisfiableError{})
}
checkUnsatisfiable(&arvados.Container{RuntimeConstraints: rc})
}
checkUnsatisfiable(&arvados.Container{
- Mounts: map[string]arvados.Mount{"/tmp": {Kind: "tmp", Capacity: 2 * GiB}},
+ Mounts: map[string]arvados.Mount{"/tmp": {Kind: "tmp", Capacity: int64(2 * GiB)}},
RuntimeConstraints: arvados.RuntimeConstraints{RAM: 12345, VCPUs: 1},
})
}
func (*NodeSizeSuite) TestChoose(c *check.C) {
- for _, menu := range [][]arvados.InstanceType{
+ for _, menu := range []map[string]arvados.InstanceType{
{
- {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Name: "costly"},
- {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best"},
- {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Name: "small"},
+ "costly": {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Name: "costly"},
+ "best": {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best"},
+ "small": {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Name: "small"},
},
{
- {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Name: "costly"},
- {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "goodenough"},
- {Price: 2.2, RAM: 4000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best"},
- {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Name: "small"},
+ "costly": {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Name: "costly"},
+ "goodenough": {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "goodenough"},
+ "best": {Price: 2.2, RAM: 4000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best"},
+ "small": {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Name: "small"},
},
{
- {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Name: "small"},
- {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "goodenough"},
- {Price: 2.2, RAM: 4000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best"},
- {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Name: "costly"},
+ "small": {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Name: "small"},
+ "goodenough": {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "goodenough"},
+ "best": {Price: 2.2, RAM: 4000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best"},
+ "costly": {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Name: "costly"},
},
{
- {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: GiB, Name: "small"},
- {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: GiB, Name: "nearly"},
- {Price: 3.3, RAM: 4000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best"},
- {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Name: "costly"},
+ "small": {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: GiB, Name: "small"},
+ "nearly": {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: GiB, Name: "nearly"},
+ "best": {Price: 3.3, RAM: 4000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "best"},
+ "costly": {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Name: "costly"},
},
} {
best, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{
Mounts: map[string]arvados.Mount{
- "/tmp": {Kind: "tmp", Capacity: 2 * GiB},
+ "/tmp": {Kind: "tmp", Capacity: 2 * int64(GiB)},
},
RuntimeConstraints: arvados.RuntimeConstraints{
VCPUs: 2,
}
}
-func (*NodeSizeSuite) TestChoosePreemptible(c *check.C) {
- menu := []arvados.InstanceType{
- {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Preemptible: true, Name: "costly"},
- {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "almost best"},
- {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Preemptible: true, Name: "best"},
- {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Preemptible: true, Name: "small"},
+func (*NodeSizeSuite) TestChoosePreemptible(c *check.C) {
+ menu := map[string]arvados.InstanceType{
+ "costly": {Price: 4.4, RAM: 4000000000, VCPUs: 8, Scratch: 2 * GiB, Preemptible: true, Name: "costly"},
+ "almost best": {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Name: "almost best"},
+ "best": {Price: 2.2, RAM: 2000000000, VCPUs: 4, Scratch: 2 * GiB, Preemptible: true, Name: "best"},
+ "small": {Price: 1.1, RAM: 1000000000, VCPUs: 2, Scratch: 2 * GiB, Preemptible: true, Name: "small"},
}
best, err := ChooseInstanceType(&arvados.Cluster{InstanceTypes: menu}, &arvados.Container{
Mounts: map[string]arvados.Mount{
- "/tmp": {Kind: "tmp", Capacity: 2 * GiB},
+ "/tmp": {Kind: "tmp", Capacity: 2 * int64(GiB)},
},
RuntimeConstraints: arvados.RuntimeConstraints{
VCPUs: 2,
--- /dev/null
+.onLoad <- function(libName, pkgName)
+{
+ minAllowedRVersion <- "3.3.0"
+ currentRVersion <- getRversion()
+
+ if(currentRVersion < minAllowedRVersion)
+ print(paste0("Minimum R version required to run ", pkgName, " is ",
+ minAllowedRVersion, ". Your current version is ",
+ toString(currentRVersion), ". Please update R and try again."))
+}
apt-get install build-essential libxml2-dev libssl-dev libcurl4-gnutls-dev
```
+Minimum R version required to run ArvadosR is 3.3.0.
+
### Usage
s.add_runtime_dependency 'json', '>= 1.7.7', '<3'
s.add_runtime_dependency 'trollop', '~> 2.0'
s.add_runtime_dependency 'andand', '~> 1.3', '>= 1.3.3'
- s.add_runtime_dependency 'oj', '~> 2.0', '>= 2.0.3'
+ s.add_runtime_dependency 'oj', '~> 3.0'
s.add_runtime_dependency 'curb', '~> 0.8'
s.homepage =
'https://arvados.org'
parser.add_argument("--submit-runner-ram", type=int,
help="RAM (in MiB) required for the workflow runner job (default 1024)",
- default=1024)
+ default=None)
parser.add_argument("--submit-runner-image", type=str,
help="Docker image for workflow runner job, default arvados/jobs:%s" % __version__,
if api_client is None:
api_client = arvados.safeapi.ThreadSafeApiCache(api_params={"model": OrderedJsonModel()}, keep_params={"num_retries": 4})
keep_client = api_client.keep
+ # Make an API call now so errors are reported early.
+ api_client.users().current().execute()
if keep_client is None:
keep_client = arvados.keep.KeepClient(api_client=api_client, num_retries=4)
runner = ArvCwlRunner(api_client, arvargs, keep_client=keep_client, num_retries=4)
"_type": "@id"
refScope: 0
+- name: cwltool:TimeLimit
+ type: record
+ inVocab: false
+ extends: cwl:ProcessRequirement
+ doc: |
+ Set an upper limit on the execution time of a CommandLineTool or
+ ExpressionTool. A tool execution which exceeds the time limit may
+ be preemptively terminated and considered failed. May also be
+ used by batch systems to make scheduling decisions.
+ fields:
+ - name: class
+ type: string
+ doc: "Always 'TimeLimit'"
+ jsonldPredicate:
+ "_id": "@type"
+ "_type": "@vocab"
+ - name: timelimit
+ type: [long, string]
+ doc: |
+ The time limit, in seconds. A time limit of zero means no
+ time limit. Negative time limits are an error.
+
- name: RunInSingleContainer
type: record
extends: cwl:ProcessRequirement
_type: "@vocab"
- name: enableReuse
type: boolean
+
+- name: WorkflowRunnerResources
+ type: record
+ extends: cwl:ProcessRequirement
+ inVocab: false
+ doc: |
+ Specify memory or cores resource request for the CWL runner process itself.
+ fields:
+ class:
+ type: string
+ doc: "Always 'arv:WorkflowRunnerResources'"
+ jsonldPredicate:
+ _id: "@type"
+ _type: "@vocab"
+ ramMin:
+ type: int?
+ doc: Minimum RAM, in mebibytes (2**20)
+ jsonldPredicate: "https://w3id.org/cwl/cwl#ResourceRequirement/ramMin"
+ coresMin:
+ type: int?
+ doc: Minimum cores allocated to cwl-runner
+ jsonldPredicate: "https://w3id.org/cwl/cwl#ResourceRequirement/coresMin"
\ No newline at end of file
import ciso8601
import uuid
+from arvados_cwl.util import get_current_container, get_intermediate_collection_info
import ruamel.yaml as yaml
from cwltool.errors import WorkflowException
keepemptydirs(vwd)
- with Perf(metrics, "generatefiles.save_new %s" % self.name):
- vwd.save_new()
+ if not runtimeContext.current_container:
+ runtimeContext.current_container = get_current_container(self.arvrunner.api, self.arvrunner.num_retries, logger)
+ info = get_intermediate_collection_info(self.name, runtimeContext.current_container, runtimeContext.intermediate_output_ttl)
+ vwd.save_new(name=info["name"],
+ owner_uuid=self.arvrunner.project_uuid,
+ ensure_unique_name=True,
+ trash_at=info["trash_at"],
+ properties=info["properties"])
prev = None
for f, p in sorteditems:
if self.output_ttl < 0:
raise WorkflowException("Invalid value %d for output_ttl, cannot be less than zero" % container_request["output_ttl"])
+ if self.timelimit is not None:
+ scheduling_parameters["max_run_time"] = self.timelimit
+
+ container_request["output_name"] = "Output for step %s" % (self.name)
container_request["output_ttl"] = self.output_ttl
container_request["mounts"] = mounts
container_request["secret_mounts"] = secret_mounts
},
"secret_mounts": secret_mounts,
"runtime_constraints": {
- "vcpus": 1,
+ "vcpus": self.submit_runner_cores,
"ram": 1024*1024 * self.submit_runner_ram,
"API": True
},
from schema_salad.sourceline import SourceLine
+from arvados_cwl.util import get_current_container, get_intermediate_collection_info
import ruamel.yaml as yaml
import arvados.collection
if vwd:
with Perf(metrics, "generatefiles.save_new %s" % self.name):
- vwd.save_new()
+ if not runtimeContext.current_container:
+ runtimeContext.current_container = get_current_container(self.arvrunner.api, self.arvrunner.num_retries, logger)
+ info = get_intermediate_collection_info(self.name, runtimeContext.current_container, runtimeContext.intermediate_output_ttl)
+ vwd.save_new(name=info["name"],
+ owner_uuid=self.arvrunner.project_uuid,
+ ensure_unique_name=True,
+ trash_at=info["trash_at"],
+ properties=info["properties"])
for f, p in generatemapper.items():
if p.type == "File":
upload_dependencies(arvRunner, name, tool.doc_loader,
packed, tool.tool["id"], False)
- # TODO nowhere for submit_runner_ram to go.
+ if submit_runner_ram:
+ hints = main.get("hints", [])
+ found = False
+ for h in hints:
+ if h["class"] == "http://arvados.org/cwl#WorkflowRunnerResources":
+ h["ramMin"] = submit_runner_ram
+ found = True
+ break
+ if not found:
+ hints.append({"class": "http://arvados.org/cwl#WorkflowRunnerResources",
+ "ramMin": submit_runner_ram})
+ main["hints"] = hints
body = {
"workflow": {
"name": name,
"description": tool.tool.get("doc", ""),
- "definition":yaml.round_trip_dump(packed)
+ "definition":json.dumps(packed, sort_keys=True, indent=4, separators=(',',': '))
}}
if project_uuid:
body["workflow"]["owner_uuid"] = project_uuid
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
from cwltool.context import LoadingContext, RuntimeContext
class ArvLoadingContext(LoadingContext):
self.wait = True
self.cwl_runner_job = None
self.storage_classes = "default"
+ self.current_container = None
super(ArvRuntimeContext, self).__init__(kwargs)
import os
import urllib
+from arvados_cwl.util import get_current_container, get_intermediate_collection_info
import arvados.commands.run
import arvados.collection
from schema_salad.sourceline import SourceLine
+from arvados.errors import ApiError
from cwltool.pathmapper import PathMapper, MapperEnt, abspath, adjustFileObjs, adjustDirObjs
from cwltool.workflow import WorkflowException
for l in srcobj.get("listing", []):
self.addentry(l, c, ".", remap)
- check = self.arvrunner.api.collections().list(filters=[["portable_data_hash", "=", c.portable_data_hash()]], limit=1).execute(num_retries=self.arvrunner.num_retries)
- if not check["items"]:
- c.save_new(owner_uuid=self.arvrunner.project_uuid)
+ container = get_current_container(self.arvrunner.api, self.arvrunner.num_retries, logger)
+ info = get_intermediate_collection_info(None, container, self.arvrunner.intermediate_output_ttl)
+
+ c.save_new(name=info["name"],
+ owner_uuid=self.arvrunner.project_uuid,
+ ensure_unique_name=True,
+ trash_at=info["trash_at"],
+ properties=info["properties"])
ab = self.collection_pattern % c.portable_data_hash()
self._pathmap[srcobj["location"]] = MapperEnt("keep:"+c.portable_data_hash(), ab, "Directory", True)
num_retries=self.arvrunner.num_retries )
self.addentry(srcobj, c, ".", remap)
- check = self.arvrunner.api.collections().list(filters=[["portable_data_hash", "=", c.portable_data_hash()]], limit=1).execute(num_retries=self.arvrunner.num_retries)
- if not check["items"]:
- c.save_new(owner_uuid=self.arvrunner.project_uuid)
+ container = get_current_container(self.arvrunner.api, self.arvrunner.num_retries, logger)
+ info = get_intermediate_collection_info(None, container, self.arvrunner.intermediate_output_ttl)
+
+ c.save_new(name=info["name"],
+ owner_uuid=self.arvrunner.project_uuid,
+ ensure_unique_name=True,
+ trash_at=info["trash_at"],
+ properties=info["properties"])
ab = self.file_pattern % (c.portable_data_hash(), srcobj["basename"])
self._pathmap[srcobj["location"]] = MapperEnt("keep:%s/%s" % (c.portable_data_hash(), srcobj["basename"]),
else:
return None
+
class StagingPathMapper(PathMapper):
_follow_dirs = True
from functools import partial
import logging
import json
-import subprocess
+import subprocess32 as subprocess
from collections import namedtuple
from StringIO import StringIO
self.priority = priority
self.secret_store = secret_store
+ self.submit_runner_cores = 1
+ self.submit_runner_ram = 1024 # default 1 GiB
+
+ runner_resource_req, _ = self.tool.get_requirement("http://arvados.org/cwl#WorkflowRunnerResources")
+ if runner_resource_req:
+ if runner_resource_req.get("coresMin"):
+ self.submit_runner_cores = runner_resource_req["coresMin"]
+ if runner_resource_req.get("ramMin"):
+ self.submit_runner_ram = runner_resource_req["ramMin"]
+
if submit_runner_ram:
+ # Command line / initializer overrides default and/or spec from workflow
self.submit_runner_ram = submit_runner_ram
- else:
- self.submit_runner_ram = 3000
if self.submit_runner_ram <= 0:
- raise Exception("Value of --submit-runner-ram must be greater than zero")
+ raise Exception("Value of submit-runner-ram must be greater than zero")
+
+ if self.submit_runner_cores <= 0:
+ raise Exception("Value of submit-runner-cores must be greater than zero")
self.merged_map = merged_map or {}
--- /dev/null
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import datetime
+from arvados.errors import ApiError
+
+def get_intermediate_collection_info(workflow_step_name, current_container, intermediate_output_ttl):
+ if workflow_step_name:
+ name = "Intermediate collection for step %s" % (workflow_step_name)
+ else:
+ name = "Intermediate collection"
+ trash_time = None
+ if intermediate_output_ttl > 0:
+ trash_time = datetime.datetime.utcnow() + datetime.timedelta(seconds=intermediate_output_ttl)
+ container_uuid = None
+ if current_container:
+ container_uuid = current_container['uuid']
+ props = {"type": "intermediate", "container": container_uuid}
+
+ return {"name" : name, "trash_at" : trash_time, "properties" : props}
+
+def get_current_container(api, num_retries=0, logger=None):
+ current_container = None
+ try:
+ current_container = api.containers().current().execute(num_retries=num_retries)
+ except ApiError as e:
+ # Status code 404 just means we're not running in a container.
+ if e.resp.status != 404:
+ if logger:
+ logger.info("Getting current container: %s", e)
+ raise e
+ return current_container
'ruamel.yaml >=0.13.11, <0.15',
'arvados-python-client>=1.1.4.20180607143841',
'setuptools',
- 'ciso8601 >=1.0.6, <2.0.0'
+ 'ciso8601 >=1.0.6, <2.0.0',
+ 'subprocess32>=3.5.1',
],
data_files=[
('share/doc/arvados-cwl-runner', ['LICENSE-2.0.txt', 'README.rst']),
],
test_suite='tests',
- tests_require=['mock>=1.0'],
+ tests_require=[
+ 'mock>=1.0',
+ 'subprocess32>=3.5.1',
+ ],
zip_safe=True
)
#
# SPDX-License-Identifier: Apache-2.0
-cwlVersion: v1.0
-$graph:
-- class: Workflow
- inputs: []
- outputs: []
- steps:
- - in: []
- out: []
- run: '#step1.cwl'
- id: '#main/step1'
- - in: []
- out: []
- run: '#step2.cwl'
- id: '#main/step2'
- id: '#main'
-- class: CommandLineTool
- inputs:
- - type: File
- default:
- class: File
- location: keep:b9fca8bf06b170b8507b80b2564ee72b+57/a.txt
- id: '#step1.cwl/a'
- - type: File
- default:
- class: File
- location: keep:b9fca8bf06b170b8507b80b2564ee72b+57/b.txt
- id: '#step1.cwl/b'
- outputs: []
- arguments: [echo, $(inputs.a), $(inputs.b)]
- id: '#step1.cwl'
-- class: CommandLineTool
- inputs:
- - type: File
- default:
- class: File
- location: keep:8e2d09a066d96cdffdd2be41579e4e2e+57/b.txt
- id: '#step2.cwl/b'
- - type: File
- default:
- class: File
- location: keep:8e2d09a066d96cdffdd2be41579e4e2e+57/c.txt
- id: '#step2.cwl/c'
- outputs: []
- arguments: [echo, $(inputs.c), $(inputs.b)]
- id: '#step2.cwl'
+{
+ "$graph": [
+ {
+ "class": "Workflow",
+ "id": "#main",
+ "inputs": [],
+ "outputs": [],
+ "steps": [
+ {
+ "id": "#main/step1",
+ "in": [],
+ "out": [],
+ "run": "#step1.cwl"
+ },
+ {
+ "id": "#main/step2",
+ "in": [],
+ "out": [],
+ "run": "#step2.cwl"
+ }
+ ]
+ },
+ {
+ "arguments": [
+ "echo",
+ "$(inputs.a)",
+ "$(inputs.b)"
+ ],
+ "class": "CommandLineTool",
+ "id": "#step1.cwl",
+ "inputs": [
+ {
+ "default": {
+ "class": "File",
+ "location": "keep:b9fca8bf06b170b8507b80b2564ee72b+57/a.txt"
+ },
+ "id": "#step1.cwl/a",
+ "type": "File"
+ },
+ {
+ "default": {
+ "class": "File",
+ "location": "keep:b9fca8bf06b170b8507b80b2564ee72b+57/b.txt"
+ },
+ "id": "#step1.cwl/b",
+ "type": "File"
+ }
+ ],
+ "outputs": []
+ },
+ {
+ "arguments": [
+ "echo",
+ "$(inputs.c)",
+ "$(inputs.b)"
+ ],
+ "class": "CommandLineTool",
+ "id": "#step2.cwl",
+ "inputs": [
+ {
+ "default": {
+ "class": "File",
+ "location": "keep:8e2d09a066d96cdffdd2be41579e4e2e+57/b.txt"
+ },
+ "id": "#step2.cwl/b",
+ "type": "File"
+ },
+ {
+ "default": {
+ "class": "File",
+ "location": "keep:8e2d09a066d96cdffdd2be41579e4e2e+57/c.txt"
+ },
+ "id": "#step2.cwl/c",
+ "type": "File"
+ }
+ ],
+ "outputs": []
+ }
+ ],
+ "cwlVersion": "v1.0"
+}
\ No newline at end of file
--- /dev/null
+class: CommandLineTool
+cwlVersion: v1.0
+requirements:
+ InitialWorkDirRequirement:
+ listing:
+ - $(inputs.inp1)
+ - $(inputs.inp2)
+ - $(inputs.inp3)
+inputs:
+ inp1: File
+ inp2: [File, Directory]
+ inp3: Directory
+outputs: []
+arguments: [echo, $(inputs.inp1), $(inputs.inp2), $(inputs.inp3)]
--- /dev/null
+cwlVersion: v1.0
+class: Workflow
+$namespaces:
+ arv: "http://arvados.org/cwl#"
+requirements:
+ SubworkflowFeatureRequirement: {}
+inputs:
+ inp1:
+ type: File
+ default:
+ class: File
+ location: hello1.txt
+ inp2:
+ type: [File, Directory]
+ default:
+ class: File
+ basename: "hello2.txt"
+ contents: "Hello world"
+ inp3:
+ type: [File, Directory]
+ default:
+ class: Directory
+ basename: inp3
+ listing:
+ - class: File
+ basename: "hello3.txt"
+ contents: "hello world"
+outputs: []
+steps:
+ step1:
+ requirements:
+ arv:RunInSingleContainer: {}
+ in:
+ inp1: inp1
+ inp2: inp2
+ inp3: inp3
+ out: []
+ run: subwf.cwl
--- /dev/null
+cwlVersion: v1.0
+class: Workflow
+inputs:
+ inp1: File
+ inp2: File
+ inp3: Directory
+outputs: []
+steps:
+ step1:
+ in:
+ inp1: inp1
+ inp2: inp2
+ inp3: inp3
+ out: []
+ run: echo.cwl
logging.getLogger('arvados.cwl-runner').setLevel(logging.WARN)
logging.getLogger('arvados.arv-run').setLevel(logging.WARN)
-
class TestContainer(unittest.TestCase):
def helper(self, runner, enable_reuse=True):
"capacity": 1073741824 }
},
'state': 'Committed',
+ 'output_name': 'Output for step test_run_'+str(enable_reuse),
'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
'output_path': '/var/spool/cwl',
'output_ttl': 0,
"capacity": 5242880000 }
},
'state': 'Committed',
+ 'output_name': 'Output for step test_resource_requirements',
'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
'output_path': '/var/spool/cwl',
'output_ttl': 7200,
}
},
'state': 'Committed',
+ 'output_name': 'Output for step test_initial_work_dir',
'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
'output_path': '/var/spool/cwl',
'output_ttl': 0,
},
},
'state': 'Committed',
+ "output_name": "Output for step test_run_redirect",
'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
'output_path': '/var/spool/cwl',
'output_ttl': 0,
"capacity": 1073741824 }
},
'state': 'Committed',
+ 'output_name': 'Output for step test_run_mounts',
'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
'output_path': '/var/spool/cwl',
'output_ttl': 0,
"capacity": 1073741824 }
},
'state': 'Committed',
+ 'output_name': 'Output for step test_secrets',
'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
'output_path': '/var/spool/cwl',
'output_ttl': 0,
}
}
}))
+
+ # The test passes no builder.resources
+ # Hence the default resources will apply: {'cores': 1, 'ram': 1024, 'outdirSize': 1024, 'tmpdirSize': 1024}
+ @mock.patch("arvados.commands.keepdocker.list_images_in_arv")
+ def test_timelimit(self, keepdocker):
+ arv_docker_clear_cache()
+
+ runner = mock.MagicMock()
+ runner.project_uuid = "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
+ runner.ignore_docker_for_reuse = False
+ runner.intermediate_output_ttl = 0
+ runner.secret_store = cwltool.secrets.SecretStore()
+
+ keepdocker.return_value = [("zzzzz-4zz18-zzzzzzzzzzzzzz3", "")]
+ runner.api.collections().get().execute.return_value = {
+ "portable_data_hash": "99999999999999999999999999999993+99"}
+
+ tool = cmap({
+ "inputs": [],
+ "outputs": [],
+ "baseCommand": "ls",
+ "arguments": [{"valueFrom": "$(runtime.outdir)"}],
+ "id": "#",
+ "class": "CommandLineTool",
+ "hints": [
+ {
+ "class": "http://commonwl.org/cwltool#TimeLimit",
+ "timelimit": 42
+ }
+ ]
+ })
+
+ loadingContext, runtimeContext = self.helper(runner)
+ runtimeContext.name = "test_timelimit"
+
+ arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, loadingContext)
+ arvtool.formatgraph = None
+
+ for j in arvtool.job({}, mock.MagicMock(), runtimeContext):
+ j.run(runtimeContext)
+
+ _, kwargs = runner.api.container_requests().create.call_args
+ self.assertEqual(42, kwargs['body']['scheduling_parameters'].get('max_run_time'))
stubs.expect_container_request_uuid + '\n')
+ @stubs
+ def test_submit_wf_runner_resources(self, stubs):
+ capture_stdout = cStringIO.StringIO()
+ try:
+ exited = arvados_cwl.main(
+ ["--submit", "--no-wait", "--api=containers", "--debug",
+ "tests/wf/submit_wf_runner_resources.cwl", "tests/submit_test_job.json"],
+ capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
+ self.assertEqual(exited, 0)
+ except:
+ logging.exception("")
+
+ expect_container = copy.deepcopy(stubs.expect_container_spec)
+ expect_container["runtime_constraints"] = {
+ "API": True,
+ "vcpus": 2,
+ "ram": 2000 * 2**20
+ }
+ expect_container["name"] = "submit_wf_runner_resources.cwl"
+ expect_container["mounts"]["/var/lib/cwl/workflow.json"]["content"]["$graph"][1]["hints"] = [
+ {
+ "class": "http://arvados.org/cwl#WorkflowRunnerResources",
+ "coresMin": 2,
+ "ramMin": 2000
+ }
+ ]
+ expect_container["mounts"]["/var/lib/cwl/workflow.json"]["content"]["$graph"][0]["$namespaces"] = {
+ "arv": "http://arvados.org/cwl#",
+ }
+
+ stubs.api.container_requests().create.assert_called_with(
+ body=JsonDiffMatcher(expect_container))
+ self.assertEqual(capture_stdout.getvalue(),
+ stubs.expect_container_request_uuid + '\n')
+
+
@mock.patch("arvados.commands.keepdocker.find_one_image_hash")
@mock.patch("cwltool.docker.DockerCommandLineJob.get_image")
@mock.patch("arvados.api")
--- /dev/null
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+import mock
+import datetime
+import httplib2
+
+from arvados_cwl.util import *
+from arvados.errors import ApiError
+
+class MockDateTime(datetime.datetime):
+ @classmethod
+ def utcnow(cls):
+ return datetime.datetime(2018, 1, 1, 0, 0, 0, 0)
+
+datetime.datetime = MockDateTime
+
+class TestUtil(unittest.TestCase):
+ def test_get_intermediate_collection_info(self):
+ name = "one"
+ current_container = {"uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz"}
+ intermediate_output_ttl = 120
+
+ info = get_intermediate_collection_info(name, current_container, intermediate_output_ttl)
+
+ self.assertEqual(info["name"], "Intermediate collection for step one")
+ self.assertEqual(info["trash_at"], datetime.datetime(2018, 1, 1, 0, 2, 0, 0))
+ self.assertEqual(info["properties"], {"type" : "intermediate", "container" : "zzzzz-8i9sb-zzzzzzzzzzzzzzz"})
+
+ def test_get_current_container_success(self):
+ api = mock.MagicMock()
+ api.containers().current().execute.return_value = {"uuid" : "zzzzz-8i9sb-zzzzzzzzzzzzzzz"}
+
+ current_container = get_current_container(api)
+
+ self.assertEqual(current_container, {"uuid" : "zzzzz-8i9sb-zzzzzzzzzzzzzzz"})
+
+ def test_get_current_container_error(self):
+ api = mock.MagicMock()
+ api.containers().current().execute.side_effect = ApiError(httplib2.Response({"status": 300}), "")
+ logger = mock.MagicMock()
+
+ self.assertRaises(ApiError, get_current_container, api, num_retries=0, logger=logger)
#
# SPDX-License-Identifier: Apache-2.0
-cwlVersion: v1.0
-$graph:
-- class: CommandLineTool
- requirements:
- - class: DockerRequirement
- dockerPull: debian:8
- inputs:
- - id: '#submit_tool.cwl/x'
- type: File
- default:
- class: File
- location: keep:5d373e7629203ce39e7c22af98a0f881+52/blub.txt
- inputBinding:
- position: 1
- outputs: []
- baseCommand: cat
- id: '#submit_tool.cwl'
-- class: Workflow
- inputs:
- - id: '#main/x'
- type: File
- default: {class: File, location: keep:169f39d466a5438ac4a90e779bf750c7+53/blorp.txt,
- size: 16, basename: blorp.txt, nameroot: blorp, nameext: .txt}
- - id: '#main/y'
- type: Directory
- default: {class: Directory, location: keep:99999999999999999999999999999998+99,
- basename: 99999999999999999999999999999998+99}
- - id: '#main/z'
- type: Directory
- default: {class: Directory, basename: anonymous, listing: [{basename: renamed.txt,
- class: File, location: keep:99999999999999999999999999999998+99/file1.txt,
- nameroot: renamed, nameext: .txt}]}
- outputs: []
- steps:
- - id: '#main/step1'
- in:
- - {id: '#main/step1/x', source: '#main/x'}
- out: []
- run: '#submit_tool.cwl'
- id: '#main'
+{
+ "$graph": [
+ {
+ "baseCommand": "cat",
+ "class": "CommandLineTool",
+ "id": "#submit_tool.cwl",
+ "inputs": [
+ {
+ "default": {
+ "class": "File",
+ "location": "keep:5d373e7629203ce39e7c22af98a0f881+52/blub.txt"
+ },
+ "id": "#submit_tool.cwl/x",
+ "inputBinding": {
+ "position": 1
+ },
+ "type": "File"
+ }
+ ],
+ "outputs": [],
+ "requirements": [
+ {
+ "class": "DockerRequirement",
+ "dockerPull": "debian:8"
+ }
+ ]
+ },
+ {
+ "class": "Workflow",
+ "id": "#main",
+ "inputs": [
+ {
+ "default": {
+ "basename": "blorp.txt",
+ "class": "File",
+ "location": "keep:169f39d466a5438ac4a90e779bf750c7+53/blorp.txt",
+ "nameext": ".txt",
+ "nameroot": "blorp",
+ "size": 16
+ },
+ "id": "#main/x",
+ "type": "File"
+ },
+ {
+ "default": {
+ "basename": "99999999999999999999999999999998+99",
+ "class": "Directory",
+ "location": "keep:99999999999999999999999999999998+99"
+ },
+ "id": "#main/y",
+ "type": "Directory"
+ },
+ {
+ "default": {
+ "basename": "anonymous",
+ "class": "Directory",
+ "listing": [
+ {
+ "basename": "renamed.txt",
+ "class": "File",
+ "location": "keep:99999999999999999999999999999998+99/file1.txt",
+ "nameext": ".txt",
+ "nameroot": "renamed"
+ }
+ ]
+ },
+ "id": "#main/z",
+ "type": "Directory"
+ }
+ ],
+ "outputs": [],
+ "steps": [
+ {
+ "id": "#main/step1",
+ "in": [
+ {
+ "id": "#main/step1/x",
+ "source": "#main/x"
+ }
+ ],
+ "out": [],
+ "run": "#submit_tool.cwl"
+ }
+ ]
+ }
+ ],
+ "cwlVersion": "v1.0"
+}
\ No newline at end of file
--- /dev/null
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Test case for arvados-cwl-runner
+#
+# Used to test whether scanning a workflow file for dependencies
+# (e.g. submit_tool.cwl) and uploading to Keep works as intended.
+
+class: Workflow
+cwlVersion: v1.0
+$namespaces:
+ arv: "http://arvados.org/cwl#"
+hints:
+ arv:WorkflowRunnerResources:
+ ramMin: 2000
+ coresMin: 2
+inputs:
+ - id: x
+ type: File
+ - id: y
+ type: Directory
+ - id: z
+ type: Directory
+outputs: []
+steps:
+ - id: step1
+ in:
+ - { id: x, source: "#x" }
+ out: []
+ run: ../tool/submit_tool.cwl
--- /dev/null
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package arvados
+
+import (
+ "encoding/json"
+ "fmt"
+ "math"
+ "strings"
+)
+
+type ByteSize int64
+
+var prefixValue = map[string]int64{
+ "": 1,
+ "K": 1000,
+ "Ki": 1 << 10,
+ "M": 1000000,
+ "Mi": 1 << 20,
+ "G": 1000000000,
+ "Gi": 1 << 30,
+ "T": 1000000000000,
+ "Ti": 1 << 40,
+ "P": 1000000000000000,
+ "Pi": 1 << 50,
+ "E": 1000000000000000000,
+ "Ei": 1 << 60,
+}
+
+func (n *ByteSize) UnmarshalJSON(data []byte) error {
+ if len(data) == 0 || data[0] != '"' {
+ var i int64
+ err := json.Unmarshal(data, &i)
+ if err != nil {
+ return err
+ }
+ *n = ByteSize(i)
+ return nil
+ }
+ var s string
+ err := json.Unmarshal(data, &s)
+ if err != nil {
+ return err
+ }
+ split := strings.LastIndexAny(s, "0123456789.+-eE") + 1
+ if split == 0 {
+ return fmt.Errorf("invalid byte size %q", s)
+ }
+ if s[split-1] == 'E' {
+ // We accepted an E as if it started the exponent part
+ // of a json number, but if the next char isn't +, -,
+ // or digit, then the E must have meant Exa. Instead
+ // of "4.5E"+"iB" we want "4.5"+"EiB".
+ split--
+ }
+ var val json.Number
+ dec := json.NewDecoder(strings.NewReader(s[:split]))
+ dec.UseNumber()
+ err = dec.Decode(&val)
+ if err != nil {
+ return err
+ }
+ // No unit suffix (e.g., "100") means plain bytes: the empty
+ // prefix maps to 1 in prefixValue.
+ prefix := strings.Trim(s[split:], " ")
+ if strings.HasSuffix(prefix, "B") {
+ prefix = prefix[:len(prefix)-1]
+ }
+ pval, ok := prefixValue[prefix]
+ if !ok {
+ return fmt.Errorf("invalid unit %q", strings.Trim(s[split:], " "))
+ }
+ if intval, err := val.Int64(); err == nil {
+ if pval > 1 && (intval*pval)/pval != intval {
+ return fmt.Errorf("size %q overflows int64", s)
+ }
+ *n = ByteSize(intval * pval)
+ return nil
+ } else if floatval, err := val.Float64(); err == nil {
+ if floatval*float64(pval) > math.MaxInt64 {
+ return fmt.Errorf("size %q overflows int64", s)
+ }
+ *n = ByteSize(int64(floatval * float64(pval)))
+ return nil
+ } else {
+ return fmt.Errorf("bug: json.Number for %q is not int64 or float64: %s", s, err)
+ }
+}
--- /dev/null
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package arvados
+
+import (
+ "github.com/ghodss/yaml"
+ check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&ByteSizeSuite{})
+
+type ByteSizeSuite struct{}
+
+func (s *ByteSizeSuite) TestUnmarshal(c *check.C) {
+ for _, testcase := range []struct {
+ in string
+ out int64
+ }{
+ {"0", 0},
+ {"5", 5},
+ {"5B", 5},
+ {"5 B", 5},
+ {" 4 KiB ", 4096},
+ {"0K", 0},
+ {"0Ki", 0},
+ {"0 KiB", 0},
+ {"4K", 4000},
+ {"4KB", 4000},
+ {"4Ki", 4096},
+ {"4KiB", 4096},
+ {"4MB", 4000000},
+ {"4MiB", 4194304},
+ {"4GB", 4000000000},
+ {"4 GiB", 4294967296},
+ {"4TB", 4000000000000},
+ {"4TiB", 4398046511104},
+ {"4PB", 4000000000000000},
+ {"4PiB", 4503599627370496},
+ {"4EB", 4000000000000000000},
+ {"4EiB", 4611686018427387904},
+ {"4.5EiB", 5188146770730811392},
+ {"1.5 GB", 1500000000},
+ {"1.5 GiB", 1610612736},
+ {"1.234 GiB", 1324997410}, // rounds down from 1324997410.816
+ {"1e2 KB", 100000},
+ {"20E-1 KiB", 2048},
+ {"1E0EB", 1000000000000000000},
+ {"1E-1EB", 100000000000000000},
+ {"1E-1EiB", 115292150460684704},
+ {"4.5E15 K", 4500000000000000000},
+ } {
+ var n ByteSize
+ err := yaml.Unmarshal([]byte(testcase.in+"\n"), &n)
+ c.Logf("%v => %v: %v", testcase.in, testcase.out, n)
+ c.Check(err, check.IsNil)
+ c.Check(int64(n), check.Equals, testcase.out)
+ }
+ for _, testcase := range []string{
+ "B", "K", "KB", "KiB", "4BK", "4iB", "4A", "b", "4b", "4mB", "4m", "4mib", "4KIB", "4K iB", "4Ki B", "BB", "4BB",
+ "400000 EB", // overflows int64
+ "4.11e4 EB", // ok as float64, but overflows int64
+ } {
+ var n ByteSize
+ err := yaml.Unmarshal([]byte(testcase+"\n"), &n)
+ c.Logf("%v => error: %v", n, err)
+ c.Check(err, check.NotNil)
+ }
+}
package arvados
import (
+ "encoding/json"
+ "errors"
"fmt"
"os"
ClusterID string `json:"-"`
ManagementToken string
NodeProfiles map[string]NodeProfile
- InstanceTypes []InstanceType
+ InstanceTypes InstanceTypeMap
HTTPRequestTimeout Duration
}
Name string
ProviderType string
VCPUs int
- RAM int64
- Scratch int64
+ RAM ByteSize
+ Scratch ByteSize
Price float64
Preemptible bool
}
+type InstanceTypeMap map[string]InstanceType
+
+var errDuplicateInstanceTypeName = errors.New("duplicate instance type name")
+
+// UnmarshalJSON handles old config files that provide an array of
+// instance types instead of a hash.
+func (it *InstanceTypeMap) UnmarshalJSON(data []byte) error {
+ if len(data) > 0 && data[0] == '[' {
+ var arr []InstanceType
+ err := json.Unmarshal(data, &arr)
+ if err != nil {
+ return err
+ }
+ if len(arr) == 0 {
+ *it = nil
+ return nil
+ }
+ *it = make(map[string]InstanceType, len(arr))
+ for _, t := range arr {
+ if _, ok := (*it)[t.Name]; ok {
+ return errDuplicateInstanceTypeName
+ }
+ (*it)[t.Name] = t
+ }
+ return nil
+ }
+ var hash map[string]InstanceType
+ err := json.Unmarshal(data, &hash)
+ if err != nil {
+ return err
+ }
+ // Fill in Name field using hash key.
+ *it = InstanceTypeMap(hash)
+ for name, t := range *it {
+ t.Name = name
+ (*it)[name] = t
+ }
+ return nil
+}
+
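+// For reference, the two accepted config spellings (sketch; other
+// fields omitted):
+//
+//	InstanceTypes:      # old style: array, Name given explicitly
+//	- Name: foo
+//
+//	InstanceTypes:      # new style: map, Name filled in from the key
+//	  foo:
+//	    ProviderType: bar
+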
// GetNodeProfile returns a NodeProfile for the given hostname. An
// error is returned if the appropriate configuration can't be
// determined (e.g., this does not appear to be a system node). If
--- /dev/null
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package arvados
+
+import (
+ "github.com/ghodss/yaml"
+ check "gopkg.in/check.v1"
+)
+
+var _ = check.Suite(&ConfigSuite{})
+
+type ConfigSuite struct{}
+
+func (s *ConfigSuite) TestInstanceTypesAsArray(c *check.C) {
+ var cluster Cluster
+ yaml.Unmarshal([]byte("InstanceTypes:\n- Name: foo\n"), &cluster)
+ c.Check(len(cluster.InstanceTypes), check.Equals, 1)
+ c.Check(cluster.InstanceTypes["foo"].Name, check.Equals, "foo")
+}
+
+func (s *ConfigSuite) TestInstanceTypesAsHash(c *check.C) {
+ var cluster Cluster
+ yaml.Unmarshal([]byte("InstanceTypes:\n foo:\n ProviderType: bar\n"), &cluster)
+ c.Check(len(cluster.InstanceTypes), check.Equals, 1)
+ c.Check(cluster.InstanceTypes["foo"].Name, check.Equals, "foo")
+ c.Check(cluster.InstanceTypes["foo"].ProviderType, check.Equals, "bar")
+}
+
+func (s *ConfigSuite) TestInstanceTypeSize(c *check.C) {
+ var it InstanceType
+ err := yaml.Unmarshal([]byte("Name: foo\nScratch: 4GB\nRAM: 4GiB\n"), &it)
+ c.Check(err, check.IsNil)
+ c.Check(int64(it.Scratch), check.Equals, int64(4000000000))
+ c.Check(int64(it.RAM), check.Equals, int64(4294967296))
+}
type SchedulingParameters struct {
Partitions []string `json:"partitions"`
Preemptible bool `json:"preemptible"`
+ MaxRunTime int `json:"max_run_time"`
}
// ContainerList is an arvados#containerList resource.
import pprint
import re
import string
-import subprocess
import sys
-import threading
import time
import types
import zlib
delay, exc_info=True)
for conn in self.connections.values():
conn.close()
+ except httplib2.SSLHandshakeError as e:
+ # Intercept and re-raise with a better error message.
+ raise httplib2.SSLHandshakeError("Could not connect to %s\n%s\nPossible causes: remote SSL/TLS certificate expired, or was issued by an untrusted certificate authority." % (uri, e))
+
time.sleep(delay)
delay = delay * self._retry_delay_backoff
if apiconfig is None:
apiconfig = config.settings()
+ errors = []
for x in ['ARVADOS_API_HOST', 'ARVADOS_API_TOKEN']:
if x not in apiconfig:
- raise ValueError("%s is not set. Aborting." % x)
+ errors.append(x)
+ if errors:
+ raise ValueError(" and ".join(errors)+" not set.\nPlease set in %s or export as environment variables." % config.default_config_file)
host = apiconfig.get('ARVADOS_API_HOST')
token = apiconfig.get('ARVADOS_API_TOKEN')
insecure = config.flag_is_true('ARVADOS_API_HOST_INSECURE', apiconfig)
import json
import os
import re
-import subprocess
+import subprocess32 as subprocess
import sys
import tarfile
import tempfile
def __init__(self, root, user_agent_pool=queue.LifoQueue(),
upload_counter=None,
download_counter=None,
- headers={}):
+ headers={},
+ insecure=False):
self.root = root
self._user_agent_pool = user_agent_pool
self._result = {'error': None}
self.put_headers = headers
self.upload_counter = upload_counter
self.download_counter = download_counter
+ self.insecure = insecure
def usable(self):
"""Is it worth attempting a request?"""
'{}: {}'.format(k,v) for k,v in self.get_headers.items()])
curl.setopt(pycurl.WRITEFUNCTION, response_body.write)
curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
+ if self.insecure:
+ curl.setopt(pycurl.SSL_VERIFYPEER, 0)
if method == "HEAD":
curl.setopt(pycurl.NOBODY, True)
self._setcurltimeouts(curl, timeout)
'{}: {}'.format(k,v) for k,v in self.put_headers.items()])
curl.setopt(pycurl.WRITEFUNCTION, response_body.write)
curl.setopt(pycurl.HEADERFUNCTION, self._headerfunction)
+ if self.insecure:
+ curl.setopt(pycurl.SSL_VERIFYPEER, 0)
self._setcurltimeouts(curl, timeout)
try:
curl.perform()
if local_store is None:
local_store = os.environ.get('KEEP_LOCAL_STORE')
+ if api_client is None:
+ self.insecure = config.flag_is_true('ARVADOS_API_HOST_INSECURE')
+ else:
+ self.insecure = api_client.insecure
+
self.block_cache = block_cache if block_cache else KeepBlockCache()
self.timeout = timeout
self.proxy_timeout = proxy_timeout
root, self._user_agent_pool,
upload_counter=self.upload_counter,
download_counter=self.download_counter,
- headers=headers)
+ headers=headers,
+ insecure=self.insecure)
return local_roots
@staticmethod
root: self.KeepService(root, self._user_agent_pool,
upload_counter=self.upload_counter,
download_counter=self.download_counter,
- headers=headers)
+ headers=headers,
+ insecure=self.insecure)
for root in hint_roots
}
self.apiconfig = copy.copy(apiconfig)
self.api_params = api_params
self.local = threading.local()
+
+ # Initialize an API object for this thread before creating
+ # KeepClient, so that missing ARVADOS_API_HOST or
+ # ARVADOS_API_TOKEN settings are reported early.
+ self.localapi()
+
self.keep = keep.KeepClient(api_client=self, **keep_params)
def localapi(self):
'ruamel.yaml >=0.13.11, <0.15',
'setuptools',
'ws4py <0.4',
+ 'subprocess32>=3.5.1',
],
test_suite='tests',
tests_require=['pbr<1.7.0', 'mock>=1.0', 'PyYAML'],
events {
}
http {
- access_log "{{ACCESSLOG}}" combined;
+ log_format customlog
+ '[$time_local] $server_name $status $body_bytes_sent $request_time $request_method "$scheme://$http_host$request_uri" $remote_addr:$remote_port '
+ '"$http_referer" "$http_user_agent"';
+ access_log "{{ACCESSLOG}}" customlog;
client_body_temp_path "{{TMPDIR}}";
upstream arv-git-http {
server localhost:{{GITPORT}};
}
server {
listen *:{{GITSSLPORT}} ssl default_server;
- server_name _;
+ server_name arv-git-http;
ssl_certificate "{{SSLCERT}}";
ssl_certificate_key "{{SSLKEY}}";
location / {
proxy_pass http://arv-git-http;
+ proxy_set_header Host $http_host;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header X-Forwarded-Proto https;
+ proxy_redirect off;
}
}
upstream keepproxy {
}
server {
listen *:{{KEEPPROXYSSLPORT}} ssl default_server;
- server_name _;
+ server_name keepproxy;
ssl_certificate "{{SSLCERT}}";
ssl_certificate_key "{{SSLKEY}}";
location / {
proxy_pass http://keepproxy;
+ proxy_set_header Host $http_host;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header X-Forwarded-Proto https;
+ proxy_redirect off;
}
}
upstream keep-web {
}
server {
listen *:{{KEEPWEBSSLPORT}} ssl default_server;
- server_name ~^(?<request_host>.*)$;
+ server_name keep-web;
ssl_certificate "{{SSLCERT}}";
ssl_certificate_key "{{SSLKEY}}";
location / {
proxy_pass http://keep-web;
- proxy_set_header Host $request_host:{{KEEPWEBPORT}};
+ proxy_set_header Host $http_host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header X-Forwarded-Proto https;
+ proxy_redirect off;
}
}
server {
listen *:{{KEEPWEBDLSSLPORT}} ssl default_server;
- server_name ~.*;
+ server_name keep-web-dl ~.*;
ssl_certificate "{{SSLCERT}}";
ssl_certificate_key "{{SSLKEY}}";
location / {
proxy_pass http://keep-web;
- proxy_set_header Host download:{{KEEPWEBPORT}};
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
- proxy_redirect //download:{{KEEPWEBPORT}}/ https://$host:{{KEEPWEBDLSSLPORT}}/;
+ proxy_set_header X-Forwarded-Proto https;
+
+ # Unlike other proxy sections, here we need to override the
+ # requested Host header and use proxy_redirect because of the
+ # way the test suite orchestrates services. Keep-web's "download
+ # only" behavior relies on the Host header matching a configured
+ # value, but when run_test_servers.py writes keep-web's command
+ # line, the keep-web-dl TLS port (which clients will connect to
+ # and include in their Host header) has not yet been assigned.
+ #
+ # In production, "proxy_set_header Host $http_host;
+ # proxy_redirect off;" works: keep-web's redirect URLs will
+ # match the request URL received by Nginx.
+ #
+ # Here, keep-web will issue redirects to https://download/ and
+ # Nginx will rewrite them.
+ #
+ proxy_set_header Host download;
+ proxy_redirect https://download/ https://$host:{{KEEPWEBDLSSLPORT}}/;
}
}
upstream ws {
}
server {
listen *:{{WSSPORT}} ssl default_server;
- server_name ~^(?<request_host>.*)$;
+ server_name websocket;
ssl_certificate "{{SSLCERT}}";
ssl_certificate_key "{{SSLKEY}}";
location / {
proxy_pass http://ws;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
- proxy_set_header Host $request_host:{{WSPORT}};
+ proxy_set_header Host $http_host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header X-Forwarded-Proto https;
+ proxy_redirect off;
}
}
upstream controller {
}
server {
listen *:{{CONTROLLERSSLPORT}} ssl default_server;
- server_name _;
+ server_name controller;
ssl_certificate "{{SSLCERT}}";
ssl_certificate_key "{{SSLKEY}}";
location / {
proxy_pass http://controller;
+ proxy_set_header Host $http_host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header X-Forwarded-Proto https;
+ proxy_redirect off;
}
}
}
sock.close()
return port
-def _wait_until_port_listens(port, timeout=10):
+def _wait_until_port_listens(port, timeout=10, warn=True):
"""Wait for a process to start listening on the given port.
If nothing listens on the port within the specified timeout (given
except subprocess.CalledProcessError:
time.sleep(0.1)
continue
- return
- print(
- "WARNING: Nothing is listening on port {} (waited {} seconds).".
- format(port, timeout),
- file=sys.stderr)
+ return True
+ if warn:
+ print(
+ "WARNING: Nothing is listening on port {} (waited {} seconds).".
+ format(port, timeout),
+ file=sys.stderr)
+ return False
def _logfilename(label):
"""Set up a labelled log file, and return a path to write logs to.
'POST',
headers={'Authorization': 'OAuth2 {}'.format(token)})
os.environ['ARVADOS_API_HOST_INSECURE'] = 'true'
- os.environ['ARVADOS_API_HOST'] = existing_api_host
os.environ['ARVADOS_API_TOKEN'] = token
+ if _wait_until_port_listens(_getport('controller-ssl'), timeout=0.5, warn=False):
+ os.environ['ARVADOS_API_HOST'] = '0.0.0.0:'+str(_getport('controller-ssl'))
+ else:
+ os.environ['ARVADOS_API_HOST'] = existing_api_host
def stop(force=False):
"""Stop the API server, if one is running.
keepweb = subprocess.Popen(
['keep-web',
'-allow-anonymous',
- '-attachment-only-host=download:'+str(keepwebport),
+ '-attachment-only-host=download',
'-listen=:'+str(keepwebport)],
env=env, stdin=open('/dev/null'), stdout=logf, stderr=logf)
with open(_pidfile('keep-web'), 'w') as f:
self.assertEqual('100::1', service.hostname)
self.assertEqual(10, service.port)
+ def test_insecure_disables_tls_verify(self):
+ api_client = self.mock_keep_services(count=1)
+ force_timeout = socket.timeout("timed out")
+
+ api_client.insecure = True
+ with tutil.mock_keep_responses(b'foo', 200) as mock:
+ keep_client = arvados.KeepClient(api_client=api_client)
+ keep_client.get('acbd18db4cc2f85cedef654fccc4a4d8+3')
+ self.assertEqual(
+ mock.responses[0].getopt(pycurl.SSL_VERIFYPEER),
+ 0)
+
+ api_client.insecure = False
+ with tutil.mock_keep_responses(b'foo', 200) as mock:
+ keep_client = arvados.KeepClient(api_client=api_client)
+ keep_client.get('acbd18db4cc2f85cedef654fccc4a4d8+3')
+ # getopt()==None here means we didn't change the
+ # default. If we were using real pycurl instead of a mock,
+ # it would return the default value 1.
+ self.assertEqual(
+ mock.responses[0].getopt(pycurl.SSL_VERIFYPEER),
+ None)
+
# test_*_timeout verify that KeepClient instructs pycurl to use
# the appropriate connection and read timeouts. They don't care
# whether pycurl actually exhibits the expected timeout behavior
def __getattr__(self, r):
if r == "api_token":
return "abc"
+ elif r == "insecure":
+ return False
else:
raise arvados.errors.KeepReadError()
keep_client = arvados.KeepClient(api_client=ApiMock(),
gem 'multi_json'
gem 'oj'
-gem 'oj_mimic_json'
# for building assets
gem 'sass-rails', '~> 4.0'
gem 'themes_for_rails', git: 'https://github.com/curoverse/themes_for_rails'
gem 'arvados', '>= 0.1.20150615153458'
-gem 'arvados-cli', '>= 0.1.20161017193526'
gem 'httpclient'
gem 'sshkey'
i18n (~> 0)
json (>= 1.7.7, < 3)
jwt (>= 0.1.5, < 2)
- arvados-cli (1.1.4.20180412190507)
- activesupport (>= 3.2.13, < 5)
- andand (~> 1.3, >= 1.3.3)
- arvados (~> 0.1, >= 0.1.20150128223554)
- curb (~> 0.8)
- google-api-client (~> 0.6, >= 0.6.3, < 0.8.9)
- json (>= 1.7.7, < 3)
- oj (~> 2.0, >= 2.0.3)
- trollop (~> 2.0)
autoparse (0.3.3)
addressable (>= 2.3.1)
extlib (>= 0.9.15)
coffee-script-source (1.12.2)
concurrent-ruby (1.0.5)
crass (1.0.4)
- curb (0.9.4)
database_cleaner (1.7.0)
erubis (2.7.0)
eventmachine (1.2.6)
multi_json (~> 1.3)
multi_xml (~> 0.5)
rack (>= 1.2, < 3)
- oj (2.18.5)
- oj_mimic_json (1.0.1)
+ oj (3.6.4)
omniauth (1.4.3)
hashie (>= 1.2, < 4)
rack (>= 1.6.2, < 3)
simplecov-html (0.7.1)
simplecov-rcov (0.2.3)
simplecov (>= 0.4.1)
- sprockets (2.12.4)
+ sprockets (2.12.5)
hike (~> 1.2)
multi_json (~> 1.0)
rack (~> 1.0)
acts_as_api
andand
arvados (>= 0.1.20150615153458)
- arvados-cli (>= 0.1.20161017193526)
coffee-rails (~> 4.0)
database_cleaner
factory_girl_rails
mocha
multi_json
oj
- oj_mimic_json
omniauth (~> 1.4.0)
omniauth-oauth2 (~> 1.1)
passenger
uglifier (~> 2.0)
BUNDLED WITH
- 1.16.1
+ 1.16.2
version: "v1",
revision: "20131114",
source_version: AppVersion.hash,
+ sourceVersion: AppVersion.hash, # source_version should be deprecated in the future
+ packageVersion: AppVersion.package_version,
generatedAt: db_current_time.iso8601,
title: "Arvados API",
description: "The API to interact with Arvados.",
end
end
- if Rails.configuration.new_users_are_active
+ if Rails.configuration.new_users_are_active ||
+ Rails.configuration.auto_activate_users_from.include?(remote_user['uuid'][0..4])
# Update is_active to whatever it is at the remote end
user.is_active = remote_user['is_active']
elsif !remote_user['is_active']
before_validation :fill_field_defaults, :if => :new_record?
before_validation :validate_runtime_constraints
- before_validation :set_container
before_validation :set_default_preemptible_scheduling_parameter
+ before_validation :set_container
validates :command, :container_image, :output_path, :cwd, :presence => true
validates :output_ttl, numericality: { only_integer: true, greater_than_or_equal_to: 0 }
validates :priority, numericality: { only_integer: true, greater_than_or_equal_to: 0, less_than_or_equal_to: 1000 }
end
def set_default_preemptible_scheduling_parameter
+ c = get_requesting_container()
if self.state == Committed
# If preemptible instances (eg: AWS Spot Instances) are allowed,
# ask them on child containers by default.
- if Rails.configuration.preemptible_instances and
- !self.requesting_container_uuid.nil? and
+ if Rails.configuration.preemptible_instances and !c.nil? and
self.scheduling_parameters['preemptible'].nil?
self.scheduling_parameters['preemptible'] = true
end
if !Rails.configuration.preemptible_instances and scheduling_parameters['preemptible']
errors.add :scheduling_parameters, "preemptible instances are not allowed"
end
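+ # max_run_time is a wall-clock limit in seconds, enforced by
+ # crunch-run; 0 means no limit, so only non-negative integers pass.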
+ if scheduling_parameters.include? 'max_run_time' and
+ (!scheduling_parameters['max_run_time'].is_a?(Integer) ||
+ scheduling_parameters['max_run_time'] < 0)
+ errors.add :scheduling_parameters, "max_run_time must be a non-negative integer"
+ end
end
end
end
def set_requesting_container_uuid
- return if !current_api_client_authorization
- if (c = Container.where('auth_uuid=?', current_api_client_authorization.uuid).select([:uuid, :priority]).first)
+ c = get_requesting_container()
+ if !c.nil?
self.requesting_container_uuid = c.uuid
self.priority = c.priority>0 ? 1 : 0
end
end
+
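+ # Note: returns the already-recorded requesting_container_uuid string
+ # when one is set; otherwise the Container record (uuid and priority
+ # only) for the current token, or nil.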
+ def get_requesting_container
+ return self.requesting_container_uuid if !self.requesting_container_uuid.nil?
+ return if !current_api_client_authorization
+ if (c = Container.where('auth_uuid=?', current_api_client_authorization.uuid).select([:uuid, :priority]).first)
+ return c
+ end
+ end
end
before_create :set_initial_username, :if => Proc.new { |user|
user.username.nil? and user.email
}
+ after_create :setup_on_activate
after_create :add_system_group_permission_link
after_create :invalidate_permissions_cache
after_create :auto_setup_new_user, :if => Proc.new { |user|
if !oid_login_perms.any?
# create openid login permission
- oid_login_perm = Link.create(link_class: 'permission',
+ oid_login_perm = Link.create!(link_class: 'permission',
name: 'can_login',
tail_uuid: self.email,
head_uuid: self.uuid,
### New user and & email settings
###
- # Config parameters to automatically setup new users.
+ # Config parameters to automatically set up new users. If enabled,
+ # these users will be able to self-activate. Enable this if you want
+ # to run an open instance where anyone can create an account and use
+ # the system without requiring manual approval.
+ #
# The params auto_setup_new_users_with_* are meaningful only when auto_setup_new_users is turned on.
# auto_setup_name_blacklist is a list of usernames to be blacklisted for auto setup.
auto_setup_new_users: false
auto_setup_new_users_with_repository: false
auto_setup_name_blacklist: [arvados, git, gitolite, gitolite-admin, root, syslog]
- # When new_users_are_active is set to true, the user agreement check is skipped.
+ # When new_users_are_active is set to true, new users will be active
+ # immediately. This skips the "self-activate" step, which enforces
+ # user agreements. Should only be enabled for development.
new_users_are_active: false
# The e-mail address of the user you would like to become marked as an admin
# remote_hosts above.
remote_hosts_via_dns: false
+ # List of cluster prefixes. These are "trusted" clusters: users
+ # from the clusters listed here will be automatically set up and
+ # activated. This is separate from the auto_setup_new_users and
+ # new_users_are_active settings.
+ auto_activate_users_from: []
+
###
### Remaining assorted configuration options.
###
# "git log".
source_version: false
+ # Override the automatic package version string. With the default value of
+ # false, the package version is read from package-build.version in Rails.root
+ # (included in vendor packages).
+ package_version: false
+
# Enable asynchronous permission graph rebuild. Must run
# script/permission-updater.rb as a separate process. When the permission
# cache is invalidated, the background process will update the permission
--- /dev/null
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+require 'oj'
+
+Oj::Rails.set_encoder()
+Oj::Rails.set_decoder()
+Oj::Rails.optimize()
+Oj::Rails.mimic_JSON()
+
#
# SPDX-License-Identifier: AGPL-3.0
+ActiveSupport::JSON::Encoding.time_precision = 9
+
class ActiveSupport::TimeWithZone
remove_method :as_json
def as_json *args
def self.forget
@hash = nil
+ @package_version = nil
end
# Return abbrev commit hash for current code version: "abc1234", or
@hash || "unknown"
end
+
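+ # Returns the package version string: the package_version config
+ # setting if present, otherwise the contents of package-build.version
+ # in Rails.root, otherwise "unknown". Memoized until self.forget.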
+ def self.package_version
+ if (cached = Rails.configuration.package_version || @package_version)
+ return cached
+ end
+
+ begin
+ @package_version = IO.read(Rails.root.join("package-build.version")).strip
+ rescue Errno::ENOENT
+ @package_version = "unknown"
+ end
+
+ @package_version
+ end
end
@fetched_commits[sha1] = ($? == 0)
end
- def tag_commit(commit_hash, tag_name)
+ def tag_commit(job, commit_hash, tag_name)
# @git_tags[T]==V if we know commit V has been tagged T in the
# arvados_internal repository.
if not @git_tags[tag_name]
next
end
ready &&= get_commit repo.server_path, job.script_version
- ready &&= tag_commit job.script_version, job.uuid
+ ready &&= tag_commit job, job.script_version, job.uuid
end
# This should be unnecessary, because API server does it during
# job create/update, but it's still not a bad idea to verify the
# tag is correct before starting the job:
- ready &&= tag_commit job.script_version, job.uuid
+ ready &&= tag_commit job, job.script_version, job.uuid
# The arvados_sdk_version doesn't support use of arbitrary
# remote URLs, so the requested version isn't necessarily copied
# into the internal repository yet.
if job.arvados_sdk_version
ready &&= get_commit @arvados_repo_path, job.arvados_sdk_version
- ready &&= tag_commit job.arvados_sdk_version, "#{job.uuid}-arvados-sdk"
+ ready &&= tag_commit job, job.arvados_sdk_version, "#{job.uuid}-arvados-sdk"
end
if not ready
return Oj.dump(o, mode: :compat)
end
def self.load(s)
+ if s.nil? or s == ''
+ # Oj 2.18.5 used to return nil for a nil or empty string; 3.6.4 no
+ # longer does. Oj was upgraded for performance reasons (see #13803
+ # and https://github.com/ohler55/oj/issues/441)
+ return nil
+ end
Oj.strict_load(s, symbol_keys: false)
end
end
def check_update_whitelist permitted_fields
attribute_names.each do |field|
if !permitted_fields.include?(field.to_sym) && really_changed(field)
- errors.add field, "cannot be modified in this state (#{send(field+"_was").inspect}, #{send(field).inspect})"
+ errors.add field, "cannot be modified in state '#{self.state}' (#{send(field+"_was").inspect}, #{send(field).inspect})"
end
end
end
assert_includes discovery_doc, 'defaultTrashLifetime'
assert_equal discovery_doc['defaultTrashLifetime'], Rails.application.config.default_trash_lifetime
assert_match(/^[0-9a-f]+(-modified)?$/, discovery_doc['source_version'])
+ assert_match(/^[0-9a-f]+(-modified)?$/, discovery_doc['sourceVersion'])
+ assert_match(/^unknown$/, discovery_doc['packageVersion'])
assert_equal discovery_doc['websocketUrl'], Rails.application.config.websocket_address
assert_equal discovery_doc['workbenchUrl'], Rails.application.config.workbench_address
assert_equal('zzzzz', discovery_doc['uuidPrefix'])
end
- test "discovery document overrides source_version with config" do
+ test "discovery document overrides source_version & sourceVersion with config" do
Rails.configuration.source_version = 'aaa888fff'
get :index
assert_response :success
discovery_doc = JSON.parse(@response.body)
+ # The source_version key will eventually be replaced by sourceVersion
assert_equal 'aaa888fff', discovery_doc['source_version']
+ assert_equal 'aaa888fff', discovery_doc['sourceVersion']
+ end
+
+ test "discovery document overrides packageVersion with config" do
+ Rails.configuration.package_version = '1.0.0-stable'
+ get :index
+ assert_response :success
+ discovery_doc = JSON.parse(@response.body)
+ assert_equal '1.0.0-stable', discovery_doc['packageVersion']
end
test "empty disable_api_methods" do
assert_response :success
assert_equal 'zbbbb-tpzed-000000000000000', json_response['uuid']
assert_equal false, json_response['is_admin']
+ assert_equal false, json_response['is_active']
assert_equal 'foo@example.com', json_response['email']
assert_equal 'barney', json_response['username']
refute_includes(group_uuids, groups(:trashed_project).uuid)
refute_includes(group_uuids, groups(:testusergroup_admins).uuid)
end
+
+ test 'auto-activate user from trusted cluster' do
+ Rails.configuration.auto_activate_users_from = ['zbbbb']
+ get '/arvados/v1/users/current', {format: 'json'}, auth(remote: 'zbbbb')
+ assert_response :success
+ assert_equal 'zbbbb-tpzed-000000000000000', json_response['uuid']
+ assert_equal false, json_response['is_admin']
+ assert_equal true, json_response['is_active']
+ assert_equal 'foo@example.com', json_response['email']
+ assert_equal 'barney', json_response['username']
+ end
+
+ test 'pre-activate remote user' do
+ post '/arvados/v1/users', {
+ "user" => {
+ "uuid" => "zbbbb-tpzed-000000000000000",
+ "email" => 'foo@example.com',
+ "username" => 'barney',
+ "is_active" => true
+ }
+ }, {'HTTP_AUTHORIZATION' => "OAuth2 #{api_token(:admin)}"}
+ assert_response :success
+
+ get '/arvados/v1/users/current', {format: 'json'}, auth(remote: 'zbbbb')
+ assert_response :success
+ assert_equal 'zbbbb-tpzed-000000000000000', json_response['uuid']
+ assert_equal nil, json_response['is_admin']
+ assert_equal true, json_response['is_active']
+ assert_equal 'foo@example.com', json_response['email']
+ assert_equal 'barney', json_response['username']
+ end
+
end
end
def self.slow_test(name, &block)
- define_method(name, block) unless skip_slow_tests?
+ test(name, &block) unless skip_slow_tests?
end
end
[{"partitions" => "fastcpu"}, ContainerRequest::Committed, ActiveRecord::RecordInvalid],
[{"partitions" => "fastcpu"}, ContainerRequest::Uncommitted],
[{"partitions" => ["fastcpu","vfastcpu"]}, ContainerRequest::Committed],
+ [{"max_run_time" => "one day"}, ContainerRequest::Committed, ActiveRecord::RecordInvalid],
+ [{"max_run_time" => "one day"}, ContainerRequest::Uncommitted],
+ [{"max_run_time" => -1}, ContainerRequest::Committed, ActiveRecord::RecordInvalid],
+ [{"max_run_time" => -1}, ContainerRequest::Uncommitted],
+ [{"max_run_time" => 86400}, ContainerRequest::Committed],
].each do |sp, state, expected|
test "create container request with scheduling_parameters #{sp} in state #{state} and verify #{expected}" do
common_attrs = {cwd: "test",
end
end
+ test "Having preemptible_instances=true create a committed child container request and verify the scheduling parameter of its container" do
+ common_attrs = {cwd: "test",
+ priority: 1,
+ command: ["echo", "hello"],
+ output_path: "test",
+ state: ContainerRequest::Committed,
+ mounts: {"test" => {"kind" => "json"}}}
+ set_user_from_auth :active
+ Rails.configuration.preemptible_instances = true
+
+ cr = with_container_auth(Container.find_by_uuid 'zzzzz-dz642-runningcontainr') do
+ create_minimal_req!(common_attrs)
+ end
+ assert_equal 'zzzzz-dz642-runningcontainr', cr.requesting_container_uuid
+ assert_equal true, cr.scheduling_parameters["preemptible"]
+
+ c = Container.find_by_uuid(cr.container_uuid)
+ assert_equal true, c.scheduling_parameters["preemptible"]
+ end
+
[['Committed', true, {name: "foobar", priority: 123}],
['Committed', false, {container_count: 2}],
['Committed', false, {container_count: 0}],
assert_equal(expect_username, user.username)
# check user setup
- verify_link_exists(Rails.configuration.auto_setup_new_users,
+ verify_link_exists(Rails.configuration.auto_setup_new_users || active,
groups(:all_users).uuid, user.uuid,
"permission", "can_read")
# Check for OID login link.
- verify_link_exists(Rails.configuration.auto_setup_new_users,
+ verify_link_exists(Rails.configuration.auto_setup_new_users || active,
user.uuid, user.email, "permission", "can_login")
# Check for repository.
if named_repo = (prior_repo or
}
type slurmFake struct {
- didBatch [][]string
- didCancel []string
- didRelease []string
- didRenice [][]string
- queue string
+ didBatch [][]string
+ didCancel []string
+ didRelease []string
+ didRenice [][]string
+ queue string
+ rejectNice10K bool
// If non-nil, run this func during the 2nd+ call to Cancel()
onCancel func()
// Error returned by Batch()
func (sf *slurmFake) Renice(name string, nice int64) error {
sf.didRenice = append(sf.didRenice, []string{name, fmt.Sprintf("%d", nice)})
+ if sf.rejectNice10K && nice > 10000 {
+ return errors.New("scontrol: error: Invalid nice value, must be between -10000 and 10000")
+ }
return nil
}
}
for _, trial := range []struct {
- types []arvados.InstanceType
+ types map[string]arvados.InstanceType
sbatchArgs []string
err error
}{
// Choose node type => use --constraint arg
{
- types: []arvados.InstanceType{
- {Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
- {Name: "a1.small", Price: 0.04, RAM: 256000000, VCPUs: 2},
- {Name: "a1.medium", Price: 0.08, RAM: 512000000, VCPUs: 4},
- {Name: "a1.large", Price: 0.16, RAM: 1024000000, VCPUs: 8},
+ types: map[string]arvados.InstanceType{
+ "a1.tiny": {Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
+ "a1.small": {Name: "a1.small", Price: 0.04, RAM: 256000000, VCPUs: 2},
+ "a1.medium": {Name: "a1.medium", Price: 0.08, RAM: 512000000, VCPUs: 4},
+ "a1.large": {Name: "a1.large", Price: 0.16, RAM: 1024000000, VCPUs: 8},
},
sbatchArgs: []string{"--constraint=instancetype=a1.medium"},
},
},
// No node type is big enough => error
{
- types: []arvados.InstanceType{
- {Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
+ types: map[string]arvados.InstanceType{
+ "a1.tiny": {Name: "a1.tiny", Price: 0.02, RAM: 128000000, VCPUs: 1},
},
err: dispatchcloud.ConstraintsNotSatisfiableError{},
},
"time"
)
+const slurm15NiceLimit int64 = 10000
+
type slurmJob struct {
uuid string
wantPriority int64
priority int64 // current slurm priority (incorporates nice value)
nice int64 // current slurm nice value
+ hitNiceLimit bool
}
// Squeue implements asynchronous polling monitor of the SLURM queue using the
})
renice := wantNice(jobs, sqc.PrioritySpread)
for i, job := range jobs {
- if renice[i] == job.nice {
+ niceNew := renice[i]
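+ // Jobs that have already tripped SLURM 15's +/-10000 nice limit
+ // stay clamped at that limit instead of retrying larger values.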
+ if job.hitNiceLimit && niceNew > slurm15NiceLimit {
+ niceNew = slurm15NiceLimit
+ }
+ if niceNew == job.nice {
continue
}
- sqc.Slurm.Renice(job.uuid, renice[i])
+ err := sqc.Slurm.Renice(job.uuid, niceNew)
+ if err != nil && niceNew > slurm15NiceLimit && strings.Contains(err.Error(), "Invalid nice value") {
+ log.Printf("container %q clamping nice values at %d, priority order will not be correct -- see https://dev.arvados.org/projects/arvados/wiki/SLURM_integration#Limited-nice-values-SLURM-15", job.uuid, slurm15NiceLimit)
+ job.hitNiceLimit = true
+ }
}
}
replacing.nice = n
newq[uuid] = replacing
- if state == "PENDING" && ((reason == "BadConstraints" && p == 0) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
+ if state == "PENDING" && ((reason == "BadConstraints" && p <= 2*slurm15NiceLimit) || reason == "launch failed requeued held") && replacing.wantPriority > 0 {
// When using SLURM 14.x or 15.x, our queued
// jobs land in this state when "scontrol
// reconfigure" invalidates their feature
// constraints by clearing all node features.
// They stay in this state even after the
// features reappear, until we run "scontrol
- // release {jobid}".
+ // release {jobid}". Priority is usually 0 in
+ // this state, but sometimes (due to a race
+ // with nice adjustments?) it's a small
+ // positive value.
//
// "scontrol release" is silent and successful
// regardless of whether the features have
// "launch failed requeued held" seems to be
// another manifestation of this problem,
// resolved the same way.
- log.Printf("releasing held job %q", uuid)
+ log.Printf("releasing held job %q (priority=%d, state=%q, reason=%q)", uuid, p, state, reason)
sqc.Slurm.Release(uuid)
} else if p < 1<<20 && replacing.wantPriority > 0 {
log.Printf("warning: job %q has low priority %d, nice %d, state %q, reason %q", uuid, p, n, state, reason)
}
}
+// If a limited nice range prevents desired priority adjustments, give
+// up and clamp nice to 10K.
+func (s *SqueueSuite) TestReniceInvalidNiceValue(c *C) {
+ uuids := []string{"zzzzz-dz642-fake0fake0fake0", "zzzzz-dz642-fake1fake1fake1", "zzzzz-dz642-fake2fake2fake2"}
+ slurm := &slurmFake{
+ queue: uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 0 4294555222 PENDING Resources\n",
+ rejectNice10K: true,
+ }
+ sqc := &SqueueChecker{
+ Slurm: slurm,
+ PrioritySpread: 1,
+ Period: time.Hour,
+ }
+ sqc.startOnce.Do(sqc.start)
+ sqc.check()
+ sqc.SetPriority(uuids[0], 2)
+ sqc.SetPriority(uuids[1], 1)
+
+ // First attempt should renice to 555001, which will fail
+ sqc.reniceAll()
+ c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}})
+
+ // Next attempt should renice to 10K, which will succeed
+ sqc.reniceAll()
+ c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+ // ...so we'll change the squeue response to reflect the
+ // updated priority+nice, and make sure sqc sees that...
+ slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294545222 PENDING Resources\n"
+ sqc.check()
+
+ // Next attempt should leave nice alone because it's already
+ // at the 10K limit
+ sqc.reniceAll()
+ c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}})
+
+ // Back to normal if desired nice value falls below 10K
+ slurm.queue = uuids[0] + " 0 4294000222 PENDING Resources\n" + uuids[1] + " 10000 4294000111 PENDING Resources\n"
+ sqc.check()
+ sqc.reniceAll()
+ c.Check(slurm.didRenice, DeepEquals, [][]string{{uuids[1], "555001"}, {uuids[1], "10000"}, {uuids[1], "9890"}})
+
+ sqc.Stop()
+}
+
// If the given UUID isn't in the slurm queue yet, SetPriority()
// should wait for it to appear on the very next poll, then give up.
func (s *SqueueSuite) TestSetPriorityBeforeQueued(c *C) {
"git.curoverse.com/arvados.git/sdk/go/arvadosclient"
"git.curoverse.com/arvados.git/sdk/go/keepclient"
"git.curoverse.com/arvados.git/sdk/go/manifest"
+ "github.com/shirou/gopsutil/process"
"golang.org/x/net/context"
dockertypes "github.com/docker/docker/api/types"
ImageRemove(ctx context.Context, image string, options dockertypes.ImageRemoveOptions) ([]dockertypes.ImageDeleteResponseItem, error)
}
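+// PsProcess is the minimal subset of the gopsutil process API used by
+// CheckContainerd; tests substitute a fake implementation.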
+type PsProcess interface {
+ CmdlineSlice() ([]string, error)
+}
+
// ContainerRunner is the main stateful struct used for a single execution of a
// container.
type ContainerRunner struct {
finalState string
parentTemp string
+ ListProcesses func() ([]PsProcess, error)
+
statLogger io.WriteCloser
statReporter *crunchstat.Reporter
hoststatLogger io.WriteCloser
cStateLock sync.Mutex
cCancelled bool // StopContainer() invoked
- enableNetwork string // one of "default" or "always"
- networkMode string // passed through to HostConfig.NetworkMode
- arvMountLog *ThrottledLogger
+ enableNetwork string // one of "default" or "always"
+ networkMode string // passed through to HostConfig.NetworkMode
+ arvMountLog *ThrottledLogger
+ checkContainerd time.Duration
}
// setupSignals sets up signal handling to gracefully terminate the underlying
var errorBlacklist = []string{
"(?ms).*[Cc]annot connect to the Docker daemon.*",
"(?ms).*oci runtime error.*starting container process.*container init.*mounting.*to rootfs.*no such file or directory.*",
+ "(?ms).*grpc: the connection is unavailable.*",
}
var brokenNodeHook *string = flag.String("broken-node-hook", "", "Script to run if node is detected to be broken (for example, Docker daemon is not running)")
+func (runner *ContainerRunner) runBrokenNodeHook() {
+ if *brokenNodeHook == "" {
+ runner.CrunchLog.Printf("No broken node hook provided, cannot mark node as broken.")
+ } else {
+ runner.CrunchLog.Printf("Running broken node hook %q", *brokenNodeHook)
+ // run killme script
+ c := exec.Command(*brokenNodeHook)
+ c.Stdout = runner.CrunchLog
+ c.Stderr = runner.CrunchLog
+ err := c.Run()
+ if err != nil {
+ runner.CrunchLog.Printf("Error running broken node hook: %v", err)
+ }
+ }
+}
+
func (runner *ContainerRunner) checkBrokenNode(goterr error) bool {
for _, d := range errorBlacklist {
if m, e := regexp.MatchString(d, goterr.Error()); m && e == nil {
runner.CrunchLog.Printf("Error suggests node is unable to run containers: %v", goterr)
- if *brokenNodeHook == "" {
- runner.CrunchLog.Printf("No broken node hook provided, cannot mark node as broken.")
- } else {
- runner.CrunchLog.Printf("Running broken node hook %q", *brokenNodeHook)
- // run killme script
- c := exec.Command(*brokenNodeHook)
- c.Stdout = runner.CrunchLog
- c.Stderr = runner.CrunchLog
- err := c.Run()
- if err != nil {
- runner.CrunchLog.Printf("Error running broken node hook: %v", err)
- }
- }
+ runner.runBrokenNodeHook()
return true
}
}
runner.ContainerConfig.Volumes = runner.Volumes
maxRAM := int64(runner.Container.RuntimeConstraints.RAM)
+ if maxRAM < 4*1024*1024 {
+ // Docker daemon won't let you set a limit less than 4 MiB
+ maxRAM = 4 * 1024 * 1024
+ }
runner.HostConfig = dockercontainer.HostConfig{
Binds: runner.Binds,
LogConfig: dockercontainer.LogConfig{
return nil
}
+// CheckContainerd checks whether "containerd" is present in the process list.
+func (runner *ContainerRunner) CheckContainerd() error {
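+ // A checkContainerd interval of 0 disables the periodic check
+ // (-check-containerd=0s).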
+ if runner.checkContainerd == 0 {
+ return nil
+ }
+ p, _ := runner.ListProcesses()
+ for _, i := range p {
+ e, _ := i.CmdlineSlice()
+ if len(e) > 0 {
+ if strings.Index(e[0], "containerd") > -1 {
+ return nil
+ }
+ }
+ }
+
+ // containerd was not found: mark the node broken and stop the container.
+ runner.runBrokenNodeHook()
+ runner.stop(nil)
+ return fmt.Errorf("'containerd' not found in process list.")
+}
+
// WaitFinish waits for the container to terminate, capture the exit code, and
// close the stdout/stderr logging.
func (runner *ContainerRunner) WaitFinish() error {
+ var runTimeExceeded <-chan time.Time
runner.CrunchLog.Print("Waiting for container to finish")
waitOk, waitErr := runner.Docker.ContainerWait(context.TODO(), runner.ContainerID, dockercontainer.WaitConditionNotRunning)
arvMountExit := runner.ArvMountExit
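+ // scheduling_parameters.max_run_time (seconds) arms a one-shot
+ // timer; when it is 0, runTimeExceeded stays nil and the matching
+ // select case below never fires.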
+ if timeout := runner.Container.SchedulingParameters.MaxRunTime; timeout > 0 {
+ runTimeExceeded = time.After(time.Duration(timeout) * time.Second)
+ }
+
+ containerdGone := make(chan error)
+ defer close(containerdGone)
+ if runner.checkContainerd > 0 {
+ go func() {
+ ticker := time.NewTicker(time.Duration(runner.checkContainerd))
+ defer ticker.Stop()
+ for {
+ select {
+ case <-ticker.C:
+ if ck := runner.CheckContainerd(); ck != nil {
+ containerdGone <- ck
+ return
+ }
+ case <-containerdGone:
+ // Channel closed, quit goroutine
+ return
+ }
+ }
+ }()
+ }
+
for {
select {
case waitBody := <-waitOk:
// arvMountExit will always be ready now that
// it's closed, but that doesn't interest us.
arvMountExit = nil
+
+ case <-runTimeExceeded:
+ runner.CrunchLog.Printf("maximum run time exceeded. Stopping container.")
+ runner.stop(nil)
+ runTimeExceeded = nil
+
+ case err := <-containerdGone:
+ return err
}
}
}
return
}
+ // Sanity check that containerd is running.
+ err = runner.CheckContainerd()
+ if err != nil {
+ return
+ }
+
// check for and/or load image
err = runner.LoadImage()
if err != nil {
cr.NewLogWriter = cr.NewArvLogWriter
cr.RunArvMount = cr.ArvMountCmd
cr.MkTempDir = ioutil.TempDir
+ cr.ListProcesses = func() ([]PsProcess, error) {
+ pr, err := process.Processes()
+ if err != nil {
+ return nil, err
+ }
+ ps := make([]PsProcess, len(pr))
+ for i, j := range pr {
+ ps[i] = j
+ }
+ return ps, nil
+ }
cr.MkArvClient = func(token string) (IArvadosClient, error) {
cl, err := arvadosclient.MakeArvadosClient()
if err != nil {
`)
memprofile := flag.String("memprofile", "", "write memory profile to `file` after running container")
getVersion := flag.Bool("version", false, "Print version information and exit.")
+ checkContainerd := flag.Duration("check-containerd", 60*time.Second, "Periodic check if (docker-)containerd is running (use 0s to disable).")
flag.Parse()
// Print version information if requested
cr.expectCgroupParent = *cgroupParent
cr.enableNetwork = *enableNetwork
cr.networkMode = *networkMode
+ cr.checkContainerd = *checkContainerd
if *cgroupParentSubsystem != "" {
p := findCgroup(*cgroupParentSubsystem)
cr.setCgroupParent = p
"mounts": {"/tmp": {"kind": "tmp"} },
"output_path": "/tmp",
"priority": 1,
- "runtime_constraints": {}
+ "runtime_constraints": {}
}`, nil, 0, func(t *TestDockerClient) {
t.logWriter.Write(dockerLog(1, "hello world\n"))
t.logWriter.Close()
}
+func (s *TestSuite) TestRunTimeExceeded(c *C) {
+ api, _, _ := s.fullRunHelper(c, `{
+ "command": ["sleep", "3"],
+ "container_image": "d4ab34d3d4f8a72f5c4973051ae69fab+122",
+ "cwd": ".",
+ "environment": {},
+ "mounts": {"/tmp": {"kind": "tmp"} },
+ "output_path": "/tmp",
+ "priority": 1,
+ "runtime_constraints": {},
+ "scheduling_parameters":{"max_run_time": 1}
+}`, nil, 0, func(t *TestDockerClient) {
+ time.Sleep(3 * time.Second)
+ t.logWriter.Close()
+ })
+
+ c.Check(api.CalledWith("container.state", "Cancelled"), NotNil)
+ c.Check(api.Logs["crunch-run"].String(), Matches, "(?ms).*maximum run time exceeded.*")
+}
+
func (s *TestSuite) TestCrunchstat(c *C) {
api, _, _ := s.fullRunHelper(c, `{
"command": ["sleep", "1"],
c.Check(api.CalledWith("collection.manifest_text", ". 34819d7beeabb9260a5c854bc85b3e44+10 0:10:secret.conf\n"), IsNil)
c.Check(api.CalledWith("collection.manifest_text", ""), NotNil)
}
+
+type FakeProcess struct {
+ cmdLine []string
+}
+
+func (fp FakeProcess) CmdlineSlice() ([]string, error) {
+ return fp.cmdLine, nil
+}
+
+func (s *TestSuite) helpCheckContainerd(c *C, lp func() ([]PsProcess, error)) error {
+ kc := &KeepTestClient{}
+ defer kc.Close()
+ cr, err := NewContainerRunner(s.client, &ArvTestClient{callraw: true}, kc, s.docker, "zzzzz-zzzzz-zzzzzzzzzzzzzzz")
+ cr.checkContainerd = time.Duration(100 * time.Millisecond)
+ c.Assert(err, IsNil)
+ cr.ListProcesses = lp
+
+ s.docker.fn = func(t *TestDockerClient) {
+ time.Sleep(1 * time.Second)
+ t.logWriter.Close()
+ }
+
+ err = cr.CreateContainer()
+ c.Check(err, IsNil)
+
+ err = cr.StartContainer()
+ c.Check(err, IsNil)
+
+ err = cr.WaitFinish()
+ return err
+
+}
+
+func (s *TestSuite) TestCheckContainerdPresent(c *C) {
+ err := s.helpCheckContainerd(c, func() ([]PsProcess, error) {
+ return []PsProcess{FakeProcess{[]string{"docker-containerd"}}}, nil
+ })
+ c.Check(err, IsNil)
+}
+
+func (s *TestSuite) TestCheckContainerdMissing(c *C) {
+ err := s.helpCheckContainerd(c, func() ([]PsProcess, error) {
+ return []PsProcess{FakeProcess{[]string{"abc"}}}, nil
+ })
+ c.Check(err, ErrorMatches, `'containerd' not found in process list.`)
+}
cr.CrunchLog.Print("Goodbye")
cr.CrunchLog.Close()
- c.Check(api.Calls > 1, Equals, true)
+ c.Check(api.Calls > 0, Equals, true)
c.Check(api.Calls < 2000000, Equals, true)
mt, err := cr.LogCollection.MarshalManifest(".")
}
var updated arvados.Collection
defer c.pdhs.Remove(coll.UUID)
- err := client.RequestAndDecode(&updated, "PATCH", "/arvados/v1/collections/"+coll.UUID, client.UpdateBody(coll), nil)
+ err := client.RequestAndDecode(&updated, "PATCH", "arvados/v1/collections/"+coll.UUID, client.UpdateBody(coll), nil)
if err == nil {
c.collections.Add(client.AuthToken+"\000"+coll.PortableDataHash, &cachedCollection{
expire: time.Now().Add(time.Duration(c.TTL)),
var newCollection arvados.Collection
arv := arvados.NewClientFromEnv()
arv.AuthToken = arvadostest.ActiveToken
- err = arv.RequestAndDecode(&newCollection, "POST", "/arvados/v1/collections", bytes.NewBufferString(url.Values{"collection": {"{}"}}.Encode()), nil)
+ err = arv.RequestAndDecode(&newCollection, "POST", "arvados/v1/collections", bytes.NewBufferString(url.Values{"collection": {"{}"}}.Encode()), nil)
c.Assert(err, check.IsNil)
readPath, writePath, pdhPath := pathFunc(newCollection)
if xff := r.Header.Get("X-Forwarded-For"); xff != "" {
remoteAddr = xff + "," + remoteAddr
}
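+ // Trust X-Forwarded-Proto (when it is not plain http) so absolute
+ // URLs built from r.URL preserve the client's original https scheme.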
+ if xfp := r.Header.Get("X-Forwarded-Proto"); xfp != "" && xfp != "http" {
+ r.URL.Scheme = xfp
+ }
w := httpserver.WrapResponseWriter(wOrig)
defer func() {
u = newu
}
redir := (&url.URL{
+ Scheme: r.URL.Scheme,
Host: r.Host,
Path: u.Path,
RawQuery: redirQuery.Encode(),
if resp.Code != http.StatusSeeOther {
return resp
}
- c.Check(resp.Body.String(), check.Matches, `.*href="//`+regexp.QuoteMeta(html.EscapeString(hostPath))+`(\?[^"]*)?".*`)
+ c.Check(resp.Body.String(), check.Matches, `.*href="http://`+regexp.QuoteMeta(html.EscapeString(hostPath))+`(\?[^"]*)?".*`)
cookies := (&http.Response{Header: resp.Header()}).Cookies()
u, _ = u.Parse(resp.Header().Get("Location"))
kc := runProxy(c, nil, false)
defer closeListener()
- // Point keepproxy to a non-existant keepstore
+ // Point keepproxy at a non-existent keepstore
locals := map[string]string{
TestProxyUUID: "http://localhost:12345",
}
return super(ComputeNodeShutdownActor, self)._finished()
def cancel_shutdown(self, reason, **kwargs):
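+ # Returns True if this call cancelled the shutdown; False if
+ # cancellation is no longer possible (or it was already cancelled).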
+ if not self.cancellable:
+ return False
if self.cancel_reason is not None:
# already cancelled
- return
+ return False
self.cancel_reason = reason
self._logger.info("Shutdown cancelled: %s.", reason)
self._finished(success_flag=False)
+ return True
def _cancel_on_exception(orig_func):
@functools.wraps(orig_func)
self._logger.info("Starting shutdown")
arv_node = self._arvados_node()
if self._cloud.destroy_node(self.cloud_node):
+ self.cancellable = False
self._logger.info("Shutdown success")
if arv_node:
self._later.clean_arvados_node(arv_node)
def __init__(self, cloud_node, cloud_node_start_time, shutdown_timer,
timer_actor, update_actor, cloud_client,
arvados_node=None, poll_stale_after=600, node_stale_after=3600,
- boot_fail_after=1800
+ boot_fail_after=1800, consecutive_idle_count=0
):
super(ComputeNodeMonitorActor, self).__init__()
self._later = self.actor_ref.tell_proxy()
self.boot_fail_after = boot_fail_after
self.subscribers = set()
self.arvados_node = None
+ self.consecutive_idle_count = consecutive_idle_count
+ self.consecutive_idle = 0
self._later.update_arvados_node(arvados_node)
self.last_shutdown_opening = None
self._later.consider_shutdown()
else:
boot_grace = "boot exceeded"
- # API server side not implemented yet.
- idle_grace = 'idle exceeded'
+ if crunch_worker_state == "idle":
+ # Must report as "idle" at least "consecutive_idle_count" times
+ if self.consecutive_idle < self.consecutive_idle_count:
+ idle_grace = 'idle wait'
+ else:
+ idle_grace = 'idle exceeded'
+ else:
+ idle_grace = 'not idle'
node_state = (crunch_worker_state, window, boot_grace, idle_grace)
t = transitions[node_state]
if arvados_node is not None:
self.arvados_node = arvados_node
self._update.sync_node(self.cloud_node, self.arvados_node)
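+ # Count successive polls that report "idle"; any other
+ # crunch_worker_state resets the streak used by the idle-grace
+ # check.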
+ if self.arvados_node['crunch_worker_state'] == "idle":
+ self.consecutive_idle += 1
+ else:
+ self.consecutive_idle = 0
self._later.consider_shutdown()
from __future__ import absolute_import, print_function
-import subprocess
+import subprocess32 as subprocess
import time
from . import ComputeNodeMonitorActor
return driver_class(**auth_kwargs)
@RetryMixin._retry()
- def _set_sizes(self):
- self.sizes = {sz.id: sz for sz in self.real.list_sizes()}
+ def sizes(self):
+ if self._sizes is None:
+ self._sizes = {sz.id: sz for sz in self.real.list_sizes()}
+ return self._sizes
def __init__(self, auth_kwargs, list_kwargs, create_kwargs,
driver_class, retry_wait=1, max_retry_wait=180):
if new_pair is not None:
self.create_kwargs[new_pair[0]] = new_pair[1]
- self._set_sizes()
+ self._sizes = None
def _init_ping_host(self, ping_host):
self.ping_host = ping_host
for n in nodes:
# Need to populate Node.size
if not n.size:
- n.size = self.sizes[n.extra["properties"]["hardwareProfile"]["vmSize"]]
+ n.size = self.sizes()[n.extra["properties"]["hardwareProfile"]["vmSize"]]
n.extra['arvados_node_size'] = n.extra.get('tags', {}).get('arvados_node_size')
return nodes
nodelist = super(ComputeNodeDriver, self).list_nodes()
for node in nodelist:
self._ensure_private_ip(node)
- node.size = self.sizes["1"]
+ node.size = self.sizes()["1"]
return nodelist
def create_node(self, size, arvados_node):
nodes = super(ComputeNodeDriver, self).list_nodes()
for n in nodes:
if not n.size:
- n.size = self.sizes[n.extra["instance_type"]]
+ n.size = self.sizes()[n.extra["instance_type"]]
n.extra['arvados_node_size'] = n.extra.get('tags', {}).get('arvados_node_size')
return nodes
super(ComputeNodeDriver, self).__init__(
auth_kwargs, list_kwargs, create_kwargs,
driver_class)
- self._sizes_by_id = {sz.id: sz for sz in self.sizes.itervalues()}
self._disktype_links = {dt.name: self._object_link(dt)
for dt in self.real.ex_list_disktypes()}
# It's supposed to be the actual size object. Check that it's not,
# and monkeypatch the results when that's the case.
if not hasattr(node.size, 'id'):
- node.size = self._sizes_by_id[node.size]
+ node.size = self.sizes()[node.size]
# Get arvados-assigned cloud size id
node.extra['arvados_node_size'] = node.extra.get('metadata', {}).get('arvados_node_size')
return nodelist
'boot_fail_after': str(sys.maxint),
'node_stale_after': str(60 * 60 * 2),
'watchdog': '600',
- 'node_mem_scaling': '0.95'},
+ 'node_mem_scaling': '0.95',
+ 'consecutive_idle_count': '2'},
'Manage': {'address': '127.0.0.1',
'port': '-1',
'ManagementToken': ''},
node_setup_class=dispatch.ComputeNodeSetupActor,
node_shutdown_class=dispatch.ComputeNodeShutdownActor,
node_actor_class=dispatch.ComputeNodeMonitorActor,
- max_total_price=0):
+ max_total_price=0,
+ consecutive_idle_count=1):
super(NodeManagerDaemonActor, self).__init__()
self._node_setup = node_setup_class
self._node_shutdown = node_shutdown_class
self.poll_stale_after = poll_stale_after
self.boot_fail_after = boot_fail_after
self.node_stale_after = node_stale_after
+ self.consecutive_idle_count = consecutive_idle_count
self.last_polls = {}
for poll_name in ['server_wishlist', 'arvados_nodes', 'cloud_nodes']:
poll_actor = locals()[poll_name + '_actor']
poll_stale_after=self.poll_stale_after,
node_stale_after=self.node_stale_after,
cloud_client=self._cloud_driver,
- boot_fail_after=self.boot_fail_after)
+ boot_fail_after=self.boot_fail_after,
+ consecutive_idle_count=self.consecutive_idle_count)
actorTell = actor.tell_proxy()
actorTell.subscribe(self._later.node_can_shutdown)
self._cloud_nodes_actor.subscribe_to(cloud_node.id,
nodes_wanted = self._nodes_wanted(cloud_size)
if nodes_wanted < 1:
return None
- arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
- self._logger.info("Want %i more %s nodes. Booting a node.",
- nodes_wanted, cloud_size.id)
- new_setup = self._node_setup.start(
- timer_actor=self._timer,
- arvados_client=self._new_arvados(),
- arvados_node=arvados_node,
- cloud_client=self._new_cloud(),
- cloud_size=self.server_calculator.find_size(cloud_size.id)).proxy()
- self.booting[new_setup.actor_ref.actor_urn] = new_setup
- self.sizes_booting[new_setup.actor_ref.actor_urn] = cloud_size
-
- if arvados_node is not None:
- self.arvados_nodes[arvados_node['uuid']].assignment_time = (
- time.time())
- new_setup.subscribe(self._later.node_setup_finished)
+
+ if not self.cancel_node_shutdown(cloud_size):
+ arvados_node = self.arvados_nodes.find_stale_node(self.node_stale_after)
+ self._logger.info("Want %i more %s nodes. Booting a node.",
+ nodes_wanted, cloud_size.id)
+ new_setup = self._node_setup.start(
+ timer_actor=self._timer,
+ arvados_client=self._new_arvados(),
+ arvados_node=arvados_node,
+ cloud_client=self._new_cloud(),
+ cloud_size=self.server_calculator.find_size(cloud_size.id))
+ self.booting[new_setup.actor_urn] = new_setup.proxy()
+ self.sizes_booting[new_setup.actor_urn] = cloud_size
+
+ if arvados_node is not None:
+ self.arvados_nodes[arvados_node['uuid']].assignment_time = (
+ time.time())
+ new_setup.tell_proxy().subscribe(self._later.node_setup_finished)
+
if nodes_wanted > 1:
self._later.start_node(cloud_size)
if (nodes_excess < 1) or not self.booting:
return None
for key, node in self.booting.iteritems():
- if node and node.cloud_size.get().id == size.id and node.stop_if_no_cloud_node().get():
- del self.booting[key]
- del self.sizes_booting[key]
+ try:
+ if node and node.cloud_size.get().id == size.id and node.stop_if_no_cloud_node().get(2):
+ del self.booting[key]
+ del self.sizes_booting[key]
+ if nodes_excess > 1:
+ self._later.stop_booting_node(size)
+ return
+ except pykka.Timeout:
+ pass
- if nodes_excess > 1:
- self._later.stop_booting_node(size)
- break
+ @_check_poll_freshness
+ def cancel_node_shutdown(self, size):
+ # Go through shutdown actors and see if there are any of the appropriate size that can be cancelled
+ for record in self.cloud_nodes.nodes.itervalues():
+ try:
+ if (record.shutdown_actor is not None and
+ record.cloud_node.size.id == size.id and
+ record.shutdown_actor.cancel_shutdown("Node size is in wishlist").get(2)):
+ return True
+ except (pykka.ActorDeadError, pykka.Timeout) as e:
+ pass
+ return False
def _begin_node_shutdown(self, node_actor, cancellable):
cloud_node_obj = node_actor.cloud_node.get()
import logging
import re
-import subprocess
+import subprocess32 as subprocess
import arvados.util
config.getint('Daemon', 'boot_fail_after'),
config.getint('Daemon', 'node_stale_after'),
node_setup, node_shutdown, node_monitor,
- max_total_price=config.getfloat('Daemon', 'max_total_price')).tell_proxy()
+ max_total_price=config.getfloat('Daemon', 'max_total_price'),
+ consecutive_idle_count=config.getint('Daemon', 'consecutive_idle_count'),).tell_proxy()
watchdog = WatchdogActor.start(config.getint('Daemon', 'watchdog'),
cloud_node_poller.actor_ref,
from __future__ import absolute_import, print_function
-import subprocess
+import subprocess32 as subprocess
from . import clientactor
from . import config
# an Arvados node that hasn't been updated for this long.
node_stale_after = 14400
+# Number of consecutive times a node must report as "idle" before it
+# will be considered eligible for shutdown. Node status is checked
+# each poll period, and a node can go idle at any point during a poll
+# period (meaning a node that has only been idle for 1 second could
+# be reported as idle). With a 60 second poll period, three
+# consecutive status updates of "idle" suggest the node has been idle
+# for at least 121 seconds.
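+# (That is, (consecutive_idle_count - 1) * poll_period + 1 =
+# (3 - 1) * 60 + 1 = 121.)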
+consecutive_idle_count = 3
+
# Scaling factor to be applied to nodes' available RAM size. Usually there's a
# variable discrepancy between the advertised RAM value on cloud nodes and the
# actual amount available.
# File path for Certificate Authorities
certs_file = /etc/ssl/certs/ca-certificates.crt
+
[Logging]
# Log file path
file = /var/log/arvados/node-manager.log
# an Arvados node that hasn't been updated for this long.
node_stale_after = 14400
+# Number of consecutive times a node must report as "idle" before it
+# will be considered eligible for shutdown. Node status is checked
+# each poll period, and a node can go idle at any point during a poll
+# period (meaning a node that has only been idle for 1 second could
+# be reported as idle). With a 60 second poll period, three
+# consecutive status updates of "idle" suggest the node has been idle
+# for at least 121 seconds.
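+# (That is, (consecutive_idle_count - 1) * poll_period + 1 =
+# (3 - 1) * 60 + 1 = 121.)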
+consecutive_idle_count = 3
+
# Scaling factor to be applied to nodes' available RAM size. Usually there's a
# variable discrepancy between the advertised RAM value on cloud nodes and the
# actual amount available.
# an Arvados node that hasn't been updated for this long.
node_stale_after = 14400
+# Number of consecutive times a node must report as "idle" before it
+# will be considered eligible for shutdown. Node status is checked
+# each poll period, and a node can go idle at any point during a poll
+# period (meaning a node that has only been idle for 1 second could
+# be reported as idle). With a 60 second poll period, three
+# consecutive status updates of "idle" suggest the node has been idle
+# for at least 121 seconds.
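+# (That is, (consecutive_idle_count - 1) * poll_period + 1 =
+# (3 - 1) * 60 + 1 = 121.)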
+consecutive_idle_count = 3
+
# Scaling factor to be applied to nodes' available RAM size. Usually there's a
# variable discrepancy between the advertised RAM value on cloud nodes and the
# actual amount available.
'future',
'pykka',
'python-daemon',
- 'setuptools'
+ 'setuptools',
+ 'subprocess32>=3.5.1',
],
dependency_links=[
"https://github.com/curoverse/libcloud/archive/apache-libcloud-2.3.1.dev1.zip"
'pbr<1.7.0',
'mock>=1.0',
'apache-libcloud>=2.3.1.dev1',
+ 'subprocess32>=3.5.1',
],
zip_safe=False
)
"""
-import subprocess
+import subprocess32 as subprocess
import os
import sys
import re
global compute_nodes
if g.group(1) in compute_nodes:
del compute_nodes[g.group(1)]
- return 0
+ return 0
+ else:
+ return 1
+
def jobs_req(g):
global all_jobs
self.make_actor()
self.shutdowns._set_state(True, 600)
self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT),
- (False, "node state is ('unpaired', 'open', 'boot wait', 'idle exceeded')"))
+ (False, "node state is ('unpaired', 'open', 'boot wait', 'not idle')"))
def test_shutdown_when_invalid_cloud_node_size(self):
self.make_mocks(1)
def test_shutdown_without_arvados_node(self):
self.make_actor(start_time=0)
self.shutdowns._set_state(True, 600)
- self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'idle exceeded')"),
+ self.assertEquals((True, "node state is ('down', 'open', 'boot exceeded', 'not idle')"),
self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing(self):
last_ping_at='1970-01-01T01:02:03.04050607Z')
self.make_actor(10, arv_node)
self.shutdowns._set_state(True, 600)
- self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+ self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_running_broken(self):
self.make_actor(12, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
- self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"),
+ self.assertEquals((True, "node state is ('down', 'open', 'boot wait', 'not idle')"),
self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_missing_broken(self):
self.make_actor(11, arv_node)
self.shutdowns._set_state(True, 600)
self.cloud_client.broken.return_value = True
- self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'idle exceeded')"))
+ self.assertEquals(self.node_actor.shutdown_eligible().get(self.TIMEOUT), (True, "node state is ('down', 'open', 'boot wait', 'not idle')"))
def test_no_shutdown_when_window_closed(self):
self.make_actor(3, testutil.arvados_node_mock(3, job_uuid=None))
def test_no_shutdown_when_node_running_job(self):
self.make_actor(4, testutil.arvados_node_mock(4, job_uuid=True))
self.shutdowns._set_state(True, 600)
- self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'idle exceeded')"),
+ self.assertEquals((False, "node state is ('busy', 'open', 'boot wait', 'not idle')"),
self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_shutdown_when_node_state_unknown(self):
self.make_actor(5, testutil.arvados_node_mock(
5, crunch_worker_state='fail'))
self.shutdowns._set_state(True, 600)
- self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'idle exceeded')"),
+ self.assertEquals((True, "node state is ('fail', 'open', 'boot wait', 'not idle')"),
self.node_actor.shutdown_eligible().get(self.TIMEOUT))
def test_no_shutdown_when_node_state_stale(self):
from __future__ import absolute_import, print_function
-import subprocess
+import subprocess32 as subprocess
import time
import unittest
ComputeNodeSetupActorTestCase, \
ComputeNodeUpdateActorTestCase
-@mock.patch('subprocess.check_output')
+@mock.patch('subprocess32.check_output')
class SLURMComputeNodeShutdownActorTestCase(ComputeNodeShutdownActorMixin,
unittest.TestCase):
ACTOR_CLASS = slurm_dispatch.ComputeNodeShutdownActor
super(SLURMComputeNodeShutdownActorTestCase,
self).test_uncancellable_shutdown()
-@mock.patch('subprocess.check_output')
+@mock.patch('subprocess32.check_output')
class SLURMComputeNodeUpdateActorTestCase(ComputeNodeUpdateActorTestCase):
ACTOR_CLASS = slurm_dispatch.ComputeNodeUpdateActor
class SLURMComputeNodeSetupActorTestCase(ComputeNodeSetupActorTestCase):
ACTOR_CLASS = slurm_dispatch.ComputeNodeSetupActor
- @mock.patch('subprocess.check_output')
+ @mock.patch('subprocess32.check_output')
def test_update_node_features(self, check_output):
# `scontrol update` happens only if the Arvados node record
# has a hostname. ComputeNodeSetupActorTestCase.make_mocks
self.wait_for_assignment(self.setup_actor, 'cloud_node')
check_output.assert_called_with(['scontrol', 'update', 'NodeName=compute99', 'Weight=1000', 'Features=instancetype=z1.test'])
- @mock.patch('subprocess.check_output')
+ @mock.patch('subprocess32.check_output')
def test_failed_arvados_calls_retried(self, check_output):
super(SLURMComputeNodeSetupActorTestCase, self).test_failed_arvados_calls_retried()
- @mock.patch('subprocess.check_output')
+ @mock.patch('subprocess32.check_output')
def test_subscribe(self, check_output):
super(SLURMComputeNodeSetupActorTestCase, self).test_subscribe()
- @mock.patch('subprocess.check_output')
+ @mock.patch('subprocess32.check_output')
def test_creation_with_arvados_node(self, check_output):
super(SLURMComputeNodeSetupActorTestCase, self).test_creation_with_arvados_node()
monitor = self.monitor_list()[0].proxy()
self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
self.assertTrue(self.node_shutdown.start.called)
+ getmock = mock.MagicMock()
+ getmock.get.return_value = False
+ self.last_shutdown.cancel_shutdown.return_value = getmock
self.daemon.update_server_wishlist(
[testutil.MockSize(6)]).get(self.TIMEOUT)
self.busywait(lambda: self.node_setup.start.called)
+ def test_nodes_shutting_down_cancelled(self):
+ size = testutil.MockSize(6)
+ cloud_node = testutil.cloud_node_mock(6, size=size)
+ self.make_daemon([cloud_node], [testutil.arvados_node_mock(6, crunch_worker_state='down')],
+ avail_sizes=[(size, {"cores":1})])
+ self.assertEqual(1, self.alive_monitor_count())
+ monitor = self.monitor_list()[0].proxy()
+ self.daemon.node_can_shutdown(monitor).get(self.TIMEOUT)
+ self.assertTrue(self.node_shutdown.start.called)
+ self.daemon.update_server_wishlist(
+ [testutil.MockSize(6)]).get(self.TIMEOUT)
+ self.busywait(lambda: self.last_shutdown.cancel_shutdown.called)
+
def test_nodes_shutting_down_not_replaced_at_max_nodes(self):
cloud_node = testutil.cloud_node_mock(7)
self.make_daemon([cloud_node], [testutil.arvados_node_mock(7)],
super(JobQueueMonitorActorTestCase, self).build_monitor(*args, **kwargs)
self.client.jobs().queue().execute.side_effect = side_effect
- @mock.patch("subprocess.check_call")
- @mock.patch("subprocess.check_output")
+ @mock.patch("subprocess32.check_call")
+ @mock.patch("subprocess32.check_output")
def test_unsatisfiable_jobs(self, mock_squeue, mock_scancel):
job_uuid = 'zzzzz-8i9sb-zzzzzzzzzzzzzzz'
container_uuid = 'yyyyy-dz642-yyyyyyyyyyyyyyy'
self.client.jobs().cancel.assert_called_with(uuid=job_uuid)
mock_scancel.assert_called_with(['scancel', '--name='+container_uuid])
- @mock.patch("subprocess.check_output")
+ @mock.patch("subprocess32.check_output")
def test_subscribers_get_server_lists(self, mock_squeue):
mock_squeue.return_value = ""
self.subscriber.assert_called_with([testutil.MockSize(1),
testutil.MockSize(2)])
- @mock.patch("subprocess.check_output")
+ @mock.patch("subprocess32.check_output")
def test_squeue_server_list(self, mock_squeue):
mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)|1234567890
2|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)|1234567890
self.subscriber.assert_called_with([testutil.MockSize(1),
testutil.MockSize(2)])
- @mock.patch("subprocess.check_output")
+ @mock.patch("subprocess32.check_output")
def test_squeue_server_list_suffix(self, mock_squeue):
mock_squeue.return_value = """1|1024M|0|(ReqNodeNotAvail, UnavailableNodes:compute123)|zzzzz-dz642-zzzzzzzzzzzzzzy|(null)|1234567890
1|2G|0|(ReqNodeNotAvail)|zzzzz-dz642-zzzzzzzzzzzzzzz|(null)|1234567890
self.subscriber.assert_called_with([testutil.MockSize(1),
testutil.MockSize(2)])
- @mock.patch("subprocess.check_output")
+ @mock.patch("subprocess32.check_output")
def test_squeue_server_list_instancetype_constraint(self, mock_squeue):
mock_squeue.return_value = """1|1024|0|(Resources)|zzzzz-dz642-zzzzzzzzzzzzzzy|instancetype=z2.test|1234567890\n"""
super(JobQueueMonitorActorTestCase, self).build_monitor(jobqueue.ServerCalculator(
*args, **kwargs)
self.client.nodes().list().execute.side_effect = side_effect
- @mock.patch("subprocess.check_output")
+ @mock.patch("subprocess32.check_output")
def test_uuid_is_subscription_key(self, sinfo_mock):
sinfo_mock.return_value = ""
node = testutil.arvados_node_mock()
self.subscriber.assert_called_with(node)
self.assertEqual("down", node["crunch_worker_state"])
- @mock.patch("subprocess.check_output")
+ @mock.patch("subprocess32.check_output")
def test_update_from_sinfo(self, sinfo_mock):
sinfo_mock.return_value = """compute1|idle|instancetype=a1.test
compute2|alloc|(null)
"revision": "d682213848ed68c0a260ca37d6dd5ace8423f5ba",
"revisionTime": "2017-12-05T20:32:29Z"
},
+ {
+ "checksumSHA1": "st4vb0GmDeoKbsfxdpNZ2MPl76M=",
+ "path": "github.com/StackExchange/wmi",
+ "revision": "cdffdb33acae0e14efff2628f9bae377b597840e",
+ "revisionTime": "2018-04-12T20:51:11Z"
+ },
{
"checksumSHA1": "spyv5/YFBjYyZLZa1U2LBfDR8PM=",
"path": "github.com/beorn7/perks/quantile",
"revision": "0ca9ea5df5451ffdf184b4428c902747c2c11cd7",
"revisionTime": "2017-03-27T23:54:44Z"
},
+ {
+ "checksumSHA1": "Kqv7bA4oJG0nPwQvGWDwGGaKONo=",
+ "path": "github.com/go-ole/go-ole",
+ "revision": "7a0fa49edf48165190530c675167e2f319a05268",
+ "revisionTime": "2018-06-25T08:58:08Z"
+ },
+ {
+ "checksumSHA1": "PArleDBtadu2qO4hJwHR8a3IOTA=",
+ "path": "github.com/go-ole/go-ole/oleutil",
+ "revision": "7a0fa49edf48165190530c675167e2f319a05268",
+ "revisionTime": "2018-06-25T08:58:08Z"
+ },
{
"checksumSHA1": "wn2shNJMwRZpvuvkf1s7h0wvqHI=",
"path": "github.com/gogo/protobuf/proto",
"revision": "1744e2970ca51c86172c8190fadad617561ed6e7",
"revisionTime": "2017-11-10T11:01:46Z"
},
+ {
+ "checksumSHA1": "q14d3C3xvWevU3dSv4P5K0+OSD0=",
+ "path": "github.com/shirou/gopsutil/cpu",
+ "revision": "63728fcf6b24475ecfea044e22242447666c2f52",
+ "revisionTime": "2018-07-05T13:28:12Z"
+ },
+ {
+ "checksumSHA1": "LZ9GloiGLTISmQ4dalK2XspH6Wo=",
+ "path": "github.com/shirou/gopsutil/host",
+ "revision": "63728fcf6b24475ecfea044e22242447666c2f52",
+ "revisionTime": "2018-07-05T13:28:12Z"
+ },
+ {
+ "checksumSHA1": "cyoqI0gryzjxGTkaAfyUqMiuUR0=",
+ "path": "github.com/shirou/gopsutil/internal/common",
+ "revision": "63728fcf6b24475ecfea044e22242447666c2f52",
+ "revisionTime": "2018-07-05T13:28:12Z"
+ },
+ {
+ "checksumSHA1": "vEQLjAO5T5K9zXblEMYdoaBZzj0=",
+ "path": "github.com/shirou/gopsutil/mem",
+ "revision": "63728fcf6b24475ecfea044e22242447666c2f52",
+ "revisionTime": "2018-07-05T13:28:12Z"
+ },
+ {
+ "checksumSHA1": "KMWFRa0DVpabo9d8euB4RYjUBQE=",
+ "path": "github.com/shirou/gopsutil/net",
+ "revision": "63728fcf6b24475ecfea044e22242447666c2f52",
+ "revisionTime": "2018-07-05T13:28:12Z"
+ },
+ {
+ "checksumSHA1": "fbO7c1gv1kSvWKOb/+5HUWFkBaA=",
+ "path": "github.com/shirou/gopsutil/process",
+ "revision": "63728fcf6b24475ecfea044e22242447666c2f52",
+ "revisionTime": "2018-07-05T13:28:12Z"
+ },
+ {
+ "checksumSHA1": "Nve7SpDmjsv6+rhkXAkfg/UQx94=",
+ "path": "github.com/shirou/w32",
+ "revision": "bb4de0191aa41b5507caa14b0650cdbddcd9280b",
+ "revisionTime": "2016-09-30T03:27:40Z"
+ },
{
"checksumSHA1": "8QeSG127zQqbA+YfkO1WkKx/iUI=",
"path": "github.com/src-d/gcfg",