Merge branch '18842-arv-mount-disk-config' refs #18842
authorPeter Amstutz <peter.amstutz@curii.com>
Fri, 2 Dec 2022 18:28:28 +0000 (13:28 -0500)
committerPeter Amstutz <peter.amstutz@curii.com>
Fri, 2 Dec 2022 18:28:28 +0000 (13:28 -0500)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

doc/user/cwl/cwl-run-options.html.textile.liquid
doc/user/cwl/cwl-style.html.textile.liquid
sdk/python/tests/test_collections.py
tools/arvbox/bin/arvbox
tools/arvbox/lib/arvbox/docker/common.sh
tools/arvbox/lib/arvbox/docker/service/ready/run-service
tools/arvbox/lib/arvbox/docker/service/sdk/run-service

index 94e46ae1bc3487179dc1a72167d2d8bf94235085..703ec89139baf45afcafc8cf84c93b304ecee2b1 100644 (file)
@@ -38,34 +38,40 @@ table(table table-bordered table-condensed).
 |==--output-name OUTPUT_NAME==|Name to use for collection that stores the final output.|
 |==--output-tags OUTPUT_TAGS==|Tags for the final output collection separated by commas, e.g., =='--output-tags tag0,tag1,tag2'==.|
 |==--ignore-docker-for-reuse==|Ignore Docker image version when deciding whether to reuse past containers.|
-|==--submit==|              Submit workflow runner to Arvados to manage the workflow (default).|
-|==--local==|               Run workflow on local host (still submits containers to Arvados).|
+|==--submit==|              Submit workflow to run on Arvados.|
+|==--local==|               Run workflow on local host (submits containers to Arvados).|
 |==--create-template==|     (Deprecated) synonym for --create-workflow.|
 |==--create-workflow==|     Register an Arvados workflow that can be run from Workbench|
-|==--update-workflow== UUID|Update an existing Arvados workflow or pipeline template with the given UUID.|
+|==--update-workflow== UUID|Update an existing Arvados workflow with the given UUID.|
 |==--wait==|                After submitting workflow runner, wait for completion.|
 |==--no-wait==|             Submit workflow runner and exit.|
 |==--log-timestamps==|      Prefix logging lines with timestamp|
 |==--no-log-timestamps==|   No timestamp on logging lines|
 |==--compute-checksum==|    Compute checksum of contents while collecting outputs|
-|==--submit-runner-ram== SUBMIT_RUNNER_RAM|RAM (in MiB) required for the workflow runner (default 1024)|
-|==--submit-runner-image== SUBMIT_RUNNER_IMAGE|Docker image for workflow runner|
+|==--submit-runner-ram== SUBMIT_RUNNER_RAM|RAM (in MiB) required for the workflow runner job (default 1024)|
+|==--submit-runner-image== SUBMIT_RUNNER_IMAGE|Docker image for workflow runner job|
 |==--always-submit-runner==|When invoked with --submit --wait, always submit a runner to manage the workflow, even when only running a single CommandLineTool|
 |==--match-submitter-images==|Where Arvados has more than one Docker image of the same name, use image from the Docker instance on the submitting node.|
 |==--submit-request-uuid== UUID|Update and commit to supplied container request instead of creating a new one.|
 |==--submit-runner-cluster== CLUSTER_ID|Submit workflow runner to a remote cluster|
-|==--name NAME==|Name to use for workflow execution instance.|
+|==--collection-cache-size== COLLECTION_CACHE_SIZE|Collection cache size (in MiB, default 256).|
+|==--name== NAME|Name to use for workflow execution instance.|
 |==--on-error== {stop,continue}|Desired workflow behavior when a step fails.  One of 'stop' (do not submit any more steps) or 'continue' (may submit other steps that are not downstream from the error). Default is 'continue'.|
-|==--enable-dev==|Enable loading and running development versions of CWL spec.|
-|==--storage-classes== STORAGE_CLASSES|Specify comma separated list of storage classes to be used when saving the final workflow output to Keep.|
-|==--intermediate-storage-classes== STORAGE_CLASSES|Specify comma separated list of storage classes to be used when intermediate workflow output to Keep.|
+|==--enable-dev==|Enable loading and running development versions of the CWL standards.|
+|==--storage-classes== STORAGE_CLASSES|Specify comma separated list of storage classes to be used when saving final workflow output to Keep.|
+|==--intermediate-storage-classes== INTERMEDIATE_STORAGE_CLASSES|Specify comma separated list of storage classes to be used when saving intermediate workflow output to Keep.|
 |==--intermediate-output-ttl== N|If N > 0, intermediate output collections will be trashed N seconds after creation. Default is 0 (don't trash).|
 |==--priority== PRIORITY|Workflow priority (range 1..1000, higher has precedence over lower)|
-|==--thread-count== THREAD_COUNT|Number of threads to use for container submit and output collection.|
+|==--thread-count== THREAD_COUNT|Number of threads to use for job submit and output collection.|
 |==--http-timeout== HTTP_TIMEOUT|API request timeout in seconds. Default is 300 seconds (5 minutes).|
-|==--enable-preemptible==|Use preemptible instances. Control individual steps with "arv:UsePreemptible":cwl-extensions.html#UsePreemptible hint.|
+|==--defer-downloads==|When submitting a workflow, defer downloading HTTP URLs to workflow launch instead of downloading to Keep before submit.|
+|==--varying-url-params== VARYING_URL_PARAMS|A comma separated list of URL query parameters that should be ignored when storing HTTP URLs in Keep.|
+|==--prefer-cached-downloads==|If an HTTP URL is found in Keep, skip upstream URL freshness check (will not notice if the upstream has changed, but also not error if upstream is unavailable).|
+|==--enable-preemptible==|Use preemptible instances. Control individual steps with arv:UsePreemptible hint.|
 |==--disable-preemptible==|Don't use preemptible instances.|
-|==--skip-schemas==|Skip loading of extension schemas (the $schemas section).|
+|==--copy-deps==|         Copy dependencies into the destination project.|
+|==--no-copy-deps==|      Leave dependencies where they are.|
+|==--skip-schemas==|      Skip loading of schemas|
 |==--trash-intermediate==|Immediately trash intermediate outputs on workflow success.|
 |==--no-trash-intermediate==|Do not trash intermediate outputs (default).|
 
index 303ae37e9e94b98d5694cd8de5c71930ca6196ce..911c9ba5a539f1769178f6ca7984b34736aededa 100644 (file)
@@ -172,7 +172,7 @@ Workflows should always provide @DockerRequirement@ in the @hints@ or @requireme
 
 h3. Build a reusable library of components
 
-Build a reusable library of components.  Share tool wrappers and subworkflows between projects.  Make use of and contribute to "community maintained workflows and tools":https://github.com/common-workflow-library and tool registries such as "Dockstore":http://dockstore.org .
+Share tool wrappers and subworkflows between projects.  Make use of and contribute to "community maintained workflows and tools":https://github.com/common-workflow-library and tool registries such as "Dockstore":http://dockstore.org .
 
 h3. Supply scripts as input parameters
 
@@ -208,7 +208,7 @@ h3. Getting the temporary and output directories
 
 You can get the designated temporary directory using @$(runtime.tmpdir)@ in your CWL file, or from the @$TMPDIR@ environment variable in your script.
 
-Similarly, you can get the designated output directory using $(runtime.outdir), or from the @HOME@ environment variable in your script.
+Similarly, you can get the designated output directory using @$(runtime.outdir)@, or from the @HOME@ environment variable in your script.
 
 h3. Specifying @ResourceRequirement@
 
@@ -234,3 +234,36 @@ steps:
         coresMin: 2
         tmpdirMin: 90000
 {% endcodeblock %}
+
+h3. Importing data into Keep
+
+You can use HTTP URLs as File input parameters and @arvados-cwl-runner@ will download them to Keep for you:
+
+{% codeblock as yaml %}
+fastq1:
+  class: File
+  location: https://example.com/genomes/sampleA_1.fastq
+fastq2:
+  class: File
+  location: https://example.com/genomes/sampleA_2.fastq
+{% endcodeblock %}
+
+Files are downloaded and stored in Keep collections with HTTP header information stored in metadata.  If a file was previously downloaded, @arvados-cwl-runner@ uses HTTP caching rules to decide if a file should be re-downloaded or not.
+
+The default behavior is to transfer the files on the client, prior to submitting the workflow run.  This guarantees the data is available when the workflow is submitted.  However, if data transfer is time consuming and you are submitting multiple workflow runs in a row, or the node submitting the workflow has limited bandwidth, you can use the @--defer-downloads@ option to have the data transfer performed by the workflow runner process on a compute node, after the workflow is submitted.
+
+@arvados-cwl-runner@ provides two additional options to control caching behavior.
+
+* @--varying-url-params@ will ignore the listed URL query parameters from any HTTP URLs when checking if a URL has already been downloaded to Keep.
+* @--prefer-cached-downloads@ will search Keep for the previously downloaded URL and use that if found, without checking the upstream resource. This means changes in the upstream resource won't be detected, but it also means the workflow will not fail if the upstream resource becomes inaccessible.
+
+One use of this is to import files from "AWS S3 signed URLs":https://docs.aws.amazon.com/AmazonS3/latest/userguide/ShareObjectPreSignedURL.html
+
+Here is an example usage.  The use of @--varying-url-params=AWSAccessKeyId,Signature,Expires@ is especially relevant: it removes these parameters from the cached URL, which means that if a new signed URL for the same object is generated later, it can be found in the cache.
+
+{% codeblock as sh %}
+arvados-cwl-runner --defer-downloads \
+                   --varying-url-params=AWSAccessKeyId,Signature,Expires \
+                   --prefer-cached-downloads \
+                   workflow.cwl params.yml
+{% endcodeblock %}
index 8aded823bde3410a68dfa0e1110e0d0537d6c868..8986cf225840054bc5cd4161f7edd0b2c3f58b32 100644 (file)
@@ -900,7 +900,7 @@ class NewCollectionTestCase(unittest.TestCase, CollectionTestMixin):
         c1.save_new()
         loc = c1.manifest_locator()
         c2 = Collection(loc)
-        self.assertEqual(c1.manifest_text, c2.manifest_text)
+        self.assertEqual(c1.manifest_text(strip=True), c2.manifest_text(strip=True))
         self.assertEqual(c1.replication_desired, c2.replication_desired)
 
     def test_replication_desired_not_loaded_if_provided(self):
@@ -909,7 +909,7 @@ class NewCollectionTestCase(unittest.TestCase, CollectionTestMixin):
         c1.save_new()
         loc = c1.manifest_locator()
         c2 = Collection(loc, replication_desired=2)
-        self.assertEqual(c1.manifest_text, c2.manifest_text)
+        self.assertEqual(c1.manifest_text(strip=True), c2.manifest_text(strip=True))
         self.assertNotEqual(c1.replication_desired, c2.replication_desired)
 
     def test_storage_classes_desired_kept_on_load(self):
@@ -918,7 +918,7 @@ class NewCollectionTestCase(unittest.TestCase, CollectionTestMixin):
         c1.save_new()
         loc = c1.manifest_locator()
         c2 = Collection(loc)
-        self.assertEqual(c1.manifest_text, c2.manifest_text)
+        self.assertEqual(c1.manifest_text(strip=True), c2.manifest_text(strip=True))
         self.assertEqual(c1.storage_classes_desired(), c2.storage_classes_desired())
 
     def test_storage_classes_change_after_save(self):
@@ -931,7 +931,7 @@ class NewCollectionTestCase(unittest.TestCase, CollectionTestMixin):
         c2.save(storage_classes=['highIO'])
         self.assertEqual(['highIO'], c2.storage_classes_desired())
         c3 = Collection(loc)
-        self.assertEqual(c1.manifest_text, c3.manifest_text)
+        self.assertEqual(c1.manifest_text(strip=True), c3.manifest_text(strip=True))
         self.assertEqual(['highIO'], c3.storage_classes_desired())
 
     def test_storage_classes_desired_not_loaded_if_provided(self):
@@ -940,7 +940,7 @@ class NewCollectionTestCase(unittest.TestCase, CollectionTestMixin):
         c1.save_new()
         loc = c1.manifest_locator()
         c2 = Collection(loc, storage_classes_desired=['default'])
-        self.assertEqual(c1.manifest_text, c2.manifest_text)
+        self.assertEqual(c1.manifest_text(strip=True), c2.manifest_text(strip=True))
         self.assertNotEqual(c1.storage_classes_desired(), c2.storage_classes_desired())
 
     def test_init_manifest(self):
index e7416947d65d2abd5023f77f1b4de997b71c910d..b3b9a5fcb441900535954012ebc0ee05f77bf10f 100755 (executable)
@@ -44,10 +44,6 @@ if test -z "$ARVADOS_ROOT" ; then
     ARVADOS_ROOT="$ARVBOX_DATA/arvados"
 fi
 
-if test -z "$COMPOSER_ROOT" ; then
-    COMPOSER_ROOT="$ARVBOX_DATA/composer"
-fi
-
 if test -z "$WORKBENCH2_ROOT" ; then
     WORKBENCH2_ROOT="$ARVBOX_DATA/workbench2"
 fi
@@ -138,7 +134,6 @@ wait_for_arvbox() {
 docker_run_dev() {
     docker run \
            "--volume=$ARVADOS_ROOT:/usr/src/arvados:rw" \
-           "--volume=$COMPOSER_ROOT:/usr/src/composer:rw" \
            "--volume=$WORKBENCH2_ROOT:/usr/src/workbench2:rw" \
            "--volume=$PG_DATA:/var/lib/postgresql:rw" \
            "--volume=$VAR_DATA:$ARVADOS_CONTAINER_PATH:rw" \
@@ -257,10 +252,6 @@ run() {
             git clone https://git.arvados.org/arvados.git "$ARVADOS_ROOT"
            git -C "$ARVADOS_ROOT" checkout $ARVADOS_BRANCH
         fi
-        if ! test -d "$COMPOSER_ROOT" ; then
-            git clone https://github.com/arvados/composer.git "$COMPOSER_ROOT"
-            git -C "$COMPOSER_ROOT" checkout arvados-fork
-        fi
         if ! test -d "$WORKBENCH2_ROOT" ; then
             git clone https://git.arvados.org/arvados-workbench2.git "$WORKBENCH2_ROOT"
            git -C "$ARVADOS_ROOT" checkout $WORKBENCH2_BRANCH
@@ -613,7 +604,6 @@ case "$subcmd" in
                "$ARVBOX_BASE/$1/gopath" \
                "$ARVBOX_BASE/$1/Rlibs" \
                "$ARVBOX_BASE/$1/arvados" \
-               "$ARVBOX_BASE/$1/composer" \
                "$ARVBOX_BASE/$1/workbench2" \
                "$ARVBOX_BASE/$2"
             echo "Created new arvbox $2"
index ba81426f0bfc35a7b916496970edf0cbb9648300..d900f0377207a7a0717ec49c84643e8a9367aff9 100644 (file)
@@ -15,6 +15,10 @@ export ARVADOS_CONTAINER_PATH=/var/lib/arvados-arvbox
 export GEM_HOME=$HOME/.gem
 GEMLOCK=$HOME/gems.lock
 
+export LANG=en_US.UTF-8
+export LANGUAGE=en_US:en
+export LC_ALL=en_US.UTF-8
+
 defaultdev=$(/sbin/ip route|awk '/default/ { print $5 }')
 dockerip=$(/sbin/ip route | grep default | awk '{ print $3 }')
 containerip=$(ip addr show $defaultdev | grep 'inet ' | sed 's/ *inet \(.*\)\/.*/\1/')
index 5007fe0be3e8e459fdd107246b5987b324051bfd..1e9aae0c45eb6a4685324c6edcc99504f6bf3dff 100755 (executable)
@@ -89,7 +89,8 @@ fi
 
 echo
 echo "Your Arvados-in-a-box is ready!"
-echo "Workbench is running at https://$localip"
-echo "Workbench2 is running at https://$localip:${services[workbench2-ssl]}"
+echo "Workbench is hosted at https://$localip"
+echo "Workbench2 is hosted at https://$localip:${services[workbench2-ssl]}"
+echo "Documentation is hosted at http://$localip:${services[doc]}"
 
 rm -r /tmp/arvbox-ready
index 62eb50c7a17387e8ed1bef6ab4fbc334a96a82c9..d3ff7e868345b383fb7c98e27a88a36ad44db1ed 100755 (executable)
@@ -14,6 +14,9 @@ cat > ~/.pip/pip.conf <<EOF
 download_cache = /var/lib/pip
 EOF
 
+cd /usr/src/arvados/sdk/ruby
+run_bundler --binstubs=binstubs
+
 cd /usr/src/arvados/sdk/cli
 run_bundler --binstubs=binstubs