Merge branch '17301-cwl-oom' refs #17301
authorPeter Amstutz <peter.amstutz@curii.com>
Mon, 25 Apr 2022 20:06:06 +0000 (16:06 -0400)
committerPeter Amstutz <peter.amstutz@curii.com>
Mon, 25 Apr 2022 20:06:06 +0000 (16:06 -0400)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

doc/architecture/manifest-format.html.textile.liquid
sdk/cwl/arvados_cwl/runner.py
sdk/cwl/tests/18888-download_def.cwl [new file with mode: 0644]
sdk/cwl/tests/arvados-tests.sh
sdk/cwl/tests/collection_per_tool/collection_per_tool_packed.cwl
sdk/cwl/tests/scripts/download_all_data.sh [new file with mode: 0755]
sdk/cwl/tests/submit_test_job.json
sdk/cwl/tests/test_submit.py
sdk/cwl/tests/wf/expect_packed.cwl
sdk/cwl/tests/wf/expect_upload_packed.cwl

index 1780768bc340ac1d823fa8c3bb7e30f499ba08f5..e1057a42ea6f82a5691a908a3e65e1fa072dc613 100644 (file)
@@ -60,6 +60,28 @@ A normalized manifest is a manifest that meets the following additional restrict
 * Blocks within a stream are ordered based on order of file tokens of the stream.  A given block is listed at most once in a stream.
 * Filename must not contain @"/"@ (the stream name represents the path prefix)
 
+h3. Estimating manifest size
+
+Here's a formula for estimating manifest size as stored in the database, assuming efficiently packed blocks.
+
+<pre>
+manifest_size =
+   + (total data size / 64 MB) * 40
+   + sum(number of files * 20)
+   + sum(size of all directory paths)
+   + sum(size of all file names)
+</pre>
+
+Here is the corresponding formula when block signatures are included.  The block signatures authorize access to fetch each block from a Keep server, as <a href="#token_signatures">described below</a>.  The signed manifest text is what is actually transferred to/from the API server and stored in RAM by @arv-mount@.  The effective upper limit on how large a collection manifest can be is determined by @API.MaxRequestSize@ in @config.yml@ as well as the maximum request size configuration in your reverse proxy or load balancer (e.g. @client_max_body_size@ in Nginx).
+
+<pre>
+manifest_size =
+   + (total data size / 64 MB) * 94
+   + sum(number of files * 20)
+   + sum(size of all directory paths)
+   + sum(size of all file names)
+</pre>
+
 h3. Example manifests
 
 A manifest with four files in two directories:
@@ -122,7 +144,7 @@ table(table table-bordered table-condensed).
 |@d41d8cd98f00b204e9800998ecf8427e+0+z@|Hint does not start with uppercase letter|
 |@d41d8cd98f00b204e9800998ecf8427e+0+Zfoo*bar@|Hint contains invalid character @*@|
 
-h3. Token signatures
+h3(#token_signatures). Token signatures
 
 A token signature (sign-hint) provides proof-of-access for a data block.  It is computed by taking a SHA1 HMAC of the blob signing token (a shared secret between the API server and keep servers), block digest, current API token, expiration timestamp, and blob signature TTL.
 
index e5a81cdc73fe77238b743705c1dd4c8847240d18..7d4310b0e0ce94b9430ded7f60ca04416e2964b9 100644 (file)
@@ -358,6 +358,7 @@ def upload_dependencies(arvrunner, name, document_loader,
             if "location" not in f and "path" in f:
                 f["location"] = f["path"]
                 del f["path"]
+            normalizeFilesDirs(f)
             optional_deps.append(f)
         visit_class(obj["default"], ("File", "Directory"), defaults_are_optional)
 
diff --git a/sdk/cwl/tests/18888-download_def.cwl b/sdk/cwl/tests/18888-download_def.cwl
new file mode 100644 (file)
index 0000000..2237c44
--- /dev/null
@@ -0,0 +1,29 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+cwlVersion: v1.2
+class: CommandLineTool
+
+$namespaces:
+  arv: "http://arvados.org/cwl#"
+
+requirements:
+  NetworkAccess:
+    networkAccess: true
+  arv:RuntimeConstraints:
+    outputDirType: keep_output_dir
+
+inputs:
+  scripts:
+    type: Directory
+    default:
+      class: Directory
+      location: scripts/
+outputs:
+  out:
+    type: Directory
+    outputBinding:
+      glob: "."
+
+arguments: [$(inputs.scripts.path)/download_all_data.sh, "."]
index 7727ebfa04005bfdece5c8fcd5af90cbdd7cedb2..9cb5234cf04db6228763ab3155154f7ee29fc5b4 100755 (executable)
@@ -3,6 +3,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+set -e
+
 if ! arv-get d7514270f356df848477718d58308cc4+94 > /dev/null ; then
     arv-put --portable-data-hash testdir/*
 fi
@@ -16,4 +18,6 @@ if ! arv-get 20850f01122e860fb878758ac1320877+71 > /dev/null ; then
     arv-put --portable-data-hash samples/sample1_S01_R1_001.fastq.gz
 fi
 
+arvados-cwl-runner 18888-download_def.cwl --scripts scripts/
+
 exec cwltest --test arvados-tests.yml --tool arvados-cwl-runner $@ -- --disable-reuse --compute-checksum --api=containers
index 1054d8f29bdb627c6b8710429534dada68edddea..c934274fcb5eedd70bfc293a1ac23375b091edf7 100644 (file)
             "inputs": [
                 {
                     "default": {
+                        "basename": "a.txt",
                         "class": "File",
-                        "location": "keep:b9fca8bf06b170b8507b80b2564ee72b+57/a.txt"
+                        "location": "keep:b9fca8bf06b170b8507b80b2564ee72b+57/a.txt",
+                        "nameext": ".txt",
+                        "nameroot": "a"
                     },
                     "id": "#step1.cwl/a",
                     "type": "File"
                 },
                 {
                     "default": {
+                        "basename": "b.txt",
                         "class": "File",
-                        "location": "keep:b9fca8bf06b170b8507b80b2564ee72b+57/b.txt"
+                        "location": "keep:b9fca8bf06b170b8507b80b2564ee72b+57/b.txt",
+                        "nameext": ".txt",
+                        "nameroot": "b"
                     },
                     "id": "#step1.cwl/b",
                     "type": "File"
             "inputs": [
                 {
                     "default": {
+                        "basename": "b.txt",
                         "class": "File",
-                        "location": "keep:8e2d09a066d96cdffdd2be41579e4e2e+57/b.txt"
+                        "location": "keep:8e2d09a066d96cdffdd2be41579e4e2e+57/b.txt",
+                        "nameext": ".txt",
+                        "nameroot": "b"
                     },
                     "id": "#step2.cwl/b",
                     "type": "File"
                 },
                 {
                     "default": {
+                        "basename": "c.txt",
                         "class": "File",
-                        "location": "keep:8e2d09a066d96cdffdd2be41579e4e2e+57/c.txt"
+                        "location": "keep:8e2d09a066d96cdffdd2be41579e4e2e+57/c.txt",
+                        "nameext": ".txt",
+                        "nameroot": "c"
                     },
                     "id": "#step2.cwl/c",
                     "type": "File"
diff --git a/sdk/cwl/tests/scripts/download_all_data.sh b/sdk/cwl/tests/scripts/download_all_data.sh
new file mode 100755 (executable)
index 0000000..d3a9d78
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Test fixture run by 18888-download_def.cwl; it only prints a marker line.
+echo bubble
index 49d5944c06d81f6f3ece48c8f37ce5421859a942..be5f6bf1a13b3872d59ae80c2b78ddf4159105f3 100644 (file)
@@ -18,6 +18,7 @@
             "basename": "renamed.txt",
             "class": "File",
             "location": "keep:99999999999999999999999999999998+99/file1.txt"
-        }]
+        }],
+        "location": "_:df80736f-f14d-4b10-b2e3-03aa27f034bb"
     }
 }
index adee2d20898770aaea1335fc617ee5b6ad553981..5092fc45756d9f07ae983ba9547e3245147a2cf9 100644 (file)
@@ -47,17 +47,22 @@ _rootDesc = None
 
 def stubs(func):
     @functools.wraps(func)
+    @mock.patch("uuid.uuid4")
     @mock.patch("arvados.commands.keepdocker.list_images_in_arv")
     @mock.patch("arvados.collection.KeepClient")
     @mock.patch("arvados.keep.KeepClient")
     @mock.patch("arvados.events.subscribe")
-    def wrapped(self, events, keep_client1, keep_client2, keepdocker, *args, **kwargs):
+    def wrapped(self, events, keep_client1, keep_client2, keepdocker, uuid4, *args, **kwargs):
         class Stubs(object):
             pass
         stubs = Stubs()
         stubs.events = events
         stubs.keepdocker = keepdocker
 
+        uuid4.side_effect = ["df80736f-f14d-4b10-b2e3-03aa27f034bb", "df80736f-f14d-4b10-b2e3-03aa27f034b1",
+                             "df80736f-f14d-4b10-b2e3-03aa27f034b2", "df80736f-f14d-4b10-b2e3-03aa27f034b3",
+                             "df80736f-f14d-4b10-b2e3-03aa27f034b4", "df80736f-f14d-4b10-b2e3-03aa27f034b5"]
+
         def putstub(p, **kwargs):
             return "%s+%i" % (hashlib.md5(p).hexdigest(), len(p))
         keep_client1().put.side_effect = putstub
@@ -1614,6 +1619,7 @@ class TestCreateWorkflow(unittest.TestCase):
                          self.existing_workflow_uuid + '\n')
         self.assertEqual(exited, 0)
 
+
     @stubs
     def test_update_name(self, stubs):
         exited = arvados_cwl.main(
index 4715c10a5e27d92d2f59bba9cca220761d20a041..42c7b251f86615538263a13676353dcf01e4ccdf 100644 (file)
             "inputs": [
                 {
                     "default": {
+                        "basename": "blub.txt",
                         "class": "File",
-                        "location": "keep:5d373e7629203ce39e7c22af98a0f881+52/blub.txt"
+                        "location": "keep:5d373e7629203ce39e7c22af98a0f881+52/blub.txt",
+                        "nameext": ".txt",
+                        "nameroot": "blub"
                     },
                     "id": "#submit_tool.cwl/x",
                     "inputBinding": {
@@ -68,7 +71,8 @@
                                 "nameroot": "renamed",
                                 "size": 0
                             }
-                        ]
+                        ],
+                        "location": "_:df80736f-f14d-4b10-b2e3-03aa27f034b2"
                     },
                     "id": "#main/z",
                     "type": "Directory"
index 0b13e3a8192328b069c1057103cfe80f7e025f6a..644f87fd53fc085f477761dd878ea9c5d4a810e8 100644 (file)
             "inputs": [
                 {
                     "default": {
+                        "basename": "blub.txt",
                         "class": "File",
-                        "location": "keep:5d373e7629203ce39e7c22af98a0f881+52/blub.txt"
+                        "location": "keep:5d373e7629203ce39e7c22af98a0f881+52/blub.txt",
+                        "nameext": ".txt",
+                        "nameroot": "blub"
                     },
                     "id": "#submit_tool.cwl/x",
                     "inputBinding": {
@@ -74,7 +77,8 @@
                                 "nameroot": "renamed",
                                 "size": 0
                             }
-                        ]
+                        ],
+                        "location": "_:df80736f-f14d-4b10-b2e3-03aa27f034b2"
                     },
                     "id": "#main/z",
                     "type": "Directory"