Merge branch '18323-cwl-gpu' refs #18323
author Peter Amstutz <peter.amstutz@curii.com>
Tue, 21 Dec 2021 19:26:12 +0000 (14:26 -0500)
committer Peter Amstutz <peter.amstutz@curii.com>
Tue, 21 Dec 2021 19:26:12 +0000 (14:26 -0500)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

doc/user/cwl/cwl-extensions.html.textile.liquid
sdk/cwl/arvados_cwl/__init__.py
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
sdk/cwl/arvados_cwl/arvcontainer.py
sdk/cwl/tests/test_container.py

index 6a70454b10860b3cc0f41aeac03661a5c3f0c140..dcddace14819326459d2efed9a4f9cfcdd0ef74a 100644 (file)
@@ -57,6 +57,12 @@ hints:
     processProperties:
       property1: value1
       property2: $(inputs.value2)
+
+  arv:CUDARequirement:
+    minCUDADriverVersion: "11.0"
+    minCUDAHardwareCapability: "9.0"
+    minDeviceCount: 1
+    maxDeviceCount: 1
 {% endcodeblock %}
 
 h2(#RunInSingleContainer). arv:RunInSingleContainer
@@ -147,6 +153,17 @@ table(table table-bordered table-condensed).
 |_. Field |_. Type |_. Description |
 |processProperties|key-value map, or list of objects with the fields {propertyName, propertyValue}|The properties that will be set on the container request.  May include expressions that reference `$(inputs)` of the current workflow or tool.|
 
+h2(#CUDARequirement). arv:CUDARequirement
+
+Request support for Nvidia CUDA GPU acceleration in the container.  Assumes that the CUDA runtime (SDK) is installed in the container, and that the host will inject the CUDA driver libraries (of a version equal to or later than the one requested) into the container.
+
+table(table table-bordered table-condensed).
+|_. Field |_. Type |_. Description |
+|minCUDADriverVersion|string|Required.  The CUDA SDK version corresponding to the minimum driver version supported by the container (generally, the SDK version 'X.Y' the application was compiled against).|
+|minCUDAHardwareCapability|string|Required.  The minimum CUDA hardware capability (in 'X.Y' format) required by the application's PTX or C++ GPU code (will be JIT compiled for the available hardware).|
+|minDeviceCount|integer|Minimum number of GPU devices to allocate on a single node. Optional.  If not specified, defaults to 1.|
+|maxDeviceCount|integer|Maximum number of GPU devices to allocate on a single node. Optional.  If not specified, same as @minDeviceCount@.|
+
 h2. arv:dockerCollectionPDH
 
 This is an optional extension field appearing on the standard @DockerRequirement@.  It specifies the portable data hash of the Arvados collection containing the Docker image.  If present, it takes precedence over @dockerPull@ or @dockerImageId@.
index 71ef742e314633bab08b0f493f766227b19b7849..df5866d67db3132dd5551ada3ddb101a2bd95a3c 100644 (file)
@@ -250,7 +250,8 @@ def add_arv_hints():
         "http://arvados.org/cwl#ReuseRequirement",
         "http://arvados.org/cwl#ClusterTarget",
         "http://arvados.org/cwl#OutputStorageClass",
-        "http://arvados.org/cwl#ProcessProperties"
+        "http://arvados.org/cwl#ProcessProperties",
+        "http://arvados.org/cwl#CUDARequirement"
     ])
 
 def exit_signal_handler(sigcode, frame):
index bc5aeaf7970be0f2396f46d9d6aff8f4ea4be372..6f5e7980dbb7576918b97e736abbdf4240e91748 100644 (file)
@@ -329,3 +329,31 @@ $graph:
       jsonldPredicate:
         mapSubject: propertyName
         mapPredicate: propertyValue
+
+
+- name: CUDARequirement
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Require support for Nvidia CUDA (GPU hardware acceleration).
+  fields:
+    class:
+      type: string
+      doc: 'arv:CUDARequirement'
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    minCUDADriverVersion:
+      type: string
+      doc: Minimum CUDA driver version to run the software, in X.Y format of the associated CUDA SDK release.
+    minCUDAHardwareCapability:
+      type: string
+      doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
+    minDeviceCount:
+      type: int?
+      default: 1
+      doc: Minimum number of GPU devices to request.  If not specified, defaults to 1.
+    maxDeviceCount:
+      type: int?
+      doc: Maximum number of GPU devices to request.  If not specified, same as `minDeviceCount`.
index a60ead113c179f1285cab2360550e78840f28aac..5dbb838f1a2e4a81bd9ad196a5e203743cdc3ef7 100644 (file)
@@ -272,3 +272,31 @@ $graph:
       jsonldPredicate:
         mapSubject: propertyName
         mapPredicate: propertyValue
+
+
+- name: CUDARequirement
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Require support for Nvidia CUDA (GPU hardware acceleration).
+  fields:
+    class:
+      type: string
+      doc: 'arv:CUDARequirement'
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    minCUDADriverVersion:
+      type: string
+      doc: Minimum CUDA driver version to run the software, in X.Y format of the associated CUDA SDK release.
+    minCUDAHardwareCapability:
+      type: string
+      doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
+    minDeviceCount:
+      type: int?
+      default: 1
+      doc: Minimum number of GPU devices to request.  If not specified, defaults to 1.
+    maxDeviceCount:
+      type: int?
+      doc: Maximum number of GPU devices to request.  If not specified, same as `minDeviceCount`.
index a34ef3342acca9c0310d513b1f4e782942666350..5fbe5bd7f7c5327e41078014ed77b30729b84b8f 100644 (file)
@@ -274,3 +274,31 @@ $graph:
       jsonldPredicate:
         mapSubject: propertyName
         mapPredicate: propertyValue
+
+
+- name: CUDARequirement
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Require support for Nvidia CUDA (GPU hardware acceleration).
+  fields:
+    class:
+      type: string
+      doc: 'arv:CUDARequirement'
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    minCUDADriverVersion:
+      type: string
+      doc: Minimum CUDA driver version to run the software, in X.Y format of the associated CUDA SDK release.
+    minCUDAHardwareCapability:
+      type: string
+      doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
+    minDeviceCount:
+      type: int?
+      default: 1
+      doc: Minimum number of GPU devices to request.  If not specified, defaults to 1.
+    maxDeviceCount:
+      type: int?
+      doc: Maximum number of GPU devices to request.  If not specified, same as `minDeviceCount`.
index ae3c6688955301141f0af3405787c9f831fb58b7..6372caaa38b182db35dc927af5d7b5e04a1ed93d 100644 (file)
@@ -291,6 +291,14 @@ class ArvadosContainer(JobBase):
             else:
                 container_request["output_storage_classes"] = runtimeContext.intermediate_storage_classes.strip().split(",")
 
+        cuda_req, _ = self.get_requirement("http://arvados.org/cwl#CUDARequirement")
+        if cuda_req:
+            runtime_constraints["cuda"] = {
+                "device_count": cuda_req.get("minDeviceCount", 1),
+                "driver_version": cuda_req["minCUDADriverVersion"],
+                "hardware_capability": cuda_req["minCUDAHardwareCapability"]
+            }
+
         if self.timelimit is not None and self.timelimit > 0:
             scheduling_parameters["max_run_time"] = self.timelimit
 
index 1a2bd112f37d15822fa9d3edc25107146f352825..e97572cd93aa4cc2721772b53b2759d028ccc13d 100644 (file)
@@ -18,6 +18,7 @@ import os
 import functools
 import cwltool.process
 import cwltool.secrets
+import cwltool.load_tool
 from cwltool.update import INTERNAL_VERSION
 from schema_salad.ref_resolver import Loader
 from schema_salad.sourceline import cmap
@@ -66,12 +67,16 @@ class TestContainer(unittest.TestCase):
 
         make_fs_access=functools.partial(arvados_cwl.CollectionFsAccess,
                                          collection_cache=arvados_cwl.CollectionCache(runner.api, None, 0))
+        fs_access = mock.MagicMock()
+        fs_access.exists.return_value = True
+
         loadingContext = arvados_cwl.context.ArvLoadingContext(
             {"avsc_names": avsc_names,
              "basedir": "",
              "make_fs_access": make_fs_access,
-             "loader": Loader({}),
-             "metadata": {"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"}})
+             "construct_tool_object": runner.arv_make_tool,
+             "fetcher_constructor": functools.partial(arvados_cwl.CollectionFetcher, api_client=runner.api, fs_access=fs_access)
+             })
         runtimeContext = arvados_cwl.context.ArvRuntimeContext(
             {"work_api": "containers",
              "basedir": "",
@@ -83,6 +88,11 @@ class TestContainer(unittest.TestCase):
              "project_uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz"
             })
 
+        if isinstance(runner, mock.MagicMock):
+            def make_tool(toolpath_object, loadingContext):
+                return arvados_cwl.ArvadosCommandTool(runner, toolpath_object, loadingContext)
+            runner.arv_make_tool.side_effect = make_tool
+
         return loadingContext, runtimeContext
 
     # Helper function to set up the ArvCwlExecutor to use the containers api
@@ -123,13 +133,14 @@ class TestContainer(unittest.TestCase):
                 "outputs": [],
                 "baseCommand": "ls",
                 "arguments": [{"valueFrom": "$(runtime.outdir)"}],
-                "id": "#",
-                "class": "org.w3id.cwl.cwl.CommandLineTool"
+                "id": "",
+                "class": "CommandLineTool",
+                "cwlVersion": "v1.2"
             })
 
             loadingContext, runtimeContext = self.helper(runner, enable_reuse)
 
-            arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, loadingContext)
+            arvtool = cwltool.load_tool.load_tool(tool, loadingContext)
             arvtool.formatgraph = None
 
             for j in arvtool.job({}, mock.MagicMock(), runtimeContext):
@@ -143,7 +154,7 @@ class TestContainer(unittest.TestCase):
                         'name': 'test_run_'+str(enable_reuse),
                         'runtime_constraints': {
                             'vcpus': 1,
-                            'ram': 1073741824
+                            'ram': 268435456
                         },
                         'use_existing': enable_reuse,
                         'priority': 500,
@@ -172,6 +183,7 @@ class TestContainer(unittest.TestCase):
     # For the remaining fields, the defaults will apply: {'cores': 1, 'ram': 1024, 'outdirSize': 1024, 'tmpdirSize': 1024}
     @mock.patch("arvados.commands.keepdocker.list_images_in_arv")
     def test_resource_requirements(self, keepdocker):
+        arvados_cwl.add_arv_hints()
         runner = mock.MagicMock()
         runner.ignore_docker_for_reuse = False
         runner.intermediate_output_ttl = 3600
@@ -203,18 +215,19 @@ class TestContainer(unittest.TestCase):
                 "class": "http://arvados.org/cwl#IntermediateOutput",
                 "outputTTL": 7200
             }, {
-                "class": "http://arvados.org/cwl#ReuseRequirement",
+                "class": "WorkReuse",
                 "enableReuse": False
             }],
             "baseCommand": "ls",
-            "id": "#",
-            "class": "org.w3id.cwl.cwl.CommandLineTool"
+            "id": "",
+            "class": "CommandLineTool",
+            "cwlVersion": "v1.2"
         })
 
         loadingContext, runtimeContext = self.helper(runner)
         runtimeContext.name = "test_resource_requirements"
 
-        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, loadingContext)
+        arvtool = cwltool.load_tool.load_tool(tool, loadingContext)
         arvtool.formatgraph = None
         for j in arvtool.job({}, mock.MagicMock(), runtimeContext):
             j.run(runtimeContext)
@@ -316,14 +329,16 @@ class TestContainer(unittest.TestCase):
                 }                        ]
             }],
             "baseCommand": "ls",
-            "id": "#",
-            "class": "org.w3id.cwl.cwl.CommandLineTool"
+            "class": "CommandLineTool",
+            "cwlVersion": "v1.2",
+            "id": ""
         })
 
         loadingContext, runtimeContext = self.helper(runner)
         runtimeContext.name = "test_initial_work_dir"
 
-        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, loadingContext)
+        arvtool = cwltool.load_tool.load_tool(tool, loadingContext)
+
         arvtool.formatgraph = None
         for j in arvtool.job({}, mock.MagicMock(), runtimeContext):
             j.run(runtimeContext)
@@ -343,7 +358,7 @@ class TestContainer(unittest.TestCase):
             'name': 'test_initial_work_dir',
             'runtime_constraints': {
                 'vcpus': 1,
-                'ram': 1073741824
+                'ram': 268435456
             },
             'use_existing': True,
             'priority': 500,
@@ -417,14 +432,15 @@ class TestContainer(unittest.TestCase):
             "stderr": "stderr.txt",
             "stdin": "/keep/99999999999999999999999999999996+99/file.txt",
             "arguments": [{"valueFrom": "$(runtime.outdir)"}],
-            "id": "#",
-            "class": "org.w3id.cwl.cwl.CommandLineTool"
+            "id": "",
+            "class": "CommandLineTool",
+            "cwlVersion": "v1.2"
         })
 
         loadingContext, runtimeContext = self.helper(runner)
         runtimeContext.name = "test_run_redirect"
 
-        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, loadingContext)
+        arvtool = cwltool.load_tool.load_tool(tool, loadingContext)
         arvtool.formatgraph = None
         for j in arvtool.job({}, mock.MagicMock(), runtimeContext):
             j.run(runtimeContext)
@@ -437,7 +453,7 @@ class TestContainer(unittest.TestCase):
                     'name': 'test_run_redirect',
                     'runtime_constraints': {
                         'vcpus': 1,
-                        'ram': 1073741824
+                        'ram': 268435456
                     },
                     'use_existing': True,
                     'priority': 500,
@@ -643,14 +659,15 @@ class TestContainer(unittest.TestCase):
             "outputs": [],
             "baseCommand": "ls",
             "arguments": [{"valueFrom": "$(runtime.outdir)"}],
-            "id": "#",
-            "class": "org.w3id.cwl.cwl.CommandLineTool"
+            "id": "",
+            "class": "CommandLineTool",
+            "cwlVersion": "v1.2"
         })
 
         loadingContext, runtimeContext = self.helper(runner)
         runtimeContext.name = "test_run_mounts"
 
-        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, loadingContext)
+        arvtool = cwltool.load_tool.load_tool(tool, loadingContext)
         arvtool.formatgraph = None
         job_order = {
             "p1": {
@@ -680,7 +697,7 @@ class TestContainer(unittest.TestCase):
                     'name': 'test_run_mounts',
                     'runtime_constraints': {
                         'vcpus': 1,
-                        'ram': 1073741824
+                        'ram': 268435456
                     },
                     'use_existing': True,
                     'priority': 500,
@@ -713,6 +730,7 @@ class TestContainer(unittest.TestCase):
     # Hence the default resources will apply: {'cores': 1, 'ram': 1024, 'outdirSize': 1024, 'tmpdirSize': 1024}
     @mock.patch("arvados.commands.keepdocker.list_images_in_arv")
     def test_secrets(self, keepdocker):
+        arvados_cwl.add_arv_hints()
         runner = mock.MagicMock()
         runner.ignore_docker_for_reuse = False
         runner.intermediate_output_ttl = 0
@@ -726,7 +744,8 @@ class TestContainer(unittest.TestCase):
         document_loader, avsc_names, schema_metadata, metaschema_loader = cwltool.process.get_schema("v1.1")
 
         tool = cmap({"arguments": ["md5sum", "example.conf"],
-                     "class": "org.w3id.cwl.cwl.CommandLineTool",
+                     "class": "CommandLineTool",
+                     "cwlVersion": "v1.2",
                      "hints": [
                          {
                              "class": "http://commonwl.org/cwltool#Secrets",
@@ -735,7 +754,7 @@ class TestContainer(unittest.TestCase):
                              ]
                          }
                      ],
-                     "id": "#secret_job.cwl",
+                     "id": "",
                      "inputs": [
                          {
                              "id": "#secret_job.cwl/pw",
@@ -759,7 +778,7 @@ class TestContainer(unittest.TestCase):
         loadingContext, runtimeContext = self.helper(runner)
         runtimeContext.name = "test_secrets"
 
-        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, loadingContext)
+        arvtool = cwltool.load_tool.load_tool(tool, loadingContext)
         arvtool.formatgraph = None
 
         job_order = {"pw": "blorp"}
@@ -776,7 +795,7 @@ class TestContainer(unittest.TestCase):
                     'name': 'test_secrets',
                     'runtime_constraints': {
                         'vcpus': 1,
-                        'ram': 1073741824
+                        'ram': 268435456
                     },
                     'use_existing': True,
                     'priority': 500,
@@ -825,8 +844,9 @@ class TestContainer(unittest.TestCase):
             "outputs": [],
             "baseCommand": "ls",
             "arguments": [{"valueFrom": "$(runtime.outdir)"}],
-            "id": "#",
-            "class": "org.w3id.cwl.cwl.CommandLineTool",
+            "id": "",
+            "cwlVersion": "v1.2",
+            "class": "CommandLineTool",
             "hints": [
                 {
                     "class": "ToolTimeLimit",
@@ -838,7 +858,7 @@ class TestContainer(unittest.TestCase):
         loadingContext, runtimeContext = self.helper(runner)
         runtimeContext.name = "test_timelimit"
 
-        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, loadingContext)
+        arvtool = cwltool.load_tool.load_tool(tool, loadingContext)
         arvtool.formatgraph = None
 
         for j in arvtool.job({}, mock.MagicMock(), runtimeContext):
@@ -869,8 +889,9 @@ class TestContainer(unittest.TestCase):
             "outputs": [],
             "baseCommand": "ls",
             "arguments": [{"valueFrom": "$(runtime.outdir)"}],
-            "id": "#",
-            "class": "org.w3id.cwl.cwl.CommandLineTool",
+            "id": "",
+            "cwlVersion": "v1.2",
+            "class": "CommandLineTool",
             "hints": [
                 {
                     "class": "http://arvados.org/cwl#OutputStorageClass",
@@ -882,7 +903,7 @@ class TestContainer(unittest.TestCase):
 
         loadingContext, runtimeContext = self.helper(runner, True)
 
-        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, loadingContext)
+        arvtool = cwltool.load_tool.load_tool(tool, loadingContext)
         arvtool.formatgraph = None
 
         for j in arvtool.job({}, mock.MagicMock(), runtimeContext):
@@ -896,7 +917,7 @@ class TestContainer(unittest.TestCase):
                     'name': 'test_run_True',
                     'runtime_constraints': {
                         'vcpus': 1,
-                        'ram': 1073741824
+                        'ram': 268435456
                     },
                     'use_existing': True,
                     'priority': 500,
@@ -944,8 +965,9 @@ class TestContainer(unittest.TestCase):
             "outputs": [],
             "baseCommand": "ls",
             "arguments": [{"valueFrom": "$(runtime.outdir)"}],
-            "id": "#",
-            "class": "org.w3id.cwl.cwl.CommandLineTool",
+            "id": "",
+            "class": "CommandLineTool",
+            "cwlVersion": "v1.2",
             "hints": [
             {
                 "class": "http://arvados.org/cwl#ProcessProperties",
@@ -967,7 +989,7 @@ class TestContainer(unittest.TestCase):
 
         loadingContext, runtimeContext = self.helper(runner, True)
 
-        arvtool = arvados_cwl.ArvadosCommandTool(runner, tool, loadingContext)
+        arvtool = cwltool.load_tool.load_tool(tool, loadingContext)
         arvtool.formatgraph = None
 
         for j in arvtool.job({"x": "blorp"}, mock.MagicMock(), runtimeContext):
@@ -981,7 +1003,7 @@ class TestContainer(unittest.TestCase):
                     'name': 'test_run_True',
                     'runtime_constraints': {
                         'vcpus': 1,
-                        'ram': 1073741824
+                        'ram': 268435456
                     },
                     'use_existing': True,
                     'priority': 500,
@@ -1014,6 +1036,87 @@ class TestContainer(unittest.TestCase):
                 }))
 
 
+    # The test passes no builder.resources
+    # Hence the default resources will apply: {'cores': 1, 'ram': 256, 'outdirSize': 1024, 'tmpdirSize': 1024}
+    @mock.patch("arvados.commands.keepdocker.list_images_in_arv")
+    def test_cuda_requirement(self, keepdocker):
+        arvados_cwl.add_arv_hints()
+        arv_docker_clear_cache()
+
+        runner = mock.MagicMock()
+        runner.ignore_docker_for_reuse = False
+        runner.intermediate_output_ttl = 0
+        runner.secret_store = cwltool.secrets.SecretStore()
+        runner.api._rootDesc = {"revision": "20210628"}
+
+        keepdocker.return_value = [("zzzzz-4zz18-zzzzzzzzzzzzzz3", "")]
+        runner.api.collections().get().execute.return_value = {
+            "portable_data_hash": "99999999999999999999999999999993+99"}
+
+        tool = cmap({
+            "inputs": [],
+            "outputs": [],
+            "baseCommand": "nvidia-smi",
+            "arguments": [],
+            "id": "",
+            "cwlVersion": "v1.2",
+            "class": "CommandLineTool",
+            "hints": [
+            {
+                "class": "http://arvados.org/cwl#CUDARequirement",
+                "minCUDADriverVersion": "11.0",
+                "minCUDAHardwareCapability": "9.0",
+            }
+        ]
+        })
+
+        loadingContext, runtimeContext = self.helper(runner, True)
+
+        arvtool = cwltool.load_tool.load_tool(tool, loadingContext)
+        arvtool.formatgraph = None
+
+        for j in arvtool.job({}, mock.MagicMock(), runtimeContext):
+            j.run(runtimeContext)
+            runner.api.container_requests().create.assert_called_with(
+                body=JsonDiffMatcher({
+                    'environment': {
+                        'HOME': '/var/spool/cwl',
+                        'TMPDIR': '/tmp'
+                    },
+                    'name': 'test_run_True',
+                    'runtime_constraints': {
+                        'vcpus': 1,
+                        'ram': 268435456,
+                        'cuda': {
+                            'device_count': 1,
+                            'driver_version': "11.0",
+                            'hardware_capability': "9.0"
+                        }
+                    },
+                    'use_existing': True,
+                    'priority': 500,
+                    'mounts': {
+                        '/tmp': {'kind': 'tmp',
+                                 "capacity": 1073741824
+                             },
+                        '/var/spool/cwl': {'kind': 'tmp',
+                                           "capacity": 1073741824 }
+                    },
+                    'state': 'Committed',
+                    'output_name': 'Output for step test_run_True',
+                    'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
+                    'output_path': '/var/spool/cwl',
+                    'output_ttl': 0,
+                    'container_image': '99999999999999999999999999999993+99',
+                    'command': ['nvidia-smi'],
+                    'cwd': '/var/spool/cwl',
+                    'scheduling_parameters': {},
+                    'properties': {},
+                    'secret_mounts': {},
+                    'output_storage_classes': ["default"]
+                }))
+
+
 class TestWorkflow(unittest.TestCase):
     def setUp(self):
         cwltool.process._names = set()