18180: Support for requesting preemptible instances in CWL
authorPeter Amstutz <peter.amstutz@curii.com>
Thu, 17 Mar 2022 18:26:07 +0000 (14:26 -0400)
committerPeter Amstutz <peter.amstutz@curii.com>
Fri, 18 Mar 2022 19:58:47 +0000 (15:58 -0400)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

doc/user/cwl/cwl-extensions.html.textile.liquid
lib/config/config.default.yml
sdk/cwl/arvados_cwl/__init__.py
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml
sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml
sdk/cwl/arvados_cwl/arvcontainer.py
sdk/cwl/arvados_cwl/context.py
sdk/cwl/arvados_cwl/runner.py
sdk/cwl/tests/test_container.py
sdk/cwl/tests/test_submit.py

index dd78e989fd52afe4ddd24940a00d76634f546a2d..d6148d7eee1f3d2992a9d43bcb9c2ca44b3b68b5 100644 (file)
@@ -63,6 +63,9 @@ hints:
     cudaComputeCapabilityMin: "9.0"
     deviceCountMin: 1
     deviceCountMax: 1
+
+  arv:UsePreemptible:
+    usePreemptible: true
 {% endcodeblock %}
 
 h2(#RunInSingleContainer). arv:RunInSingleContainer
@@ -164,6 +167,14 @@ table(table table-bordered table-condensed).
 |deviceCountMin|integer|Minimum number of GPU devices to allocate on a single node. Required.|
 |deviceCountMax|integer|Maximum number of GPU devices to allocate on a single node. Optional.  If not specified, same as @deviceCountMin@.|
 
+h2(#UsePreemptible). arv:UsePreemptible
+
+Specify whether a workflow step should request preemptible (e.g. AWS Spot market) instances.  Such instances are generally cheaper, but can be taken back by the cloud provider at any time (preempted), causing the step to fail.  When this happens, Arvados will automatically retry the step, up to the number of times given by the configuration value @Containers.MaxRetryAttempts@ (default 3).
+
+table(table table-bordered table-condensed).
+|_. Field |_. Type |_. Description |
+|usePreemptible|boolean|Required.  Set to true to opt in to using preemptible instances, or false to opt out.|
+
 h2. arv:dockerCollectionPDH
 
 This is an optional extension field appearing on the standard @DockerRequirement@.  It specifies the portable data hash of the Arvados collection containing the Docker image.  If present, it takes precedence over @dockerPull@ or @dockerImageId@.
index 9800be70473fc8bf20f56ae5013bac099c80d9a5..0a8f55244b905cb48a56a4b994733534e8a5fd53 100644 (file)
@@ -903,11 +903,6 @@ Clusters:
       # If false, containers are scheduled on preemptible instances
       # only when requested by the submitter.
       #
-      # Note that arvados-cwl-runner does not currently offer a
-      # feature to request preemptible instances, so this value
-      # effectively acts as a cluster-wide decision about whether to
-      # use preemptible instances.
-      #
       # This flag is ignored if no preemptible instance types are
       # configured, and has no effect on top-level containers.
       AlwaysUsePreemptibleInstances: true
index 826467cc09397342c8d0fa32bfe3b4ed8dd10124..c73b358eccfb19211ce5a077d56ac995d30a40c0 100644 (file)
@@ -213,6 +213,10 @@ def arg_parser():  # type: () -> argparse.ArgumentParser
     parser.add_argument("--http-timeout", type=int,
                         default=5*60, dest="http_timeout", help="API request timeout in seconds. Default is 300 seconds (5 minutes).")
 
+    exgroup = parser.add_mutually_exclusive_group()
+    exgroup.add_argument("--enable-preemptible", dest="enable_preemptible", default=None, action="store_true", help="Use preemptible instances. Control individual steps with arv:UsePreemptible hint.")
+    exgroup.add_argument("--disable-preemptible", dest="enable_preemptible", default=None, action="store_false", help="Don't use preemptible instances.")
+
     parser.add_argument(
         "--skip-schemas",
         action="store_true",
@@ -255,7 +259,8 @@ def add_arv_hints():
         "http://arvados.org/cwl#ClusterTarget",
         "http://arvados.org/cwl#OutputStorageClass",
         "http://arvados.org/cwl#ProcessProperties",
-        "http://commonwl.org/cwltool#CUDARequirement"
+        "http://commonwl.org/cwltool#CUDARequirement",
+        "http://arvados.org/cwl#UsePreemptible",
     ])
 
 def exit_signal_handler(sigcode, frame):
index 6e2d4f1d92ab9471dd1bcc441b360eed12cc6a2c..443a027aea931090ef741f27072e5fb53973350c 100644 (file)
@@ -385,3 +385,18 @@ $graph:
       doc: |
         Maximum number of GPU devices to request.  If not specified,
         same as `deviceCountMin`.
+
+- name: UsePreemptible
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Specify whether a workflow step should opt in to or opt out of using preemptible (spot) instances.
+  fields:
+    class:
+      type: string
+      doc: "Always 'arv:UsePreemptible'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    usePreemptible: boolean
index 0e81347d72869253c9bdadae70ae93f498cb7d25..633edaad2349e505e2e2f5f5bc70c7ad5663b032 100644 (file)
@@ -328,3 +328,18 @@ $graph:
       doc: |
         Maximum number of GPU devices to request.  If not specified,
         same as `deviceCountMin`.
+
+- name: UsePreemptible
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Specify whether a workflow step should opt in to or opt out of using preemptible (spot) instances.
+  fields:
+    class:
+      type: string
+      doc: "Always 'arv:UsePreemptible'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    usePreemptible: boolean
index e9f70bf1cf63616408e78f7d4278ae6b3e4e4c9f..8ca064a6e57f07a559d064807c550ca4a1dff19d 100644 (file)
@@ -330,3 +330,18 @@ $graph:
       doc: |
         Maximum number of GPU devices to request.  If not specified,
         same as `deviceCountMin`.
+
+- name: UsePreemptible
+  type: record
+  extends: cwl:ProcessRequirement
+  inVocab: false
+  doc: |
+    Specify whether a workflow step should opt in to or opt out of using preemptible (spot) instances.
+  fields:
+    class:
+      type: string
+      doc: "Always 'arv:UsePreemptible'"
+      jsonldPredicate:
+        _id: "@type"
+        _type: "@vocab"
+    usePreemptible: boolean
index 2a5ff3a13a834861c846769af9407b8b1de10c2a..8c468dd22d09046bdff1b1f2152197ebdbe5c3ed 100644 (file)
@@ -300,6 +300,17 @@ class ArvadosContainer(JobBase):
                 "hardware_capability": aslist(cuda_req["cudaComputeCapability"])[0]
             }
 
+        if runtimeContext.enable_preemptible is False:
+            scheduling_parameters["preemptible"] = False
+        else:
+            preemptible_req, _ = self.get_requirement("http://arvados.org/cwl#UsePreemptible")
+            if preemptible_req:
+                scheduling_parameters["preemptible"] = preemptible_req["usePreemptible"]
+            elif runtimeContext.enable_preemptible is True:
+                scheduling_parameters["preemptible"] = True
+            elif runtimeContext.enable_preemptible is None:
+                pass
+
         if self.timelimit is not None and self.timelimit > 0:
             scheduling_parameters["max_run_time"] = self.timelimit
 
@@ -550,6 +561,12 @@ class RunnerContainer(Runner):
         if self.enable_dev:
             command.append("--enable-dev")
 
+        if runtimeContext.enable_preemptible is True:
+            command.append("--enable-preemptible")
+
+        if runtimeContext.enable_preemptible is False:
+            command.append("--disable-preemptible")
+
         command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"])
 
         container_req["command"] = command
index 4239dd3b51fea8e51d3cdaf33116595cc2748f34..316250106b09cdd248d0ddf7b292cbfc1881a700 100644 (file)
@@ -37,6 +37,7 @@ class ArvRuntimeContext(RuntimeContext):
         self.always_submit_runner = False
         self.collection_cache_size = 256
         self.match_local_docker = False
+        self.enable_preemptible = None
 
         super(ArvRuntimeContext, self).__init__(kwargs)
 
index ad17950a2fb6bab90376ecb0fe688af7a80f2b94..38e2c4d806f36ed80f9dac70da8cf37006f3de19 100644 (file)
@@ -40,7 +40,7 @@ import schema_salad.validate as validate
 
 import arvados.collection
 from .util import collectionUUID
-import ruamel.yaml as yaml
+from ruamel.yaml import YAML
 from ruamel.yaml.comments import CommentedMap, CommentedSeq
 
 import arvados_cwl.arvdocker
@@ -265,7 +265,8 @@ def upload_dependencies(arvrunner, name, document_loader,
                 textIO = StringIO(text.decode('utf-8'))
             else:
                 textIO = StringIO(text)
-            return yaml.safe_load(textIO)
+            yamlloader = YAML(typ='safe', pure=True)
+            return yamlloader.load(textIO)
         else:
             return {}
 
index 3de90c8d8810766399b08934c5c2db312926408d..798c5af289322ce2ae23edc26ca7d8a863d50186 100644 (file)
@@ -1234,6 +1234,103 @@ class TestContainer(unittest.TestCase):
                 body=JsonDiffMatcher(container_request))
 
 
+    # The test passes no builder.resources
+    # Hence the default resources will apply: {'cores': 1, 'ram': 1024, 'outdirSize': 1024, 'tmpdirSize': 1024}
+    @mock.patch("arvados.commands.keepdocker.list_images_in_arv")
+    def test_run_preemptible_hint(self, keepdocker):
+        arvados_cwl.add_arv_hints()
+        for enable_preemptible in (None, True, False):
+            for preemptible_hint in (None, True, False):
+                arv_docker_clear_cache()
+
+                runner = mock.MagicMock()
+                runner.ignore_docker_for_reuse = False
+                runner.intermediate_output_ttl = 0
+                runner.secret_store = cwltool.secrets.SecretStore()
+                runner.api._rootDesc = {"revision": "20210628"}
+
+                keepdocker.return_value = [("zzzzz-4zz18-zzzzzzzzzzzzzz3", "")]
+                runner.api.collections().get().execute.return_value = {
+                    "portable_data_hash": "99999999999999999999999999999993+99"}
+
+                if preemptible_hint is not None:
+                    hints = [{
+                        "class": "http://arvados.org/cwl#UsePreemptible",
+                        "usePreemptible": preemptible_hint
+                    }]
+                else:
+                    hints = []
+
+                tool = cmap({
+                    "inputs": [],
+                    "outputs": [],
+                    "baseCommand": "ls",
+                    "arguments": [{"valueFrom": "$(runtime.outdir)"}],
+                    "id": "",
+                    "class": "CommandLineTool",
+                    "cwlVersion": "v1.2",
+                    "hints": hints
+                })
+
+                loadingContext, runtimeContext = self.helper(runner)
+
+                runtimeContext.name = 'test_run_enable_preemptible_'+str(enable_preemptible)+str(preemptible_hint)
+                runtimeContext.enable_preemptible = enable_preemptible
+
+                arvtool = cwltool.load_tool.load_tool(tool, loadingContext)
+                arvtool.formatgraph = None
+
+                # Test the interactions between --enable/disable-preemptible
+                # and UsePreemptible hint
+
+                if enable_preemptible is None:
+                    if preemptible_hint is None:
+                        sched = {}
+                    else:
+                        sched = {'preemptible': preemptible_hint}
+                else:
+                    if preemptible_hint is None:
+                        sched = {'preemptible': enable_preemptible}
+                    else:
+                        sched = {'preemptible': enable_preemptible and preemptible_hint}
+
+                for j in arvtool.job({}, mock.MagicMock(), runtimeContext):
+                    j.run(runtimeContext)
+                    runner.api.container_requests().create.assert_called_with(
+                        body=JsonDiffMatcher({
+                            'environment': {
+                                'HOME': '/var/spool/cwl',
+                                'TMPDIR': '/tmp'
+                            },
+                            'name': runtimeContext.name,
+                            'runtime_constraints': {
+                                'vcpus': 1,
+                                'ram': 268435456
+                            },
+                            'use_existing': True,
+                            'priority': 500,
+                            'mounts': {
+                                '/tmp': {'kind': 'tmp',
+                                         "capacity": 1073741824
+                                     },
+                                '/var/spool/cwl': {'kind': 'tmp',
+                                                   "capacity": 1073741824 }
+                            },
+                            'state': 'Committed',
+                            'output_name': 'Output for step '+runtimeContext.name,
+                            'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz',
+                            'output_path': '/var/spool/cwl',
+                            'output_ttl': 0,
+                            'container_image': '99999999999999999999999999999993+99',
+                            'command': ['ls', '/var/spool/cwl'],
+                            'cwd': '/var/spool/cwl',
+                            'scheduling_parameters': sched,
+                            'properties': {},
+                            'secret_mounts': {},
+                            'output_storage_classes': ["default"]
+                        }))
+
+
 
 class TestWorkflow(unittest.TestCase):
     def setUp(self):
index 10443359b99bf02a51163d8eb38924579d14154a..61892bf2a447395708359a7da7e3bf4798603626 100644 (file)
@@ -1468,6 +1468,49 @@ class TestSubmit(unittest.TestCase):
         self.assertEqual(exited, 0)
 
 
+    @stubs
+    def test_submit_enable_preemptible(self, stubs):
+        exited = arvados_cwl.main(
+            ["--submit", "--no-wait", "--api=containers", "--debug", "--enable-preemptible",
+                "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
+            stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
+
+        expect_container = copy.deepcopy(stubs.expect_container_spec)
+        expect_container['command'] = ['arvados-cwl-runner', '--local', '--api=containers',
+                        '--no-log-timestamps', '--disable-validate', '--disable-color',
+                        '--eval-timeout=20', '--thread-count=0',
+                        '--enable-reuse', "--collection-cache-size=256", '--debug', '--on-error=continue',
+                                       '--enable-preemptible',
+                        '/var/lib/cwl/workflow.json#main', '/var/lib/cwl/cwl.input.json']
+
+        stubs.api.container_requests().create.assert_called_with(
+            body=JsonDiffMatcher(expect_container))
+        self.assertEqual(stubs.capture_stdout.getvalue(),
+                         stubs.expect_container_request_uuid + '\n')
+        self.assertEqual(exited, 0)
+
+    @stubs
+    def test_submit_disable_preemptible(self, stubs):
+        exited = arvados_cwl.main(
+            ["--submit", "--no-wait", "--api=containers", "--debug", "--disable-preemptible",
+                "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"],
+            stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client)
+
+        expect_container = copy.deepcopy(stubs.expect_container_spec)
+        expect_container['command'] = ['arvados-cwl-runner', '--local', '--api=containers',
+                        '--no-log-timestamps', '--disable-validate', '--disable-color',
+                        '--eval-timeout=20', '--thread-count=0',
+                        '--enable-reuse', "--collection-cache-size=256", '--debug', '--on-error=continue',
+                                       '--disable-preemptible',
+                        '/var/lib/cwl/workflow.json#main', '/var/lib/cwl/cwl.input.json']
+
+        stubs.api.container_requests().create.assert_called_with(
+            body=JsonDiffMatcher(expect_container))
+        self.assertEqual(stubs.capture_stdout.getvalue(),
+                         stubs.expect_container_request_uuid + '\n')
+        self.assertEqual(exited, 0)
+
+
 class TestCreateWorkflow(unittest.TestCase):
     existing_workflow_uuid = "zzzzz-7fd4e-validworkfloyml"
     expect_workflow = StripYAMLComments(