From d7e80d62a0e1c8587c65975c8bb020200cd0d7d6 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 17 Mar 2022 14:26:07 -0400 Subject: [PATCH] 18180: Support for requesting preemptible instances in CWL Arvados-DCO-1.1-Signed-off-by: Peter Amstutz --- .../cwl/cwl-extensions.html.textile.liquid | 11 +++ lib/config/config.default.yml | 5 - sdk/cwl/arvados_cwl/__init__.py | 7 +- sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml | 15 +++ sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml | 15 +++ sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml | 15 +++ sdk/cwl/arvados_cwl/arvcontainer.py | 17 ++++ sdk/cwl/arvados_cwl/context.py | 1 + sdk/cwl/arvados_cwl/runner.py | 5 +- sdk/cwl/tests/test_container.py | 97 +++++++++++++++++++ sdk/cwl/tests/test_submit.py | 43 ++++++++ 11 files changed, 223 insertions(+), 8 deletions(-) diff --git a/doc/user/cwl/cwl-extensions.html.textile.liquid b/doc/user/cwl/cwl-extensions.html.textile.liquid index dd78e989fd..d6148d7eee 100644 --- a/doc/user/cwl/cwl-extensions.html.textile.liquid +++ b/doc/user/cwl/cwl-extensions.html.textile.liquid @@ -63,6 +63,9 @@ hints: cudaComputeCapabilityMin: "9.0" deviceCountMin: 1 deviceCountMax: 1 + + arv:UsePreemptible: + usePreemptible: true {% endcodeblock %} h2(#RunInSingleContainer). arv:RunInSingleContainer @@ -164,6 +167,14 @@ table(table table-bordered table-condensed). |deviceCountMin|integer|Minimum number of GPU devices to allocate on a single node. Required.| |deviceCountMax|integer|Maximum number of GPU devices to allocate on a single node. Optional. If not specified, same as @minDeviceCount@.| +h2(#UsePreemptible). arv:UsePreemptible + +Specify whether a workflow step should request preemptible (e.g. AWS Spot market) instances. Such instances are generally cheaper, but can be taken back by the cloud provider at any time (preempted) causing the step to fail. When this happens, Arvados will automatically re-try the step, up to the configuration value of @Containers.MaxRetryAttempts@ (default 3) times. + +table(table table-bordered table-condensed). +|_. Field |_. Type |_. Description | +|usePreemptible|boolean|Required, true to opt-in to using preemptible instances, false to opt-out.| + h2. arv:dockerCollectionPDH This is an optional extension field appearing on the standard @DockerRequirement@. It specifies the portable data hash of the Arvados collection containing the Docker image. If present, it takes precedence over @dockerPull@ or @dockerImageId@. diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml index 9800be7047..0a8f55244b 100644 --- a/lib/config/config.default.yml +++ b/lib/config/config.default.yml @@ -903,11 +903,6 @@ Clusters: # If false, containers are scheduled on preemptible instances # only when requested by the submitter. # - # Note that arvados-cwl-runner does not currently offer a - # feature to request preemptible instances, so this value - # effectively acts as a cluster-wide decision about whether to - # use preemptible instances. - # # This flag is ignored if no preemptible instance types are # configured, and has no effect on top-level containers. AlwaysUsePreemptibleInstances: true diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py index 826467cc09..c73b358ecc 100644 --- a/sdk/cwl/arvados_cwl/__init__.py +++ b/sdk/cwl/arvados_cwl/__init__.py @@ -213,6 +213,10 @@ def arg_parser(): # type: () -> argparse.ArgumentParser parser.add_argument("--http-timeout", type=int, default=5*60, dest="http_timeout", help="API request timeout in seconds. Default is 300 seconds (5 minutes).") + exgroup = parser.add_mutually_exclusive_group() + exgroup.add_argument("--enable-preemptible", dest="enable_preemptible", default=None, action="store_true", help="Use preemptible instances. Control individual steps with arv:UsePreemptible hint.") + exgroup.add_argument("--disable-preemptible", dest="enable_preemptible", default=None, action="store_false", help="Don't use preemptible instances.") + parser.add_argument( "--skip-schemas", action="store_true", @@ -255,7 +259,8 @@ def add_arv_hints(): "http://arvados.org/cwl#ClusterTarget", "http://arvados.org/cwl#OutputStorageClass", "http://arvados.org/cwl#ProcessProperties", - "http://commonwl.org/cwltool#CUDARequirement" + "http://commonwl.org/cwltool#CUDARequirement", + "http://arvados.org/cwl#UsePreemptible", ]) def exit_signal_handler(sigcode, frame): diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml index 6e2d4f1d92..443a027aea 100644 --- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml +++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.0.yml @@ -385,3 +385,18 @@ $graph: doc: | Maximum number of GPU devices to request. If not specified, same as `cudaDeviceCountMin`. + +- name: UsePreemptible + type: record + extends: cwl:ProcessRequirement + inVocab: false + doc: | + Specify a workflow step should opt-in or opt-out of using preemptible (spot) instances. + fields: + class: + type: string + doc: "Always 'arv:ProcessProperties" + jsonldPredicate: + _id: "@type" + _type: "@vocab" + usePreemptible: boolean diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml index 0e81347d72..633edaad23 100644 --- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml +++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.1.yml @@ -328,3 +328,18 @@ $graph: doc: | Maximum number of GPU devices to request. If not specified, same as `cudaDeviceCountMin`. + +- name: UsePreemptible + type: record + extends: cwl:ProcessRequirement + inVocab: false + doc: | + Specify a workflow step should opt-in or opt-out of using preemptible (spot) instances. + fields: + class: + type: string + doc: "Always 'arv:ProcessProperties" + jsonldPredicate: + _id: "@type" + _type: "@vocab" + usePreemptible: boolean diff --git a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml index e9f70bf1cf..8ca064a6e5 100644 --- a/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml +++ b/sdk/cwl/arvados_cwl/arv-cwl-schema-v1.2.yml @@ -330,3 +330,18 @@ $graph: doc: | Maximum number of GPU devices to request. If not specified, same as `cudaDeviceCountMin`. + +- name: UsePreemptible + type: record + extends: cwl:ProcessRequirement + inVocab: false + doc: | + Specify a workflow step should opt-in or opt-out of using preemptible (spot) instances. + fields: + class: + type: string + doc: "Always 'arv:ProcessProperties" + jsonldPredicate: + _id: "@type" + _type: "@vocab" + usePreemptible: boolean diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py index 2a5ff3a13a..8c468dd22d 100644 --- a/sdk/cwl/arvados_cwl/arvcontainer.py +++ b/sdk/cwl/arvados_cwl/arvcontainer.py @@ -300,6 +300,17 @@ class ArvadosContainer(JobBase): "hardware_capability": aslist(cuda_req["cudaComputeCapability"])[0] } + if runtimeContext.enable_preemptible is False: + scheduling_parameters["preemptible"] = False + else: + preemptible_req, _ = self.get_requirement("http://arvados.org/cwl#UsePreemptible") + if preemptible_req: + scheduling_parameters["preemptible"] = preemptible_req["usePreemptible"] + elif runtimeContext.enable_preemptible is True: + scheduling_parameters["preemptible"] = True + elif runtimeContext.enable_preemptible is None: + pass + if self.timelimit is not None and self.timelimit > 0: scheduling_parameters["max_run_time"] = self.timelimit @@ -550,6 +561,12 @@ class RunnerContainer(Runner): if self.enable_dev: command.append("--enable-dev") + if runtimeContext.enable_preemptible is True: + command.append("--enable-preemptible") + + if runtimeContext.enable_preemptible is False: + command.append("--disable-preemptible") + command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"]) container_req["command"] = command diff --git a/sdk/cwl/arvados_cwl/context.py b/sdk/cwl/arvados_cwl/context.py index 4239dd3b51..316250106b 100644 --- a/sdk/cwl/arvados_cwl/context.py +++ b/sdk/cwl/arvados_cwl/context.py @@ -37,6 +37,7 @@ class ArvRuntimeContext(RuntimeContext): self.always_submit_runner = False self.collection_cache_size = 256 self.match_local_docker = False + self.enable_preemptible = None super(ArvRuntimeContext, self).__init__(kwargs) diff --git a/sdk/cwl/arvados_cwl/runner.py b/sdk/cwl/arvados_cwl/runner.py index ad17950a2f..38e2c4d806 100644 --- a/sdk/cwl/arvados_cwl/runner.py +++ b/sdk/cwl/arvados_cwl/runner.py @@ -40,7 +40,7 @@ import schema_salad.validate as validate import arvados.collection from .util import collectionUUID -import ruamel.yaml as yaml +from ruamel.yaml import YAML from ruamel.yaml.comments import CommentedMap, CommentedSeq import arvados_cwl.arvdocker @@ -265,7 +265,8 @@ def upload_dependencies(arvrunner, name, document_loader, textIO = StringIO(text.decode('utf-8')) else: textIO = StringIO(text) - return yaml.safe_load(textIO) + yamlloader = YAML(typ='safe', pure=True) + return yamlloader.load(textIO) else: return {} diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py index 3de90c8d88..798c5af289 100644 --- a/sdk/cwl/tests/test_container.py +++ b/sdk/cwl/tests/test_container.py @@ -1234,6 +1234,103 @@ class TestContainer(unittest.TestCase): body=JsonDiffMatcher(container_request)) + # The test passes no builder.resources + # Hence the default resources will apply: {'cores': 1, 'ram': 1024, 'outdirSize': 1024, 'tmpdirSize': 1024} + @mock.patch("arvados.commands.keepdocker.list_images_in_arv") + def test_run_preemptible_hint(self, keepdocker): + arvados_cwl.add_arv_hints() + for enable_preemptible in (None, True, False): + for preemptible_hint in (None, True, False): + arv_docker_clear_cache() + + runner = mock.MagicMock() + runner.ignore_docker_for_reuse = False + runner.intermediate_output_ttl = 0 + runner.secret_store = cwltool.secrets.SecretStore() + runner.api._rootDesc = {"revision": "20210628"} + + keepdocker.return_value = [("zzzzz-4zz18-zzzzzzzzzzzzzz3", "")] + runner.api.collections().get().execute.return_value = { + "portable_data_hash": "99999999999999999999999999999993+99"} + + if preemptible_hint is not None: + hints = [{ + "class": "http://arvados.org/cwl#UsePreemptible", + "usePreemptible": preemptible_hint + }] + else: + hints = [] + + tool = cmap({ + "inputs": [], + "outputs": [], + "baseCommand": "ls", + "arguments": [{"valueFrom": "$(runtime.outdir)"}], + "id": "", + "class": "CommandLineTool", + "cwlVersion": "v1.2", + "hints": hints + }) + + loadingContext, runtimeContext = self.helper(runner) + + runtimeContext.name = 'test_run_enable_preemptible_'+str(enable_preemptible)+str(preemptible_hint) + runtimeContext.enable_preemptible = enable_preemptible + + arvtool = cwltool.load_tool.load_tool(tool, loadingContext) + arvtool.formatgraph = None + + # Test the interactions between --enable/disable-preemptible + # and UsePreemptible hint + + if enable_preemptible is None: + if preemptible_hint is None: + sched = {} + else: + sched = {'preemptible': preemptible_hint} + else: + if preemptible_hint is None: + sched = {'preemptible': enable_preemptible} + else: + sched = {'preemptible': enable_preemptible and preemptible_hint} + + for j in arvtool.job({}, mock.MagicMock(), runtimeContext): + j.run(runtimeContext) + runner.api.container_requests().create.assert_called_with( + body=JsonDiffMatcher({ + 'environment': { + 'HOME': '/var/spool/cwl', + 'TMPDIR': '/tmp' + }, + 'name': runtimeContext.name, + 'runtime_constraints': { + 'vcpus': 1, + 'ram': 268435456 + }, + 'use_existing': True, + 'priority': 500, + 'mounts': { + '/tmp': {'kind': 'tmp', + "capacity": 1073741824 + }, + '/var/spool/cwl': {'kind': 'tmp', + "capacity": 1073741824 } + }, + 'state': 'Committed', + 'output_name': 'Output for step '+runtimeContext.name, + 'owner_uuid': 'zzzzz-8i9sb-zzzzzzzzzzzzzzz', + 'output_path': '/var/spool/cwl', + 'output_ttl': 0, + 'container_image': '99999999999999999999999999999993+99', + 'command': ['ls', '/var/spool/cwl'], + 'cwd': '/var/spool/cwl', + 'scheduling_parameters': sched, + 'properties': {}, + 'secret_mounts': {}, + 'output_storage_classes': ["default"] + })) + + class TestWorkflow(unittest.TestCase): def setUp(self): diff --git a/sdk/cwl/tests/test_submit.py b/sdk/cwl/tests/test_submit.py index 10443359b9..61892bf2a4 100644 --- a/sdk/cwl/tests/test_submit.py +++ b/sdk/cwl/tests/test_submit.py @@ -1468,6 +1468,49 @@ class TestSubmit(unittest.TestCase): self.assertEqual(exited, 0) + @stubs + def test_submit_enable_preemptible(self, stubs): + exited = arvados_cwl.main( + ["--submit", "--no-wait", "--api=containers", "--debug", "--enable-preemptible", + "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"], + stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client) + + expect_container = copy.deepcopy(stubs.expect_container_spec) + expect_container['command'] = ['arvados-cwl-runner', '--local', '--api=containers', + '--no-log-timestamps', '--disable-validate', '--disable-color', + '--eval-timeout=20', '--thread-count=0', + '--enable-reuse', "--collection-cache-size=256", '--debug', '--on-error=continue', + '--enable-preemptible', + '/var/lib/cwl/workflow.json#main', '/var/lib/cwl/cwl.input.json'] + + stubs.api.container_requests().create.assert_called_with( + body=JsonDiffMatcher(expect_container)) + self.assertEqual(stubs.capture_stdout.getvalue(), + stubs.expect_container_request_uuid + '\n') + self.assertEqual(exited, 0) + + @stubs + def test_submit_disable_preemptible(self, stubs): + exited = arvados_cwl.main( + ["--submit", "--no-wait", "--api=containers", "--debug", "--disable-preemptible", + "tests/wf/submit_wf.cwl", "tests/submit_test_job.json"], + stubs.capture_stdout, sys.stderr, api_client=stubs.api, keep_client=stubs.keep_client) + + expect_container = copy.deepcopy(stubs.expect_container_spec) + expect_container['command'] = ['arvados-cwl-runner', '--local', '--api=containers', + '--no-log-timestamps', '--disable-validate', '--disable-color', + '--eval-timeout=20', '--thread-count=0', + '--enable-reuse', "--collection-cache-size=256", '--debug', '--on-error=continue', + '--disable-preemptible', + '/var/lib/cwl/workflow.json#main', '/var/lib/cwl/cwl.input.json'] + + stubs.api.container_requests().create.assert_called_with( + body=JsonDiffMatcher(expect_container)) + self.assertEqual(stubs.capture_stdout.getvalue(), + stubs.expect_container_request_uuid + '\n') + self.assertEqual(exited, 0) + + class TestCreateWorkflow(unittest.TestCase): existing_workflow_uuid = "zzzzz-7fd4e-validworkfloyml" expect_workflow = StripYAMLComments( -- 2.30.2