19982: Preemption resubmit tests pass
[arvados.git] / sdk / cwl / arvados_cwl / __init__.py
1 #!/usr/bin/env python3
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: Apache-2.0
5
6 # Implement cwl-runner interface for submitting and running work on Arvados, using
7 # the Crunch containers API.
8
9 import argparse
10 import importlib.metadata
11 import importlib.resources
12 import logging
13 import os
14 import sys
15 import re
16
17 from schema_salad.sourceline import SourceLine
18 import schema_salad.validate as validate
19 import cwltool.main
20 import cwltool.workflow
21 import cwltool.process
22 import cwltool.argparser
23 from cwltool.errors import WorkflowException
24 from cwltool.process import shortname, UnsupportedRequirement, use_custom_schema
25 from cwltool.utils import adjustFileObjs, adjustDirObjs, get_listing
26
27 import arvados
28 import arvados.config
29 import arvados.logging
30 from arvados.keep import KeepClient
31 from arvados.errors import ApiError
32 import arvados.commands._util as arv_cmd
33
34 from .perf import Perf
35 from ._version import __version__
36 from .executor import ArvCwlExecutor
37 from .fsaccess import workflow_uuid_pattern
38
39 # These aren't used directly in this file but
40 # other code expects to import them from here
41 from .arvcontainer import ArvadosContainer
42 from .arvtool import ArvadosCommandTool
43 from .fsaccess import CollectionFsAccess, CollectionCache, CollectionFetcher
44 from .util import get_current_container
45 from .executor import RuntimeStatusLoggingHandler, DEFAULT_PRIORITY
46 from .arvworkflow import ArvadosWorkflow
47
# Module-level loggers: the main runner log, which defaults to INFO,
# and a separate logger for timing metrics.
logger = logging.getLogger('arvados.cwl-runner')
logger.setLevel(logging.INFO)
metrics = logging.getLogger('arvados.cwl-runner.metrics')

# Start out with timestamped log lines.  main() installs its own
# formatter later depending on the log_timestamps option.
arvados.log_handler.setFormatter(
    logging.Formatter(
        fmt='%(asctime)s %(name)s %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
)
55
def versionstring():
    """Return the version string of key packages for provenance and debugging.

    The string starts with the program name (sys.argv[0]) followed by the
    installed versions of arvados-cwl-runner, arvados-python-client and
    cwltool as reported by importlib.metadata.
    """
    package_versions = [
        importlib.metadata.version(package)
        for package in ('arvados-cwl-runner', 'arvados-python-client', 'cwltool')
    ]
    template = "{} {}, arvados-python-client {}, cwltool {}"
    return template.format(sys.argv[0], *package_versions)
64
def arg_parser() -> argparse.ArgumentParser:
    """Build the command line parser for the Arvados CWL runner.

    The parser inherits the options defined by arv_cmd.retry_opt and adds
    the runner's own options.  Paired on/off switches are placed in
    mutually exclusive groups and share a single dest.  Options whose help
    is argparse.SUPPRESS are accepted but hidden from --help output.

    Note that versionstring() is evaluated while the parser is being
    built (for the --version action), not when --version is used.

    Returns:
        A configured argparse.ArgumentParser with a required positional
        "workflow" argument followed by the remaining "job_order" words.
    """
    parser = argparse.ArgumentParser(
        description='Arvados executor for Common Workflow Language',
        parents=[arv_cmd.retry_opt],
    )

    parser.add_argument("--basedir",
                        help="Base directory used to resolve relative references in the input, default to directory of input object file or current directory (if inputs piped/provided on command line).")
    parser.add_argument("--outdir", default=os.path.abspath('.'),
                        help="Output directory, default current directory")

    parser.add_argument("--eval-timeout",
                        help="Time to wait for a Javascript expression to evaluate before giving an error, default 20s.",
                        type=float,
                        default=20)

    # Actions that inspect the workflow instead of running it.
    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--print-dot", action="store_true",
                         help="Print workflow visualization in graphviz format and exit")
    exgroup.add_argument("--version", action="version", help="Print version and exit", version=versionstring())
    exgroup.add_argument("--validate", action="store_true", help="Validate CWL document only.")

    # Log verbosity.
    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--verbose", action="store_true", help="Default logging")
    exgroup.add_argument("--quiet", action="store_true", help="Only print warnings and errors.")
    exgroup.add_argument("--debug", action="store_true", help="Print even more logging")

    parser.add_argument("--metrics", action="store_true", help="Print timing metrics")

    parser.add_argument("--tool-help", action="store_true", help="Print command line help for tool")

    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--enable-reuse", action="store_true",
                        default=True, dest="enable_reuse",
                        help="Enable container reuse (default)")
    exgroup.add_argument("--disable-reuse", action="store_false",
                        default=True, dest="enable_reuse",
                        help="Disable container reuse")

    parser.add_argument("--project-uuid", metavar="UUID", help="Project that will own the workflow containers, if not provided, will go to home project.")
    parser.add_argument("--output-name", help="Name to use for collection that stores the final output.", default=None)
    parser.add_argument("--output-tags", help="Tags for the final output collection separated by commas, e.g., '--output-tags tag0,tag1,tag2'.", default=None)
    parser.add_argument("--ignore-docker-for-reuse", action="store_true",
                        help="Ignore Docker image version when deciding whether to reuse past containers.",
                        default=False)

    # What to do with the workflow: run it (submitted or local), register
    # or update it, or list its Keep dependencies.
    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--submit", action="store_true", help="Submit workflow to run on Arvados.",
                        default=True, dest="submit")
    exgroup.add_argument("--local", action="store_false", help="Run workflow on local host (submits containers to Arvados).",
                        default=True, dest="submit")
    exgroup.add_argument("--create-template", action="store_true", help="(Deprecated) synonym for --create-workflow.",
                         dest="create_workflow")
    exgroup.add_argument("--create-workflow", action="store_true", help="Register an Arvados workflow that can be run from Workbench")
    exgroup.add_argument("--update-workflow", metavar="UUID", help="Update an existing Arvados workflow with the given UUID.")

    exgroup.add_argument("--print-keep-deps", action="store_true", help="To assist copying, print a list of Keep collections that this workflow depends on.")

    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--wait", action="store_true", help="After submitting workflow runner, wait for completion.",
                        default=True, dest="wait")
    exgroup.add_argument("--no-wait", action="store_false", help="Submit workflow runner and exit.",
                        default=True, dest="wait")

    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--log-timestamps", action="store_true", help="Prefix logging lines with timestamp",
                        default=True, dest="log_timestamps")
    exgroup.add_argument("--no-log-timestamps", action="store_false", help="No timestamp on logging lines",
                        default=True, dest="log_timestamps")

    parser.add_argument("--api",
                        default=None, dest="work_api",
                        choices=("containers",),
                        help="Select work submission API.  Only supports 'containers'")

    parser.add_argument("--compute-checksum", action="store_true", default=False,
                        help="Compute checksum of contents while collecting outputs",
                        dest="compute_checksum")

    parser.add_argument("--submit-runner-ram", type=int,
                        help="RAM (in MiB) required for the workflow runner job (default 1024)",
                        default=None)

    parser.add_argument("--submit-runner-image",
                        help="Docker image for workflow runner job, default arvados/jobs:%s" % __version__,
                        default=None)

    parser.add_argument("--always-submit-runner", action="store_true",
                        help="When invoked with --submit --wait, always submit a runner to manage the workflow, even when only running a single CommandLineTool",
                        default=False)

    parser.add_argument("--match-submitter-images", action="store_true",
                        default=False, dest="match_local_docker",
                        help="Where Arvados has more than one Docker image of the same name, use image from the Docker instance on the submitting node.")

    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--submit-request-uuid",
                         default=None,
                         help="Update and commit to supplied container request instead of creating a new one.",
                         metavar="UUID")
    exgroup.add_argument("--submit-runner-cluster",
                         help="Submit workflow runner to a remote cluster",
                         default=None,
                         metavar="CLUSTER_ID")

    parser.add_argument("--collection-cache-size", type=int,
                        default=None,
                        help="Collection cache size (in MiB, default 256).")

    parser.add_argument("--name",
                        help="Name to use for workflow execution instance.",
                        default=None)

    parser.add_argument("--on-error",
                        help="Desired workflow behavior when a step fails.  One of 'stop' (do not submit any more steps) or "
                        "'continue' (may submit other steps that are not downstream from the error). Default is 'continue'.",
                        default="continue", choices=("stop", "continue"))

    parser.add_argument("--enable-dev", action="store_true",
                        help="Enable loading and running development versions "
                             "of the CWL standards.", default=False)
    parser.add_argument('--storage-classes', default="default",
                        help="Specify comma separated list of storage classes to be used when saving final workflow output to Keep.")
    parser.add_argument('--intermediate-storage-classes', default="default",
                        help="Specify comma separated list of storage classes to be used when saving intermediate workflow output to Keep.")

    parser.add_argument("--intermediate-output-ttl", type=int, metavar="N",
                        help="If N > 0, intermediate output collections will be trashed N seconds after creation.  Default is 0 (don't trash).",
                        default=0)

    parser.add_argument("--priority", type=int,
                        help="Workflow priority (range 1..1000, higher has precedence over lower)",
                        default=DEFAULT_PRIORITY)

    # Hidden switches (help suppressed).
    parser.add_argument("--disable-validate", dest="do_validate",
                        action="store_false", default=True,
                        help=argparse.SUPPRESS)

    parser.add_argument("--disable-git", dest="git_info",
                        action="store_false", default=True,
                        help=argparse.SUPPRESS)

    parser.add_argument("--disable-color", dest="enable_color",
                        action="store_false", default=True,
                        help=argparse.SUPPRESS)

    parser.add_argument("--disable-js-validation",
                        action="store_true", default=False,
                        help=argparse.SUPPRESS)

    parser.add_argument("--fast-parser", dest="fast_parser",
                        action="store_true", default=False,
                        help=argparse.SUPPRESS)

    parser.add_argument("--thread-count", type=int,
                        default=0, help="Number of threads to use for job submit and output collection.")

    parser.add_argument("--http-timeout", type=int,
                        default=5*60, dest="http_timeout", help="API request timeout in seconds. Default is 300 seconds (5 minutes).")

    parser.add_argument("--defer-downloads", action="store_true", default=False,
                        help="When submitting a workflow, defer downloading HTTP URLs to workflow launch instead of downloading to Keep before submit.")

    parser.add_argument("--varying-url-params", type=str, default="",
                        help="A comma separated list of URL query parameters that should be ignored when storing HTTP URLs in Keep.")

    parser.add_argument("--prefer-cached-downloads", action="store_true", default=False,
                        help="If a HTTP URL is found in Keep, skip upstream URL freshness check (will not notice if the upstream has changed, but also not error if upstream is unavailable).")

    # The following on/off pairs default to None so that "not specified"
    # can be told apart from an explicit enable or disable.
    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--enable-preemptible", dest="enable_preemptible", default=None, action="store_true", help="Use preemptible instances. Control individual steps with arv:UsePreemptible hint.")
    exgroup.add_argument("--disable-preemptible", dest="enable_preemptible", default=None, action="store_false", help="Don't use preemptible instances.")

    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--enable-resubmit-non-preemptible", dest="enable_resubmit_non_preemptible",
                         default=None, action="store_true",
                         help="If a workflow step fails due to the instance it is running on being preempted, re-submit the container with the `preemptible` flag disabled. Control individual steps with arv:PreemptionBehavior hint.")
    exgroup.add_argument("--disable-resubmit-non-preemptible", dest="enable_resubmit_non_preemptible",
                         default=None, action="store_false",
                         help="Don't resubmit when a preemptible instance is reclaimed.")

    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--copy-deps", dest="copy_deps", default=None, action="store_true", help="Copy dependencies into the destination project.")
    exgroup.add_argument("--no-copy-deps", dest="copy_deps", default=None, action="store_false", help="Leave dependencies where they are.")

    parser.add_argument(
        "--skip-schemas",
        action="store_true",
        help="Skip loading of schemas",
        default=False,
        dest="skip_schemas",
    )

    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--trash-intermediate", action="store_true",
                        default=False, dest="trash_intermediate",
                         help="Immediately trash intermediate outputs on workflow success.")
    exgroup.add_argument("--no-trash-intermediate", action="store_false",
                        default=False, dest="trash_intermediate",
                        help="Do not trash intermediate outputs (default).")

    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--enable-usage-report", dest="enable_usage_report", default=None, action="store_true", help="Create usage_report.html with a summary of each step's resource usage.")
    exgroup.add_argument("--disable-usage-report", dest="enable_usage_report", default=None, action="store_false", help="Disable usage report.")

    # Positionals: the workflow, then everything after it is passed along
    # untouched as the input object / job order.
    parser.add_argument("workflow", default=None, help="The workflow to execute")
    parser.add_argument("job_order", nargs=argparse.REMAINDER, help="The input object to the workflow.")

    return parser
274
def add_arv_hints():
    """Register the Arvados CWL extensions with cwltool.

    This does three things:

    * replaces cwltool's command line tool accept-list regular expressions
      with one that matches anything;
    * loads the packaged arv-cwl-schema-<version>.yml for CWL v1.0, v1.1
      and v1.2 and registers each under the http://arvados.org/cwl
      namespace;
    * adds the extension requirement URIs to
      cwltool.process.supportedProcessRequirements.

    The function is called more than once per run (main() calls it
    directly and also hands it to cwltool as custom_schema_callback), so
    requirement URIs are only appended when not already present.  This
    keeps the global list from accumulating duplicates.
    """
    cwltool.command_line_tool.ACCEPTLIST_EN_RELAXED_RE = re.compile(r".*")
    cwltool.command_line_tool.ACCEPTLIST_RE = cwltool.command_line_tool.ACCEPTLIST_EN_RELAXED_RE
    supported_versions = ["v1.0", "v1.1", "v1.2"]
    for s in supported_versions:
        customschema = importlib.resources.read_text(__name__, f'arv-cwl-schema-{s}.yml', 'utf-8')
        use_custom_schema(s, "http://arvados.org/cwl", customschema)

    extension_requirements = [
        "http://arvados.org/cwl#RunInSingleContainer",
        "http://arvados.org/cwl#OutputDirType",
        "http://arvados.org/cwl#RuntimeConstraints",
        "http://arvados.org/cwl#PartitionRequirement",
        "http://arvados.org/cwl#APIRequirement",
        "http://commonwl.org/cwltool#LoadListingRequirement",
        "http://arvados.org/cwl#IntermediateOutput",
        "http://arvados.org/cwl#ReuseRequirement",
        "http://arvados.org/cwl#ClusterTarget",
        "http://arvados.org/cwl#OutputStorageClass",
        "http://arvados.org/cwl#ProcessProperties",
        "http://commonwl.org/cwltool#CUDARequirement",
        "http://arvados.org/cwl#UsePreemptible",
        "http://arvados.org/cwl#OutputCollectionProperties",
        "http://arvados.org/cwl#KeepCacheTypeRequirement",
        "http://arvados.org/cwl#OutOfMemoryRetry",
        "http://arvados.org/cwl#PreemptionBehavior",
    ]
    supported = cwltool.process.supportedProcessRequirements
    for requirement in extension_requirements:
        # Append in the original order, skipping anything a previous call
        # (or cwltool itself) has already registered.
        if requirement not in supported:
            supported.append(requirement)
301
def exit_signal_handler(sigcode, frame):
    """Signal handler: log the received signal and exit the process.

    Takes the (signal number, stack frame) arguments of a signal handler;
    the frame is not used.  The exit status is the negated signal number.
    """
    message = "Caught signal {}, exiting.".format(sigcode)
    logger.error(message)
    sys.exit(-sigcode)
305
def main(args=sys.argv[1:],
         stdout=sys.stdout,
         stderr=sys.stderr,
         api_client=None,
         keep_client=None,
         install_sig_handlers=True):
    """Parse the command line, set up the Arvados executor and run cwltool.

    Arguments:
        args: command line words to parse (program name excluded).
        stdout: stream for workflow output.  When this is sys.stdout
            itself, None is passed to cwltool instead so cwltool applies
            its own stdout encoding handling.
        stderr: stream passed through to cwltool.
        api_client: Arvados API client to use; one is created when None.
        keep_client: Keep client to use; when a new API client is created
            its own Keep client is used, otherwise one is built if None.
        install_sig_handlers: install arv_cmd's signal handlers when true.

    Returns:
        1 if --update-workflow conflicts with --api or if the executor
        could not be created, otherwise whatever cwltool.main.main returns.
    """
    arvargs = arg_parser().parse_args(args)

    # Settings cwltool expects on the namespace that this runner pins.
    arvargs.use_container = True
    arvargs.relax_path_checks = True
    arvargs.print_supported_versions = False

    if install_sig_handlers:
        arv_cmd.install_signal_handlers()

    if arvargs.update_workflow:
        # A UUID with '-7fd4e-' at offset 5 selects the containers API
        # (presumably the workflow object type infix -- verify).
        want_api = 'containers' if arvargs.update_workflow.find('-7fd4e-') == 5 else None
        if want_api and arvargs.work_api and want_api != arvargs.work_api:
            logger.error(
                '--update-workflow arg {!r} uses {!r} API, but --api={!r} specified'.format(
                    arvargs.update_workflow, want_api, arvargs.work_api))
            return 1
        arvargs.work_api = want_api

    workflow_op = arvargs.create_workflow or arvargs.update_workflow or arvargs.print_keep_deps

    # Workflow operations given no input get an empty input object.
    job_order_object = ({}, "") if (workflow_op and not arvargs.job_order) else None

    add_arv_hints()

    # Fill in every cwltool default that our own parser did not define.
    for opt_name, opt_default in cwltool.argparser.get_default_args().items():
        if not hasattr(arvargs, opt_name):
            setattr(arvargs, opt_name, opt_default)

    try:
        if api_client is None:
            api_client = arvados.safeapi.ThreadSafeApiCache(
                api_params={'num_retries': arvargs.retries, 'timeout': arvargs.http_timeout},
                keep_params={'num_retries': arvargs.retries},
                version='v1',
            )
            keep_client = api_client.keep
            # Make an API object now so errors are reported early.
            api_client.users().current().execute()
        if keep_client is None:
            keep_client = arvados.keep.KeepClient(
                api_client=api_client,
                block_cache=arvados.keep.KeepBlockCache(disk_cache=True),
                num_retries=arvargs.retries,
            )
        executor = ArvCwlExecutor(
            api_client,
            arvargs,
            keep_client=keep_client,
            num_retries=arvargs.retries,
            stdout=stdout,
        )
    except WorkflowException as e:
        # Only include the traceback when --debug was requested.
        logger.error(e, exc_info=(e if arvargs.debug else False))
        return 1
    except Exception:
        logger.exception("Error creating the Arvados CWL Executor")
        return 1

    # Note that unless in debug mode, some stack traces related to user
    # workflow errors may be suppressed.

    # Set the logging on most modules INFO (instead of default which is WARNING)
    arvados_logger = logging.getLogger('arvados')
    logger.setLevel(logging.INFO)
    arvados_logger.setLevel(logging.INFO)
    logging.getLogger('arvados.keep').setLevel(logging.WARNING)
    # API retries are filtered to the INFO level and can be noisy, but as long as
    # they succeed we don't need to see warnings about it.
    googleapiclient_http_logger = logging.getLogger('googleapiclient.http')
    googleapiclient_http_logger.addFilter(arvados.logging.GoogleHTTPClientFilter())
    googleapiclient_http_logger.setLevel(logging.WARNING)

    if arvargs.debug:
        logger.setLevel(logging.DEBUG)
        arvados_logger.setLevel(logging.DEBUG)
        # In debug mode show logs about retries, but we aren't
        # debugging the google client so we don't need to see
        # everything.
        googleapiclient_http_logger.setLevel(logging.NOTSET)
        logging.getLogger('googleapiclient').setLevel(logging.INFO)

    if arvargs.quiet:
        for quiet_logger in (logger, arvados_logger, logging.getLogger('arvados.arv-run')):
            quiet_logger.setLevel(logging.WARNING)

    if arvargs.metrics:
        metrics.setLevel(logging.DEBUG)
        logging.getLogger("cwltool.metrics").setLevel(logging.DEBUG)

    if arvargs.log_timestamps:
        log_formatter = logging.Formatter(
            '%(asctime)s %(name)s %(levelname)s: %(message)s',
            '%Y-%m-%d %H:%M:%S')
    else:
        log_formatter = logging.Formatter('%(name)s %(levelname)s: %(message)s')
    arvados.log_handler.setFormatter(log_formatter)

    if stdout is sys.stdout:
        # cwltool.main has code to work around encoding issues with
        # sys.stdout and unix pipes (they default to ASCII encoding,
        # we want utf-8), so when stdout is sys.stdout set it to None
        # to take advantage of that.  Don't override it for all cases
        # since we still want to be able to capture stdout for the
        # unit tests.
        stdout = None

    executor.loadingContext.default_docker_image = arvargs.submit_runner_image or "arvados/jobs:"+__version__

    # Workflows referenced by arvwf:/keep: prefix or by workflow UUID skip
    # validation, and plain submissions of them take the fast path.
    stored_workflow = (arvargs.workflow.startswith(("arvwf:", "keep:"))
                       or workflow_uuid_pattern.match(arvargs.workflow))
    if stored_workflow:
        executor.loadingContext.do_validate = False
        if arvargs.submit and not workflow_op:
            executor.fast_submit = True

    return cwltool.main.main(
        args=arvargs,
        stdout=stdout,
        stderr=stderr,
        executor=executor.arv_executor,
        versionfunc=versionstring,
        job_order_object=job_order_object,
        logger_handler=arvados.log_handler,
        custom_schema_callback=add_arv_hints,
        loadingContext=executor.loadingContext,
        runtimeContext=executor.toplevel_runtimeContext,
        input_required=not workflow_op,
    )