1759e4ac2829a4840895d47e465fdfcad6a2bf1d
[arvados.git] / sdk / cwl / arvados_cwl / executor.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 from __future__ import division
6 from builtins import next
7 from builtins import object
8 from builtins import str
9 from future.utils import viewvalues, viewitems
10
11 import argparse
12 import logging
13 import os
14 import sys
15 import threading
16 import copy
17 import json
18 import re
19 from functools import partial
20 import time
21 import urllib
22
23 from cwltool.errors import WorkflowException
24 import cwltool.workflow
25 from schema_salad.sourceline import SourceLine
26 import schema_salad.validate as validate
27
28 import arvados
29 import arvados.config
30 from arvados.keep import KeepClient
31 from arvados.errors import ApiError
32
33 import arvados_cwl.util
34 from .arvcontainer import RunnerContainer
35 from .runner import Runner, upload_docker, upload_job_order, upload_workflow_deps
36 from .arvtool import ArvadosCommandTool, validate_cluster_target, ArvadosExpressionTool
37 from .arvworkflow import ArvadosWorkflow, upload_workflow
38 from .fsaccess import CollectionFsAccess, CollectionFetcher, collectionResolver, CollectionCache, pdh_size
39 from .perf import Perf
40 from .pathmapper import NoFollowPathMapper
41 from cwltool.task_queue import TaskQueue
42 from .context import ArvLoadingContext, ArvRuntimeContext
43 from ._version import __version__
44
45 from cwltool.process import shortname, UnsupportedRequirement, use_custom_schema
46 from cwltool.utils import adjustFileObjs, adjustDirObjs, get_listing, visit_class, aslist
47 from cwltool.command_line_tool import compute_checksums
48 from cwltool.load_tool import load_tool
49
# Logger used throughout this module for runner messages.
logger = logging.getLogger('arvados.cwl-runner')
# Child logger of the runner logger, named for metrics output.
metrics = logging.getLogger('arvados.cwl-runner.metrics')

# NOTE(review): not referenced in the visible part of this file; presumably
# the default priority for submitted work (arv_executor accepts 1..1000) --
# confirm against the callers.
DEFAULT_PRIORITY = 500
54
class RuntimeStatusLoggingHandler(logging.Handler):
    """
    Logging handler that forwards warning and error records to a callback
    so they can be reported as runtime statuses on runner containers.
    """
    def __init__(self, runtime_status_update_func):
        super(RuntimeStatusLoggingHandler, self).__init__()
        self.runtime_status_update = runtime_status_update_func
        # True while a status update is in flight; records emitted during
        # that window are dropped instead of triggering another update.
        self.updatingRuntimeStatus = False

    def emit(self, record):
        # Only WARNING and above are reported; everything else is ignored.
        if record.levelno >= logging.ERROR:
            kind = 'error'
        elif record.levelno >= logging.WARNING:
            kind = 'warning'
        else:
            return
        if self.updatingRuntimeStatus is True:
            return

        self.updatingRuntimeStatus = True
        try:
            # For a multi-line message the first line becomes the status
            # and the remainder is passed along as detail.
            headline, newline, remainder = record.getMessage().partition('\n')
            summary = "%s: %s" % (record.name, headline)
            if newline:
                self.runtime_status_update(kind, summary, remainder)
            else:
                self.runtime_status_update(kind, summary)
        finally:
            self.updatingRuntimeStatus = False
91
92
93 class ArvCwlExecutor(object):
94     """Execute a CWL tool or workflow, submit work (using containers API),
95     wait for them to complete, and report output.
96
97     """
98
99     def __init__(self, api_client,
100                  arvargs=None,
101                  keep_client=None,
102                  num_retries=4,
103                  thread_count=4,
104                  stdout=sys.stdout):
105
106         if arvargs is None:
107             arvargs = argparse.Namespace()
108             arvargs.work_api = None
109             arvargs.output_name = None
110             arvargs.output_tags = None
111             arvargs.thread_count = 1
112             arvargs.collection_cache_size = None
113
114         self.api = api_client
115         self.processes = {}
116         self.workflow_eval_lock = threading.Condition(threading.RLock())
117         self.final_output = None
118         self.final_status = None
119         self.num_retries = num_retries
120         self.uuid = None
121         self.stop_polling = threading.Event()
122         self.poll_api = None
123         self.pipeline = None
124         self.final_output_collection = None
125         self.output_name = arvargs.output_name
126         self.output_tags = arvargs.output_tags
127         self.project_uuid = None
128         self.intermediate_output_ttl = 0
129         self.intermediate_output_collections = []
130         self.trash_intermediate = False
131         self.thread_count = arvargs.thread_count
132         self.poll_interval = 12
133         self.loadingContext = None
134         self.should_estimate_cache_size = True
135         self.fs_access = None
136         self.secret_store = None
137         self.stdout = stdout
138
139         if keep_client is not None:
140             self.keep_client = keep_client
141         else:
142             self.keep_client = arvados.keep.KeepClient(api_client=self.api, num_retries=self.num_retries)
143
144         if arvargs.collection_cache_size:
145             collection_cache_size = arvargs.collection_cache_size*1024*1024
146             self.should_estimate_cache_size = False
147         else:
148             collection_cache_size = 256*1024*1024
149
150         self.collection_cache = CollectionCache(self.api, self.keep_client, self.num_retries,
151                                                 cap=collection_cache_size)
152
153         self.fetcher_constructor = partial(CollectionFetcher,
154                                            api_client=self.api,
155                                            fs_access=CollectionFsAccess("", collection_cache=self.collection_cache),
156                                            num_retries=self.num_retries)
157
158         self.work_api = None
159         expected_api = ["containers"]
160         for api in expected_api:
161             try:
162                 methods = self.api._rootDesc.get('resources')[api]['methods']
163                 if ('httpMethod' in methods['create'] and
164                     (arvargs.work_api == api or arvargs.work_api is None)):
165                     self.work_api = api
166                     break
167             except KeyError:
168                 pass
169
170         if not self.work_api:
171             if arvargs.work_api is None:
172                 raise Exception("No supported APIs")
173             else:
174                 raise Exception("Unsupported API '%s', expected one of %s" % (arvargs.work_api, expected_api))
175
176         if self.work_api == "jobs":
177             logger.error("""
178 *******************************
179 The 'jobs' API is no longer supported.
180 *******************************""")
181             exit(1)
182
183         self.loadingContext = ArvLoadingContext(vars(arvargs))
184         self.loadingContext.fetcher_constructor = self.fetcher_constructor
185         self.loadingContext.resolver = partial(collectionResolver, self.api, num_retries=self.num_retries)
186         self.loadingContext.construct_tool_object = self.arv_make_tool
187
188         # Add a custom logging handler to the root logger for runtime status reporting
189         # if running inside a container
190         if arvados_cwl.util.get_current_container(self.api, self.num_retries, logger):
191             root_logger = logging.getLogger('')
192
193             # Remove existing RuntimeStatusLoggingHandlers if they exist
194             handlers = [h for h in root_logger.handlers if not isinstance(h, RuntimeStatusLoggingHandler)]
195             root_logger.handlers = handlers
196
197             handler = RuntimeStatusLoggingHandler(self.runtime_status_update)
198             root_logger.addHandler(handler)
199
200         self.toplevel_runtimeContext = ArvRuntimeContext(vars(arvargs))
201         self.toplevel_runtimeContext.make_fs_access = partial(CollectionFsAccess,
202                                                      collection_cache=self.collection_cache)
203
204         validate_cluster_target(self, self.toplevel_runtimeContext)
205
206
207     def arv_make_tool(self, toolpath_object, loadingContext):
208         if "class" in toolpath_object and toolpath_object["class"] == "CommandLineTool":
209             return ArvadosCommandTool(self, toolpath_object, loadingContext)
210         elif "class" in toolpath_object and toolpath_object["class"] == "Workflow":
211             return ArvadosWorkflow(self, toolpath_object, loadingContext)
212         elif "class" in toolpath_object and toolpath_object["class"] == "ExpressionTool":
213             return ArvadosExpressionTool(self, toolpath_object, loadingContext)
214         else:
215             raise Exception("Unknown tool %s" % toolpath_object.get("class"))
216
217     def output_callback(self, out, processStatus):
218         with self.workflow_eval_lock:
219             if processStatus == "success":
220                 logger.info("Overall process status is %s", processStatus)
221                 state = "Complete"
222             else:
223                 logger.error("Overall process status is %s", processStatus)
224                 state = "Failed"
225             if self.pipeline:
226                 self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
227                                                         body={"state": state}).execute(num_retries=self.num_retries)
228             self.final_status = processStatus
229             self.final_output = out
230             self.workflow_eval_lock.notifyAll()
231
232
233     def start_run(self, runnable, runtimeContext):
234         self.task_queue.add(partial(runnable.run, runtimeContext),
235                             self.workflow_eval_lock, self.stop_polling)
236
237     def process_submitted(self, container):
238         with self.workflow_eval_lock:
239             self.processes[container.uuid] = container
240
241     def process_done(self, uuid, record):
242         with self.workflow_eval_lock:
243             j = self.processes[uuid]
244             logger.info("%s %s is %s", self.label(j), uuid, record["state"])
245             self.task_queue.add(partial(j.done, record),
246                                 self.workflow_eval_lock, self.stop_polling)
247             del self.processes[uuid]
248
249     def runtime_status_update(self, kind, message, detail=None):
250         """
251         Updates the runtime_status field on the runner container.
252         Called when there's a need to report errors, warnings or just
253         activity statuses, for example in the RuntimeStatusLoggingHandler.
254         """
255         with self.workflow_eval_lock:
256             current = None
257             try:
258                 current = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
259             except Exception as e:
260                 logger.info("Couldn't get current container: %s", e)
261             if current is None:
262                 return
263             runtime_status = current.get('runtime_status', {})
264             if kind in ('error', 'warning'):
265                 updatemessage = runtime_status.get(kind, "")
266                 if not updatemessage:
267                     updatemessage = message
268
269                 # Subsequent messages tacked on in detail
270                 updatedetail = runtime_status.get(kind+'Detail', "")
271                 maxlines = 40
272                 if updatedetail.count("\n") < maxlines:
273                     if updatedetail:
274                         updatedetail += "\n"
275                     updatedetail += message + "\n"
276
277                     if detail:
278                         updatedetail += detail + "\n"
279
280                     if updatedetail.count("\n") >= maxlines:
281                         updatedetail += "\nSome messages may have been omitted.  Check the full log."
282
283                 runtime_status.update({
284                     kind: updatemessage,
285                     kind+'Detail': updatedetail,
286                 })
287             else:
288                 # Ignore any other status kind
289                 return
290             try:
291                 self.api.containers().update(uuid=current['uuid'],
292                                             body={
293                                                 'runtime_status': runtime_status,
294                                             }).execute(num_retries=self.num_retries)
295             except Exception as e:
296                 logger.info("Couldn't update runtime_status: %s", e)
297
298     def wrapped_callback(self, cb, obj, st):
299         with self.workflow_eval_lock:
300             cb(obj, st)
301             self.workflow_eval_lock.notifyAll()
302
303     def get_wrapped_callback(self, cb):
304         return partial(self.wrapped_callback, cb)
305
306     def on_message(self, event):
307         if event.get("object_uuid") in self.processes and event["event_type"] == "update":
308             uuid = event["object_uuid"]
309             if event["properties"]["new_attributes"]["state"] == "Running":
310                 with self.workflow_eval_lock:
311                     j = self.processes[uuid]
312                     if j.running is False:
313                         j.running = True
314                         j.update_pipeline_component(event["properties"]["new_attributes"])
315                         logger.info("%s %s is Running", self.label(j), uuid)
316             elif event["properties"]["new_attributes"]["state"] in ("Complete", "Failed", "Cancelled", "Final"):
317                 self.process_done(uuid, event["properties"]["new_attributes"])
318
319     def label(self, obj):
320         return "[%s %s]" % (self.work_api[0:-1], obj.name)
321
322     def poll_states(self):
323         """Poll status of containers listed in the processes dict.
324
325         Runs in a separate thread.
326         """
327
328         try:
329             remain_wait = self.poll_interval
330             while True:
331                 if remain_wait > 0:
332                     self.stop_polling.wait(remain_wait)
333                 if self.stop_polling.is_set():
334                     break
335                 with self.workflow_eval_lock:
336                     keys = list(self.processes)
337                 if not keys:
338                     remain_wait = self.poll_interval
339                     continue
340
341                 begin_poll = time.time()
342                 if self.work_api == "containers":
343                     table = self.poll_api.container_requests()
344
345                 pageSize = self.poll_api._rootDesc.get('maxItemsPerResponse', 1000)
346
347                 while keys:
348                     page = keys[:pageSize]
349                     try:
350                         proc_states = table.list(filters=[["uuid", "in", page]]).execute(num_retries=self.num_retries)
351                     except Exception:
352                         logger.exception("Error checking states on API server: %s")
353                         remain_wait = self.poll_interval
354                         continue
355
356                     for p in proc_states["items"]:
357                         self.on_message({
358                             "object_uuid": p["uuid"],
359                             "event_type": "update",
360                             "properties": {
361                                 "new_attributes": p
362                             }
363                         })
364                     keys = keys[pageSize:]
365
366                 finish_poll = time.time()
367                 remain_wait = self.poll_interval - (finish_poll - begin_poll)
368         except:
369             logger.exception("Fatal error in state polling thread.")
370             with self.workflow_eval_lock:
371                 self.processes.clear()
372                 self.workflow_eval_lock.notifyAll()
373         finally:
374             self.stop_polling.set()
375
376     def add_intermediate_output(self, uuid):
377         if uuid:
378             self.intermediate_output_collections.append(uuid)
379
380     def trash_intermediate_output(self):
381         logger.info("Cleaning up intermediate output collections")
382         for i in self.intermediate_output_collections:
383             try:
384                 self.api.collections().delete(uuid=i).execute(num_retries=self.num_retries)
385             except Exception:
386                 logger.warning("Failed to delete intermediate output: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
387             except (KeyboardInterrupt, SystemExit):
388                 break
389
390     def check_features(self, obj, parentfield=""):
391         if isinstance(obj, dict):
392             if obj.get("class") == "DockerRequirement":
393                 if obj.get("dockerOutputDirectory"):
394                     if not obj.get("dockerOutputDirectory").startswith('/'):
395                         raise SourceLine(obj, "dockerOutputDirectory", validate.ValidationException).makeError(
396                             "Option 'dockerOutputDirectory' must be an absolute path.")
397             if obj.get("class") == "InplaceUpdateRequirement":
398                 if obj["inplaceUpdate"] and parentfield == "requirements":
399                     raise SourceLine(obj, "class", UnsupportedRequirement).makeError("InplaceUpdateRequirement not supported for keep collections.")
400             for k,v in viewitems(obj):
401                 self.check_features(v, parentfield=k)
402         elif isinstance(obj, list):
403             for i,v in enumerate(obj):
404                 with SourceLine(obj, i, UnsupportedRequirement, logger.isEnabledFor(logging.DEBUG)):
405                     self.check_features(v, parentfield=parentfield)
406
407     def make_output_collection(self, name, storage_classes, tagsString, outputObj):
408         outputObj = copy.deepcopy(outputObj)
409
410         files = []
411         def capture(fileobj):
412             files.append(fileobj)
413
414         adjustDirObjs(outputObj, capture)
415         adjustFileObjs(outputObj, capture)
416
417         generatemapper = NoFollowPathMapper(files, "", "", separateDirs=False)
418
419         final = arvados.collection.Collection(api_client=self.api,
420                                               keep_client=self.keep_client,
421                                               num_retries=self.num_retries)
422
423         for k,v in generatemapper.items():
424             if v.type == "Directory" and v.resolved.startswith("_:"):
425                     continue
426             if v.type == "CreateFile" and (k.startswith("_:") or v.resolved.startswith("_:")):
427                 with final.open(v.target, "wb") as f:
428                     f.write(v.resolved.encode("utf-8"))
429                     continue
430
431             if not v.resolved.startswith("keep:"):
432                 raise Exception("Output source is not in keep or a literal")
433             sp = v.resolved.split("/")
434             srccollection = sp[0][5:]
435             try:
436                 reader = self.collection_cache.get(srccollection)
437                 srcpath = urllib.parse.unquote("/".join(sp[1:]) if len(sp) > 1 else ".")
438                 final.copy(srcpath, v.target, source_collection=reader, overwrite=False)
439             except arvados.errors.ArgumentError as e:
440                 logger.error("Creating CollectionReader for '%s' '%s': %s", k, v, e)
441                 raise
442             except IOError as e:
443                 logger.error("While preparing output collection: %s", e)
444                 raise
445
446         def rewrite(fileobj):
447             fileobj["location"] = generatemapper.mapper(fileobj["location"]).target
448             for k in ("listing", "contents", "nameext", "nameroot", "dirname"):
449                 if k in fileobj:
450                     del fileobj[k]
451
452         adjustDirObjs(outputObj, rewrite)
453         adjustFileObjs(outputObj, rewrite)
454
455         with final.open("cwl.output.json", "w") as f:
456             res = str(json.dumps(outputObj, sort_keys=True, indent=4, separators=(',',': '), ensure_ascii=False))
457             f.write(res)
458
459         final.save_new(name=name, owner_uuid=self.project_uuid, storage_classes=storage_classes, ensure_unique_name=True)
460
461         logger.info("Final output collection %s \"%s\" (%s)", final.portable_data_hash(),
462                     final.api_response()["name"],
463                     final.manifest_locator())
464
465         final_uuid = final.manifest_locator()
466         tags = tagsString.split(',')
467         for tag in tags:
468              self.api.links().create(body={
469                 "head_uuid": final_uuid, "link_class": "tag", "name": tag
470                 }).execute(num_retries=self.num_retries)
471
472         def finalcollection(fileobj):
473             fileobj["location"] = "keep:%s/%s" % (final.portable_data_hash(), fileobj["location"])
474
475         adjustDirObjs(outputObj, finalcollection)
476         adjustFileObjs(outputObj, finalcollection)
477
478         return (outputObj, final)
479
480     def set_crunch_output(self):
481         if self.work_api == "containers":
482             current = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
483             if current is None:
484                 return
485             try:
486                 self.api.containers().update(uuid=current['uuid'],
487                                              body={
488                                                  'output': self.final_output_collection.portable_data_hash(),
489                                              }).execute(num_retries=self.num_retries)
490                 self.api.collections().update(uuid=self.final_output_collection.manifest_locator(),
491                                               body={
492                                                   'is_trashed': True
493                                               }).execute(num_retries=self.num_retries)
494             except Exception:
495                 logger.exception("Setting container output")
496                 raise
497
498     def apply_reqs(self, job_order_object, tool):
499         if "https://w3id.org/cwl/cwl#requirements" in job_order_object:
500             if tool.metadata.get("http://commonwl.org/cwltool#original_cwlVersion") == 'v1.0':
501                 raise WorkflowException(
502                     "`cwl:requirements` in the input object is not part of CWL "
503                     "v1.0. You can adjust to use `cwltool:overrides` instead; or you "
504                     "can set the cwlVersion to v1.1 or greater and re-run with "
505                     "--enable-dev.")
506             job_reqs = job_order_object["https://w3id.org/cwl/cwl#requirements"]
507             for req in job_reqs:
508                 tool.requirements.append(req)
509
510     def arv_executor(self, updated_tool, job_order, runtimeContext, logger=None):
511         self.debug = runtimeContext.debug
512
513         workbench1 = self.api.config()["Services"]["Workbench1"]["ExternalURL"]
514         workbench2 = self.api.config()["Services"]["Workbench2"]["ExternalURL"]
515         controller = self.api.config()["Services"]["Controller"]["ExternalURL"]
516         logger.info("Using cluster %s (%s)", self.api.config()["ClusterID"], workbench2 or workbench1 or controller)
517
518         updated_tool.visit(self.check_features)
519
520         self.pipeline = None
521         self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
522         self.secret_store = runtimeContext.secret_store
523
524         self.trash_intermediate = runtimeContext.trash_intermediate
525         if self.trash_intermediate and self.work_api != "containers":
526             raise Exception("--trash-intermediate is only supported with --api=containers.")
527
528         self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
529         if self.intermediate_output_ttl and self.work_api != "containers":
530             raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
531         if self.intermediate_output_ttl < 0:
532             raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)
533
534         if runtimeContext.submit_request_uuid and self.work_api != "containers":
535             raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))
536
537         runtimeContext = runtimeContext.copy()
538
539         default_storage_classes = ",".join([k for k,v in self.api.config().get("StorageClasses", {"default": {"Default": True}}).items() if v.get("Default") is True])
540         if runtimeContext.storage_classes == "default":
541             runtimeContext.storage_classes = default_storage_classes
542         if runtimeContext.intermediate_storage_classes == "default":
543             runtimeContext.intermediate_storage_classes = default_storage_classes
544
545         if not runtimeContext.name:
546             runtimeContext.name = self.name = updated_tool.tool.get("label") or updated_tool.metadata.get("label") or os.path.basename(updated_tool.tool["id"])
547
548         if runtimeContext.copy_deps is None and (runtimeContext.create_workflow or runtimeContext.update_workflow):
549             # When creating or updating workflow record, by default
550             # always copy dependencies and ensure Docker images are up
551             # to date.
552             runtimeContext.copy_deps = True
553             runtimeContext.match_local_docker = True
554
555         if runtimeContext.update_workflow and self.project_uuid is None:
556             # If we are updating a workflow, make sure anything that
557             # gets uploaded goes into the same parent project, unless
558             # an alternate --project-uuid was provided.
559             existing_wf = self.api.workflows().get(uuid=runtimeContext.update_workflow).execute()
560             runtimeContext.project_uuid = existing_wf["owner_uuid"]
561
562         self.project_uuid = runtimeContext.project_uuid
563
564         # Upload local file references in the job order.
565         job_order = upload_job_order(self, "%s input" % runtimeContext.name,
566                                      updated_tool, job_order, runtimeContext)
567
568         # the last clause means: if it is a command line tool, and we
569         # are going to wait for the result, and always_submit_runner
570         # is false, then we don't submit a runner process.
571
572         submitting = (runtimeContext.update_workflow or
573                       runtimeContext.create_workflow or
574                       (runtimeContext.submit and not
575                        (updated_tool.tool["class"] == "CommandLineTool" and
576                         runtimeContext.wait and
577                         not runtimeContext.always_submit_runner)))
578
579         loadingContext = self.loadingContext.copy()
580         loadingContext.do_validate = False
581         if submitting:
582             loadingContext.do_update = False
583             # Document may have been auto-updated. Reload the original
584             # document with updating disabled because we want to
585             # submit the document with its original CWL version, not
586             # the auto-updated one.
587             tool = load_tool(updated_tool.tool["id"], loadingContext)
588         else:
589             tool = updated_tool
590
591         # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
592         # Also uploads docker images.
593         merged_map = upload_workflow_deps(self, tool, runtimeContext)
594
595         # Recreate process object (ArvadosWorkflow or
596         # ArvadosCommandTool) because tool document may have been
597         # updated by upload_workflow_deps in ways that modify
598         # inheritance of hints or requirements.
599         loadingContext.loader = tool.doc_loader
600         loadingContext.avsc_names = tool.doc_schema
601         loadingContext.metadata = tool.metadata
602         tool = load_tool(tool.tool, loadingContext)
603
604         if runtimeContext.update_workflow or runtimeContext.create_workflow:
605             # Create a pipeline template or workflow record and exit.
606             if self.work_api == "containers":
607                 uuid = upload_workflow(self, tool, job_order,
608                                        runtimeContext.project_uuid,
609                                        runtimeContext,
610                                        uuid=runtimeContext.update_workflow,
611                                        submit_runner_ram=runtimeContext.submit_runner_ram,
612                                        name=runtimeContext.name,
613                                        merged_map=merged_map,
614                                        submit_runner_image=runtimeContext.submit_runner_image)
615                 self.stdout.write(uuid + "\n")
616                 return (None, "success")
617
618         self.apply_reqs(job_order, tool)
619
620         self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
621         self.eval_timeout = runtimeContext.eval_timeout
622
623         runtimeContext.use_container = True
624         runtimeContext.tmpdir_prefix = "tmp"
625         runtimeContext.work_api = self.work_api
626
627         if self.work_api == "containers":
628             if self.ignore_docker_for_reuse:
629                 raise Exception("--ignore-docker-for-reuse not supported with containers API.")
630             runtimeContext.outdir = "/var/spool/cwl"
631             runtimeContext.docker_outdir = "/var/spool/cwl"
632             runtimeContext.tmpdir = "/tmp"
633             runtimeContext.docker_tmpdir = "/tmp"
634
635         if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
636             raise Exception("--priority must be in the range 1..1000.")
637
638         if self.should_estimate_cache_size:
639             visited = set()
640             estimated_size = [0]
641             def estimate_collection_cache(obj):
642                 if obj.get("location", "").startswith("keep:"):
643                     m = pdh_size.match(obj["location"][5:])
644                     if m and m.group(1) not in visited:
645                         visited.add(m.group(1))
646                         estimated_size[0] += int(m.group(2))
647             visit_class(job_order, ("File", "Directory"), estimate_collection_cache)
648             runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256)
649             self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024)
650
651         logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size)
652
653         runnerjob = None
654         if runtimeContext.submit:
655             # Submit a runner job to run the workflow for us.
656             if self.work_api == "containers":
657                 if submitting:
658                     tool = RunnerContainer(self, updated_tool,
659                                            tool, loadingContext, runtimeContext.enable_reuse,
660                                            self.output_name,
661                                            self.output_tags,
662                                            submit_runner_ram=runtimeContext.submit_runner_ram,
663                                            name=runtimeContext.name,
664                                            on_error=runtimeContext.on_error,
665                                            submit_runner_image=runtimeContext.submit_runner_image,
666                                            intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
667                                            merged_map=merged_map,
668                                            priority=runtimeContext.priority,
669                                            secret_store=self.secret_store,
670                                            collection_cache_size=runtimeContext.collection_cache_size,
671                                            collection_cache_is_default=self.should_estimate_cache_size)
672                 else:
673                     runtimeContext.runnerjob = tool.tool["id"]
674
675         if runtimeContext.cwl_runner_job is not None:
676             self.uuid = runtimeContext.cwl_runner_job.get('uuid')
677
678         jobiter = tool.job(job_order,
679                            self.output_callback,
680                            runtimeContext)
681
682         if runtimeContext.submit and not runtimeContext.wait:
683             runnerjob = next(jobiter)
684             runnerjob.run(runtimeContext)
685             self.stdout.write(runnerjob.uuid+"\n")
686             return (None, "success")
687
688         current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
689         if current_container:
690             logger.info("Running inside container %s", current_container.get("uuid"))
691
692         self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
693         self.polling_thread = threading.Thread(target=self.poll_states)
694         self.polling_thread.start()
695
696         self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)
697
698         try:
699             self.workflow_eval_lock.acquire()
700
701             # Holds the lock while this code runs and releases it when
702             # it is safe to do so in self.workflow_eval_lock.wait(),
703             # at which point on_message can update job state and
704             # process output callbacks.
705
706             loopperf = Perf(metrics, "jobiter")
707             loopperf.__enter__()
708             for runnable in jobiter:
709                 loopperf.__exit__()
710
711                 if self.stop_polling.is_set():
712                     break
713
714                 if self.task_queue.error is not None:
715                     raise self.task_queue.error
716
717                 if runnable:
718                     with Perf(metrics, "run"):
719                         self.start_run(runnable, runtimeContext)
720                 else:
721                     if (self.task_queue.in_flight + len(self.processes)) > 0:
722                         self.workflow_eval_lock.wait(3)
723                     else:
724                         logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.")
725                         break
726
727                 if self.stop_polling.is_set():
728                     break
729
730                 loopperf.__enter__()
731             loopperf.__exit__()
732
733             while (self.task_queue.in_flight + len(self.processes)) > 0:
734                 if self.task_queue.error is not None:
735                     raise self.task_queue.error
736                 self.workflow_eval_lock.wait(3)
737
738         except UnsupportedRequirement:
739             raise
740         except:
741             if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
742                 logger.error("Interrupted, workflow will be cancelled")
743             elif isinstance(sys.exc_info()[1], WorkflowException):
744                 logger.error("Workflow execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
745             else:
746                 logger.exception("Workflow execution failed")
747
748             if self.pipeline:
749                 self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
750                                                      body={"state": "Failed"}).execute(num_retries=self.num_retries)
751
752             if self.work_api == "containers" and not current_container:
753                 # Not running in a crunch container, so cancel any outstanding processes.
754                 for p in self.processes:
755                     try:
756                         self.api.container_requests().update(uuid=p,
757                                                              body={"priority": "0"}
758                         ).execute(num_retries=self.num_retries)
759                     except Exception:
760                         pass
761         finally:
762             self.workflow_eval_lock.release()
763             self.task_queue.drain()
764             self.stop_polling.set()
765             self.polling_thread.join()
766             self.task_queue.join()
767
768         if self.final_status == "UnsupportedRequirement":
769             raise UnsupportedRequirement("Check log for details.")
770
771         if self.final_output is None:
772             raise WorkflowException("Workflow did not return a result.")
773
774         if runtimeContext.submit and isinstance(tool, Runner):
775             logger.info("Final output collection %s", tool.final_output)
776             if workbench2 or workbench1:
777                 logger.info("Output at %scollections/%s", workbench2 or workbench1, tool.final_output)
778         else:
779             if self.output_name is None:
780                 self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
781             if self.output_tags is None:
782                 self.output_tags = ""
783
784             storage_classes = ""
785             storage_class_req, _ = tool.get_requirement("http://arvados.org/cwl#OutputStorageClass")
786             if storage_class_req and storage_class_req.get("finalStorageClass"):
787                 storage_classes = aslist(storage_class_req["finalStorageClass"])
788             else:
789                 storage_classes = runtimeContext.storage_classes.strip().split(",")
790
791             self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output)
792             self.set_crunch_output()
793
794         if runtimeContext.compute_checksum:
795             adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
796             adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))
797
798         if self.trash_intermediate and self.final_status == "success":
799             self.trash_intermediate_output()
800
801         return (self.final_output, self.final_status)