9b9c2f16101bbd915a180c3585e898ae16216aa3
[arvados.git] / sdk / cwl / arvados_cwl / executor.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 from __future__ import division
6 from builtins import next
7 from builtins import object
8 from builtins import str
9 from future.utils import viewvalues
10
11 import argparse
12 import logging
13 import os
14 import sys
15 import threading
16 import copy
17 import json
18 import re
19 from functools import partial
20 import time
21
22 from cwltool.errors import WorkflowException
23 import cwltool.workflow
24 from schema_salad.sourceline import SourceLine
25 import schema_salad.validate as validate
26
27 import arvados
28 import arvados.config
29 from arvados.keep import KeepClient
30 from arvados.errors import ApiError
31
32 import arvados_cwl.util
33 from .arvcontainer import RunnerContainer
34 from .arvjob import RunnerJob, RunnerTemplate
35 from .runner import Runner, upload_docker, upload_job_order, upload_workflow_deps
36 from .arvtool import ArvadosCommandTool, validate_cluster_target, ArvadosExpressionTool
37 from .arvworkflow import ArvadosWorkflow, upload_workflow
38 from .fsaccess import CollectionFsAccess, CollectionFetcher, collectionResolver, CollectionCache, pdh_size
39 from .perf import Perf
40 from .pathmapper import NoFollowPathMapper
41 from .task_queue import TaskQueue
42 from .context import ArvLoadingContext, ArvRuntimeContext
43 from ._version import __version__
44
45 from cwltool.process import shortname, UnsupportedRequirement, use_custom_schema
46 from cwltool.pathmapper import adjustFileObjs, adjustDirObjs, get_listing, visit_class
47 from cwltool.command_line_tool import compute_checksums
48 from cwltool.load_tool import load_tool
49
# Logger used throughout this module for runner messages.
logger = logging.getLogger('arvados.cwl-runner')
# Child logger under the same namespace; not referenced in this part of the
# file (presumably used for timing/metrics output -- confirm against callers).
metrics = logging.getLogger('arvados.cwl-runner.metrics')

# Priority value that arv_executor compares runtimeContext.priority against:
# with the jobs API any other value is rejected as "not implemented".
DEFAULT_PRIORITY = 500
54
class RuntimeStatusLoggingHandler(logging.Handler):
    """
    Logging handler that forwards warnings and errors to a runtime status
    callback.

    Records below WARNING are ignored.  For everything else the callback
    receives a kind ('warning' or 'error'), a one-line summary prefixed with
    the logger name and, for multi-line messages, the remaining lines as
    detail.
    """
    def __init__(self, runtime_status_update_func):
        super(RuntimeStatusLoggingHandler, self).__init__()
        self.runtime_status_update = runtime_status_update_func
        # Re-entrancy guard: set while a status update is in flight so that
        # anything logged by the update itself is not reported again.
        self.updatingRuntimeStatus = False

    def emit(self, record):
        """Report `record` through the callback if it is a warning or error."""
        if record.levelno >= logging.ERROR:
            kind = 'error'
        elif record.levelno >= logging.WARNING:
            kind = 'warning'
        else:
            return
        if self.updatingRuntimeStatus is True:
            return

        self.updatingRuntimeStatus = True
        try:
            # The first line becomes the status; anything after the first
            # newline is passed along as detail.
            headline, newline, remainder = record.getMessage().partition('\n')
            summary = "%s: %s" % (record.name, headline)
            if newline:
                self.runtime_status_update(kind, summary, remainder)
            else:
                self.runtime_status_update(kind, summary)
        finally:
            self.updatingRuntimeStatus = False
91
92
93 class ArvCwlExecutor(object):
94     """Execute a CWL tool or workflow, submit work (using either jobs or
95     containers API), wait for them to complete, and report output.
96
97     """
98
99     def __init__(self, api_client,
100                  arvargs=None,
101                  keep_client=None,
102                  num_retries=4,
103                  thread_count=4):
104
105         if arvargs is None:
106             arvargs = argparse.Namespace()
107             arvargs.work_api = None
108             arvargs.output_name = None
109             arvargs.output_tags = None
110             arvargs.thread_count = 1
111             arvargs.collection_cache_size = None
112
113         self.api = api_client
114         self.processes = {}
115         self.workflow_eval_lock = threading.Condition(threading.RLock())
116         self.final_output = None
117         self.final_status = None
118         self.num_retries = num_retries
119         self.uuid = None
120         self.stop_polling = threading.Event()
121         self.poll_api = None
122         self.pipeline = None
123         self.final_output_collection = None
124         self.output_name = arvargs.output_name
125         self.output_tags = arvargs.output_tags
126         self.project_uuid = None
127         self.intermediate_output_ttl = 0
128         self.intermediate_output_collections = []
129         self.trash_intermediate = False
130         self.thread_count = arvargs.thread_count
131         self.poll_interval = 12
132         self.loadingContext = None
133         self.should_estimate_cache_size = True
134
135         if keep_client is not None:
136             self.keep_client = keep_client
137         else:
138             self.keep_client = arvados.keep.KeepClient(api_client=self.api, num_retries=self.num_retries)
139
140         if arvargs.collection_cache_size:
141             collection_cache_size = arvargs.collection_cache_size*1024*1024
142             self.should_estimate_cache_size = False
143         else:
144             collection_cache_size = 256*1024*1024
145
146         self.collection_cache = CollectionCache(self.api, self.keep_client, self.num_retries,
147                                                 cap=collection_cache_size)
148
149         self.fetcher_constructor = partial(CollectionFetcher,
150                                            api_client=self.api,
151                                            fs_access=CollectionFsAccess("", collection_cache=self.collection_cache),
152                                            num_retries=self.num_retries)
153
154         self.work_api = None
155         expected_api = ["jobs", "containers"]
156         for api in expected_api:
157             try:
158                 methods = self.api._rootDesc.get('resources')[api]['methods']
159                 if ('httpMethod' in methods['create'] and
160                     (arvargs.work_api == api or arvargs.work_api is None)):
161                     self.work_api = api
162                     break
163             except KeyError:
164                 pass
165
166         if not self.work_api:
167             if arvargs.work_api is None:
168                 raise Exception("No supported APIs")
169             else:
170                 raise Exception("Unsupported API '%s', expected one of %s" % (arvargs.work_api, expected_api))
171
172         if self.work_api == "jobs":
173             logger.warning("""
174 *******************************
175 Using the deprecated 'jobs' API.
176
177 To get rid of this warning:
178
179 Users: read about migrating at
180 http://doc.arvados.org/user/cwl/cwl-style.html#migrate
181 and use the option --api=containers
182
183 Admins: configure the cluster to disable the 'jobs' API as described at:
184 http://doc.arvados.org/install/install-api-server.html#disable_api_methods
185 *******************************""")
186
187         self.loadingContext = ArvLoadingContext(vars(arvargs))
188         self.loadingContext.fetcher_constructor = self.fetcher_constructor
189         self.loadingContext.resolver = partial(collectionResolver, self.api, num_retries=self.num_retries)
190         self.loadingContext.construct_tool_object = self.arv_make_tool
191
192         # Add a custom logging handler to the root logger for runtime status reporting
193         # if running inside a container
194         if arvados_cwl.util.get_current_container(self.api, self.num_retries, logger):
195             root_logger = logging.getLogger('')
196
197             # Remove existing RuntimeStatusLoggingHandlers if they exist
198             handlers = [h for h in root_logger.handlers if not isinstance(h, RuntimeStatusLoggingHandler)]
199             root_logger.handlers = handlers
200
201             handler = RuntimeStatusLoggingHandler(self.runtime_status_update)
202             root_logger.addHandler(handler)
203
204         self.runtimeContext = ArvRuntimeContext(vars(arvargs))
205         self.runtimeContext.make_fs_access = partial(CollectionFsAccess,
206                                                      collection_cache=self.collection_cache)
207
208         validate_cluster_target(self, self.runtimeContext)
209
210
211     def arv_make_tool(self, toolpath_object, loadingContext):
212         if "class" in toolpath_object and toolpath_object["class"] == "CommandLineTool":
213             return ArvadosCommandTool(self, toolpath_object, loadingContext)
214         elif "class" in toolpath_object and toolpath_object["class"] == "Workflow":
215             return ArvadosWorkflow(self, toolpath_object, loadingContext)
216         elif "class" in toolpath_object and toolpath_object["class"] == "ExpressionTool":
217             return ArvadosExpressionTool(self, toolpath_object, loadingContext)
218         else:
219             raise Exception("Unknown tool %s" % toolpath_object.get("class"))
220
221     def output_callback(self, out, processStatus):
222         with self.workflow_eval_lock:
223             if processStatus == "success":
224                 logger.info("Overall process status is %s", processStatus)
225                 state = "Complete"
226             else:
227                 logger.error("Overall process status is %s", processStatus)
228                 state = "Failed"
229             if self.pipeline:
230                 self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
231                                                         body={"state": state}).execute(num_retries=self.num_retries)
232             self.final_status = processStatus
233             self.final_output = out
234             self.workflow_eval_lock.notifyAll()
235
236
237     def start_run(self, runnable, runtimeContext):
238         self.task_queue.add(partial(runnable.run, runtimeContext),
239                             self.workflow_eval_lock, self.stop_polling)
240
241     def process_submitted(self, container):
242         with self.workflow_eval_lock:
243             self.processes[container.uuid] = container
244
245     def process_done(self, uuid, record):
246         with self.workflow_eval_lock:
247             j = self.processes[uuid]
248             logger.info("%s %s is %s", self.label(j), uuid, record["state"])
249             self.task_queue.add(partial(j.done, record),
250                                 self.workflow_eval_lock, self.stop_polling)
251             del self.processes[uuid]
252
253     def runtime_status_update(self, kind, message, detail=None):
254         """
255         Updates the runtime_status field on the runner container.
256         Called when there's a need to report errors, warnings or just
257         activity statuses, for example in the RuntimeStatusLoggingHandler.
258         """
259         with self.workflow_eval_lock:
260             current = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
261             if current is None:
262                 return
263             runtime_status = current.get('runtime_status', {})
264             # In case of status being an error, only report the first one.
265             if kind == 'error':
266                 if not runtime_status.get('error'):
267                     runtime_status.update({
268                         'error': message
269                     })
270                     if detail is not None:
271                         runtime_status.update({
272                             'errorDetail': detail
273                         })
274                 # Further errors are only mentioned as a count.
275                 else:
276                     # Get anything before an optional 'and N more' string.
277                     try:
278                         error_msg = re.match(
279                             r'^(.*?)(?=\s*\(and \d+ more\)|$)', runtime_status.get('error')).groups()[0]
280                         more_failures = re.match(
281                             r'.*\(and (\d+) more\)', runtime_status.get('error'))
282                     except TypeError:
283                         # Ignore tests stubbing errors
284                         return
285                     if more_failures:
286                         failure_qty = int(more_failures.groups()[0])
287                         runtime_status.update({
288                             'error': "%s (and %d more)" % (error_msg, failure_qty+1)
289                         })
290                     else:
291                         runtime_status.update({
292                             'error': "%s (and 1 more)" % error_msg
293                         })
294             elif kind in ['warning', 'activity']:
295                 # Record the last warning/activity status without regard of
296                 # previous occurences.
297                 runtime_status.update({
298                     kind: message
299                 })
300                 if detail is not None:
301                     runtime_status.update({
302                         kind+"Detail": detail
303                     })
304             else:
305                 # Ignore any other status kind
306                 return
307             try:
308                 self.api.containers().update(uuid=current['uuid'],
309                                             body={
310                                                 'runtime_status': runtime_status,
311                                             }).execute(num_retries=self.num_retries)
312             except Exception as e:
313                 logger.info("Couldn't update runtime_status: %s", e)
314
315     def wrapped_callback(self, cb, obj, st):
316         with self.workflow_eval_lock:
317             cb(obj, st)
318             self.workflow_eval_lock.notifyAll()
319
320     def get_wrapped_callback(self, cb):
321         return partial(self.wrapped_callback, cb)
322
323     def on_message(self, event):
324         if event.get("object_uuid") in self.processes and event["event_type"] == "update":
325             uuid = event["object_uuid"]
326             if event["properties"]["new_attributes"]["state"] == "Running":
327                 with self.workflow_eval_lock:
328                     j = self.processes[uuid]
329                     if j.running is False:
330                         j.running = True
331                         j.update_pipeline_component(event["properties"]["new_attributes"])
332                         logger.info("%s %s is Running", self.label(j), uuid)
333             elif event["properties"]["new_attributes"]["state"] in ("Complete", "Failed", "Cancelled", "Final"):
334                 self.process_done(uuid, event["properties"]["new_attributes"])
335
336     def label(self, obj):
337         return "[%s %s]" % (self.work_api[0:-1], obj.name)
338
339     def poll_states(self):
340         """Poll status of jobs or containers listed in the processes dict.
341
342         Runs in a separate thread.
343         """
344
345         try:
346             remain_wait = self.poll_interval
347             while True:
348                 if remain_wait > 0:
349                     self.stop_polling.wait(remain_wait)
350                 if self.stop_polling.is_set():
351                     break
352                 with self.workflow_eval_lock:
353                     keys = list(self.processes)
354                 if not keys:
355                     remain_wait = self.poll_interval
356                     continue
357
358                 begin_poll = time.time()
359                 if self.work_api == "containers":
360                     table = self.poll_api.container_requests()
361                 elif self.work_api == "jobs":
362                     table = self.poll_api.jobs()
363
364                 pageSize = self.poll_api._rootDesc.get('maxItemsPerResponse', 1000)
365
366                 while keys:
367                     page = keys[:pageSize]
368                     try:
369                         proc_states = table.list(filters=[["uuid", "in", page]]).execute(num_retries=self.num_retries)
370                     except Exception:
371                         logger.exception("Error checking states on API server: %s")
372                         remain_wait = self.poll_interval
373                         continue
374
375                     for p in proc_states["items"]:
376                         self.on_message({
377                             "object_uuid": p["uuid"],
378                             "event_type": "update",
379                             "properties": {
380                                 "new_attributes": p
381                             }
382                         })
383                     keys = keys[pageSize:]
384
385                 finish_poll = time.time()
386                 remain_wait = self.poll_interval - (finish_poll - begin_poll)
387         except:
388             logger.exception("Fatal error in state polling thread.")
389             with self.workflow_eval_lock:
390                 self.processes.clear()
391                 self.workflow_eval_lock.notifyAll()
392         finally:
393             self.stop_polling.set()
394
395     def add_intermediate_output(self, uuid):
396         if uuid:
397             self.intermediate_output_collections.append(uuid)
398
399     def trash_intermediate_output(self):
400         logger.info("Cleaning up intermediate output collections")
401         for i in self.intermediate_output_collections:
402             try:
403                 self.api.collections().delete(uuid=i).execute(num_retries=self.num_retries)
404             except Exception:
405                 logger.warning("Failed to delete intermediate output: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
406             except (KeyboardInterrupt, SystemExit):
407                 break
408
409     def check_features(self, obj):
410         if isinstance(obj, dict):
411             if obj.get("writable") and self.work_api != "containers":
412                 raise SourceLine(obj, "writable", UnsupportedRequirement).makeError("InitialWorkDir feature 'writable: true' not supported with --api=jobs")
413             if obj.get("class") == "DockerRequirement":
414                 if obj.get("dockerOutputDirectory"):
415                     if self.work_api != "containers":
416                         raise SourceLine(obj, "dockerOutputDirectory", UnsupportedRequirement).makeError(
417                             "Option 'dockerOutputDirectory' of DockerRequirement not supported with --api=jobs.")
418                     if not obj.get("dockerOutputDirectory").startswith('/'):
419                         raise SourceLine(obj, "dockerOutputDirectory", validate.ValidationException).makeError(
420                             "Option 'dockerOutputDirectory' must be an absolute path.")
421             if obj.get("class") == "http://commonwl.org/cwltool#Secrets" and self.work_api != "containers":
422                 raise SourceLine(obj, "class", UnsupportedRequirement).makeError("Secrets not supported with --api=jobs")
423             for v in viewvalues(obj):
424                 self.check_features(v)
425         elif isinstance(obj, list):
426             for i,v in enumerate(obj):
427                 with SourceLine(obj, i, UnsupportedRequirement, logger.isEnabledFor(logging.DEBUG)):
428                     self.check_features(v)
429
430     def make_output_collection(self, name, storage_classes, tagsString, outputObj):
431         outputObj = copy.deepcopy(outputObj)
432
433         files = []
434         def capture(fileobj):
435             files.append(fileobj)
436
437         adjustDirObjs(outputObj, capture)
438         adjustFileObjs(outputObj, capture)
439
440         generatemapper = NoFollowPathMapper(files, "", "", separateDirs=False)
441
442         final = arvados.collection.Collection(api_client=self.api,
443                                               keep_client=self.keep_client,
444                                               num_retries=self.num_retries)
445
446         for k,v in generatemapper.items():
447             if v.type == "Directory" and v.resolved.startswith("_:"):
448                     continue
449             if v.type == "CreateFile" and (k.startswith("_:") or v.resolved.startswith("_:")):
450                 with final.open(v.target, "wb") as f:
451                     f.write(v.resolved.encode("utf-8"))
452                     continue
453
454             if not v.resolved.startswith("keep:"):
455                 raise Exception("Output source is not in keep or a literal")
456             sp = v.resolved.split("/")
457             srccollection = sp[0][5:]
458             try:
459                 reader = self.collection_cache.get(srccollection)
460                 srcpath = "/".join(sp[1:]) if len(sp) > 1 else "."
461                 final.copy(srcpath, v.target, source_collection=reader, overwrite=False)
462             except arvados.errors.ArgumentError as e:
463                 logger.error("Creating CollectionReader for '%s' '%s': %s", k, v, e)
464                 raise
465             except IOError as e:
466                 logger.error("While preparing output collection: %s", e)
467                 raise
468
469         def rewrite(fileobj):
470             fileobj["location"] = generatemapper.mapper(fileobj["location"]).target
471             for k in ("listing", "contents", "nameext", "nameroot", "dirname"):
472                 if k in fileobj:
473                     del fileobj[k]
474
475         adjustDirObjs(outputObj, rewrite)
476         adjustFileObjs(outputObj, rewrite)
477
478         with final.open("cwl.output.json", "w") as f:
479             res = str(json.dumps(outputObj, sort_keys=True, indent=4, separators=(',',': '), ensure_ascii=False))
480             f.write(res)
481
482         final.save_new(name=name, owner_uuid=self.project_uuid, storage_classes=storage_classes, ensure_unique_name=True)
483
484         logger.info("Final output collection %s \"%s\" (%s)", final.portable_data_hash(),
485                     final.api_response()["name"],
486                     final.manifest_locator())
487
488         final_uuid = final.manifest_locator()
489         tags = tagsString.split(',')
490         for tag in tags:
491              self.api.links().create(body={
492                 "head_uuid": final_uuid, "link_class": "tag", "name": tag
493                 }).execute(num_retries=self.num_retries)
494
495         def finalcollection(fileobj):
496             fileobj["location"] = "keep:%s/%s" % (final.portable_data_hash(), fileobj["location"])
497
498         adjustDirObjs(outputObj, finalcollection)
499         adjustFileObjs(outputObj, finalcollection)
500
501         return (outputObj, final)
502
503     def set_crunch_output(self):
504         if self.work_api == "containers":
505             current = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
506             if current is None:
507                 return
508             try:
509                 self.api.containers().update(uuid=current['uuid'],
510                                              body={
511                                                  'output': self.final_output_collection.portable_data_hash(),
512                                              }).execute(num_retries=self.num_retries)
513                 self.api.collections().update(uuid=self.final_output_collection.manifest_locator(),
514                                               body={
515                                                   'is_trashed': True
516                                               }).execute(num_retries=self.num_retries)
517             except Exception:
518                 logger.exception("Setting container output")
519                 return
520         elif self.work_api == "jobs" and "TASK_UUID" in os.environ:
521             self.api.job_tasks().update(uuid=os.environ["TASK_UUID"],
522                                    body={
523                                        'output': self.final_output_collection.portable_data_hash(),
524                                        'success': self.final_status == "success",
525                                        'progress':1.0
526                                    }).execute(num_retries=self.num_retries)
527
528     def arv_executor(self, tool, job_order, runtimeContext, logger=None):
529         self.debug = runtimeContext.debug
530
531         tool.visit(self.check_features)
532
533         self.project_uuid = runtimeContext.project_uuid
534         self.pipeline = None
535         self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
536         self.secret_store = runtimeContext.secret_store
537
538         self.trash_intermediate = runtimeContext.trash_intermediate
539         if self.trash_intermediate and self.work_api != "containers":
540             raise Exception("--trash-intermediate is only supported with --api=containers.")
541
542         self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
543         if self.intermediate_output_ttl and self.work_api != "containers":
544             raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
545         if self.intermediate_output_ttl < 0:
546             raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)
547
548         if runtimeContext.submit_request_uuid and self.work_api != "containers":
549             raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))
550
551         if not runtimeContext.name:
552             runtimeContext.name = self.name = tool.tool.get("label") or tool.metadata.get("label") or os.path.basename(tool.tool["id"])
553
554         # Upload local file references in the job order.
555         job_order = upload_job_order(self, "%s input" % runtimeContext.name,
556                                      tool, job_order)
557
558         submitting = (runtimeContext.update_workflow or
559                       runtimeContext.create_workflow or
560                       (runtimeContext.submit and not
561                        (tool.tool["class"] == "CommandLineTool" and
562                         runtimeContext.wait and
563                         not runtimeContext.always_submit_runner)))
564
565         loadingContext = self.loadingContext.copy()
566         loadingContext.do_validate = False
567         loadingContext.do_update = False
568         if submitting:
569             # Document may have been auto-updated. Reload the original
570             # document with updating disabled because we want to
571             # submit the original document, not the auto-updated one.
572             tool = load_tool(tool.tool["id"], loadingContext)
573
574         # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
575         # Also uploads docker images.
576         merged_map = upload_workflow_deps(self, tool)
577
578         # Recreate process object (ArvadosWorkflow or
579         # ArvadosCommandTool) because tool document may have been
580         # updated by upload_workflow_deps in ways that modify
581         # inheritance of hints or requirements.
582         loadingContext.loader = tool.doc_loader
583         loadingContext.avsc_names = tool.doc_schema
584         loadingContext.metadata = tool.metadata
585         tool = load_tool(tool.tool, loadingContext)
586
587         existing_uuid = runtimeContext.update_workflow
588         if existing_uuid or runtimeContext.create_workflow:
589             # Create a pipeline template or workflow record and exit.
590             if self.work_api == "jobs":
591                 tmpl = RunnerTemplate(self, tool, job_order,
592                                       runtimeContext.enable_reuse,
593                                       uuid=existing_uuid,
594                                       submit_runner_ram=runtimeContext.submit_runner_ram,
595                                       name=runtimeContext.name,
596                                       merged_map=merged_map,
597                                       loadingContext=loadingContext)
598                 tmpl.save()
599                 # cwltool.main will write our return value to stdout.
600                 return (tmpl.uuid, "success")
601             elif self.work_api == "containers":
602                 return (upload_workflow(self, tool, job_order,
603                                         self.project_uuid,
604                                         uuid=existing_uuid,
605                                         submit_runner_ram=runtimeContext.submit_runner_ram,
606                                         name=runtimeContext.name,
607                                         merged_map=merged_map),
608                         "success")
609
610         self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
611         self.eval_timeout = runtimeContext.eval_timeout
612
613         runtimeContext = runtimeContext.copy()
614         runtimeContext.use_container = True
615         runtimeContext.tmpdir_prefix = "tmp"
616         runtimeContext.work_api = self.work_api
617
618         if self.work_api == "containers":
619             if self.ignore_docker_for_reuse:
620                 raise Exception("--ignore-docker-for-reuse not supported with containers API.")
621             runtimeContext.outdir = "/var/spool/cwl"
622             runtimeContext.docker_outdir = "/var/spool/cwl"
623             runtimeContext.tmpdir = "/tmp"
624             runtimeContext.docker_tmpdir = "/tmp"
625         elif self.work_api == "jobs":
626             if runtimeContext.priority != DEFAULT_PRIORITY:
627                 raise Exception("--priority not implemented for jobs API.")
628             runtimeContext.outdir = "$(task.outdir)"
629             runtimeContext.docker_outdir = "$(task.outdir)"
630             runtimeContext.tmpdir = "$(task.tmpdir)"
631
632         if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
633             raise Exception("--priority must be in the range 1..1000.")
634
635         if self.should_estimate_cache_size:
636             visited = set()
637             estimated_size = [0]
638             def estimate_collection_cache(obj):
639                 if obj.get("location", "").startswith("keep:"):
640                     m = pdh_size.match(obj["location"][5:])
641                     if m and m.group(1) not in visited:
642                         visited.add(m.group(1))
643                         estimated_size[0] += int(m.group(2))
644             visit_class(job_order, ("File", "Directory"), estimate_collection_cache)
645             runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256)
646             self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024)
647
648         logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size)
649
650         runnerjob = None
651         if runtimeContext.submit:
652             # Submit a runner job to run the workflow for us.
653             if self.work_api == "containers":
654                 if tool.tool["class"] == "CommandLineTool" and runtimeContext.wait and (not runtimeContext.always_submit_runner):
655                     runtimeContext.runnerjob = tool.tool["id"]
656                 else:
657                     tool = RunnerContainer(self, tool, loadingContext, runtimeContext.enable_reuse,
658                                                 self.output_name,
659                                                 self.output_tags,
660                                                 submit_runner_ram=runtimeContext.submit_runner_ram,
661                                                 name=runtimeContext.name,
662                                                 on_error=runtimeContext.on_error,
663                                                 submit_runner_image=runtimeContext.submit_runner_image,
664                                                 intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
665                                                 merged_map=merged_map,
666                                                 priority=runtimeContext.priority,
667                                                 secret_store=self.secret_store,
668                                                 collection_cache_size=runtimeContext.collection_cache_size,
669                                                 collection_cache_is_default=self.should_estimate_cache_size)
670             elif self.work_api == "jobs":
671                 tool = RunnerJob(self, tool, loadingContext, runtimeContext.enable_reuse,
672                                       self.output_name,
673                                       self.output_tags,
674                                       submit_runner_ram=runtimeContext.submit_runner_ram,
675                                       name=runtimeContext.name,
676                                       on_error=runtimeContext.on_error,
677                                       submit_runner_image=runtimeContext.submit_runner_image,
678                                       merged_map=merged_map)
679         elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs":
680             # Create pipeline for local run
681             self.pipeline = self.api.pipeline_instances().create(
682                 body={
683                     "owner_uuid": self.project_uuid,
684                     "name": runtimeContext.name if runtimeContext.name else shortname(tool.tool["id"]),
685                     "components": {},
686                     "state": "RunningOnClient"}).execute(num_retries=self.num_retries)
687             logger.info("Pipeline instance %s", self.pipeline["uuid"])
688
689         if runtimeContext.cwl_runner_job is not None:
690             self.uuid = runtimeContext.cwl_runner_job.get('uuid')
691
692         jobiter = tool.job(job_order,
693                            self.output_callback,
694                            runtimeContext)
695
696         if runtimeContext.submit and not runtimeContext.wait:
697             runnerjob = next(jobiter)
698             runnerjob.run(runtimeContext)
699             return (runnerjob.uuid, "success")
700
701         current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
702         if current_container:
703             logger.info("Running inside container %s", current_container.get("uuid"))
704
705         self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
706         self.polling_thread = threading.Thread(target=self.poll_states)
707         self.polling_thread.start()
708
709         self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)
710
711         try:
712             self.workflow_eval_lock.acquire()
713
714             # Holds the lock while this code runs and releases it when
715             # it is safe to do so in self.workflow_eval_lock.wait(),
716             # at which point on_message can update job state and
717             # process output callbacks.
718
719             loopperf = Perf(metrics, "jobiter")
720             loopperf.__enter__()
721             for runnable in jobiter:
722                 loopperf.__exit__()
723
724                 if self.stop_polling.is_set():
725                     break
726
727                 if self.task_queue.error is not None:
728                     raise self.task_queue.error
729
730                 if runnable:
731                     with Perf(metrics, "run"):
732                         self.start_run(runnable, runtimeContext)
733                 else:
734                     if (self.task_queue.in_flight + len(self.processes)) > 0:
735                         self.workflow_eval_lock.wait(3)
736                     else:
737                         logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.")
738                         break
739
740                 if self.stop_polling.is_set():
741                     break
742
743                 loopperf.__enter__()
744             loopperf.__exit__()
745
746             while (self.task_queue.in_flight + len(self.processes)) > 0:
747                 if self.task_queue.error is not None:
748                     raise self.task_queue.error
749                 self.workflow_eval_lock.wait(3)
750
751         except UnsupportedRequirement:
752             raise
753         except:
754             if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
755                 logger.error("Interrupted, workflow will be cancelled")
756             elif isinstance(sys.exc_info()[1], WorkflowException):
757                 logger.error("Workflow execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
758             else:
759                 logger.exception("Workflow execution failed")
760
761             if self.pipeline:
762                 self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
763                                                      body={"state": "Failed"}).execute(num_retries=self.num_retries)
764             if runtimeContext.submit and isinstance(tool, Runner):
765                 runnerjob = tool
766                 if runnerjob.uuid and self.work_api == "containers":
767                     self.api.container_requests().update(uuid=runnerjob.uuid,
768                                                      body={"priority": "0"}).execute(num_retries=self.num_retries)
769         finally:
770             self.workflow_eval_lock.release()
771             self.task_queue.drain()
772             self.stop_polling.set()
773             self.polling_thread.join()
774             self.task_queue.join()
775
776         if self.final_status == "UnsupportedRequirement":
777             raise UnsupportedRequirement("Check log for details.")
778
779         if self.final_output is None:
780             raise WorkflowException("Workflow did not return a result.")
781
782         if runtimeContext.submit and isinstance(tool, Runner):
783             logger.info("Final output collection %s", tool.final_output)
784         else:
785             if self.output_name is None:
786                 self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
787             if self.output_tags is None:
788                 self.output_tags = ""
789
790             storage_classes = runtimeContext.storage_classes.strip().split(",")
791             self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output)
792             self.set_crunch_output()
793
794         if runtimeContext.compute_checksum:
795             adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
796             adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))
797
798         if self.trash_intermediate and self.final_status == "success":
799             self.trash_intermediate_output()
800
801         return (self.final_output, self.final_status)