13306: Improved py2/3 str compatibility
[arvados.git] / sdk / cwl / arvados_cwl / executor.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 from __future__ import division
6 from builtins import next
7 from builtins import object
8 from builtins import str
9 from future.utils import viewvalues
10
11 import argparse
12 import logging
13 import os
14 import sys
15 import threading
16 import copy
17 import json
18 import re
19 from functools import partial
20 import time
21
22 from cwltool.errors import WorkflowException
23 import cwltool.workflow
24 from schema_salad.sourceline import SourceLine
25 import schema_salad.validate as validate
26
27 import arvados
28 import arvados.config
29 from arvados.keep import KeepClient
30 from arvados.errors import ApiError
31
32 import arvados_cwl.util
33 from .arvcontainer import RunnerContainer
34 from .arvjob import RunnerJob, RunnerTemplate
35 from .runner import Runner, upload_docker, upload_job_order, upload_workflow_deps
36 from .arvtool import ArvadosCommandTool, validate_cluster_target, ArvadosExpressionTool
37 from .arvworkflow import ArvadosWorkflow, upload_workflow
38 from .fsaccess import CollectionFsAccess, CollectionFetcher, collectionResolver, CollectionCache, pdh_size
39 from .perf import Perf
40 from .pathmapper import NoFollowPathMapper
41 from .task_queue import TaskQueue
42 from .context import ArvLoadingContext, ArvRuntimeContext
43 from ._version import __version__
44
45 from cwltool.process import shortname, UnsupportedRequirement, use_custom_schema
46 from cwltool.pathmapper import adjustFileObjs, adjustDirObjs, get_listing, visit_class
47 from cwltool.command_line_tool import compute_checksums
48
# Logger used by this module for status, warning and error messages.
logger = logging.getLogger('arvados.cwl-runner')
# Child logger named for metrics output.
metrics = logging.getLogger('arvados.cwl-runner.metrics')

# Priority value compared against runtimeContext.priority; with the jobs
# API any other value is rejected as "--priority not implemented".
DEFAULT_PRIORITY = 500
53
class RuntimeStatusLoggingHandler(logging.Handler):
    """
    Logging handler that forwards warnings and errors to a callback so they
    can be reported as runtime statuses on runner containers.
    """
    def __init__(self, runtime_status_update_func):
        super(RuntimeStatusLoggingHandler, self).__init__()
        # Callable invoked as func(kind, message[, detail]).
        self.runtime_status_update = runtime_status_update_func

    def emit(self, record):
        """Report *record* through the callback when it is a warning or worse.

        Records below WARNING are ignored.  For a multi-line message the
        first line becomes the status text and the remainder is passed as
        the detail argument.
        """
        if record.levelno >= logging.ERROR:
            kind = 'error'
        elif record.levelno >= logging.WARNING:
            kind = 'warning'
        else:
            return

        text = record.getMessage()
        if '\n' not in text:
            self.runtime_status_update(kind, "%s: %s" % (record.name, text))
            return

        # Multi-line message: headline goes in the status, the rest is detail.
        headline, _, remainder = text.partition('\n')
        self.runtime_status_update(kind,
                                   "%s: %s" % (record.name, headline),
                                   remainder)

85
86 class ArvCwlExecutor(object):
87     """Execute a CWL tool or workflow, submit work (using either jobs or
88     containers API), wait for them to complete, and report output.
89
90     """
91
92     def __init__(self, api_client,
93                  arvargs=None,
94                  keep_client=None,
95                  num_retries=4,
96                  thread_count=4):
97
98         if arvargs is None:
99             arvargs = argparse.Namespace()
100             arvargs.work_api = None
101             arvargs.output_name = None
102             arvargs.output_tags = None
103             arvargs.thread_count = 1
104             arvargs.collection_cache_size = None
105
106         self.api = api_client
107         self.processes = {}
108         self.workflow_eval_lock = threading.Condition(threading.RLock())
109         self.final_output = None
110         self.final_status = None
111         self.num_retries = num_retries
112         self.uuid = None
113         self.stop_polling = threading.Event()
114         self.poll_api = None
115         self.pipeline = None
116         self.final_output_collection = None
117         self.output_name = arvargs.output_name
118         self.output_tags = arvargs.output_tags
119         self.project_uuid = None
120         self.intermediate_output_ttl = 0
121         self.intermediate_output_collections = []
122         self.trash_intermediate = False
123         self.thread_count = arvargs.thread_count
124         self.poll_interval = 12
125         self.loadingContext = None
126         self.should_estimate_cache_size = True
127
128         if keep_client is not None:
129             self.keep_client = keep_client
130         else:
131             self.keep_client = arvados.keep.KeepClient(api_client=self.api, num_retries=self.num_retries)
132
133         if arvargs.collection_cache_size:
134             collection_cache_size = arvargs.collection_cache_size*1024*1024
135             self.should_estimate_cache_size = False
136         else:
137             collection_cache_size = 256*1024*1024
138
139         self.collection_cache = CollectionCache(self.api, self.keep_client, self.num_retries,
140                                                 cap=collection_cache_size)
141
142         self.fetcher_constructor = partial(CollectionFetcher,
143                                            api_client=self.api,
144                                            fs_access=CollectionFsAccess("", collection_cache=self.collection_cache),
145                                            num_retries=self.num_retries)
146
147         self.work_api = None
148         expected_api = ["jobs", "containers"]
149         for api in expected_api:
150             try:
151                 methods = self.api._rootDesc.get('resources')[api]['methods']
152                 if ('httpMethod' in methods['create'] and
153                     (arvargs.work_api == api or arvargs.work_api is None)):
154                     self.work_api = api
155                     break
156             except KeyError:
157                 pass
158
159         if not self.work_api:
160             if arvargs.work_api is None:
161                 raise Exception("No supported APIs")
162             else:
163                 raise Exception("Unsupported API '%s', expected one of %s" % (arvargs.work_api, expected_api))
164
165         if self.work_api == "jobs":
166             logger.warning("""
167 *******************************
168 Using the deprecated 'jobs' API.
169
170 To get rid of this warning:
171
172 Users: read about migrating at
173 http://doc.arvados.org/user/cwl/cwl-style.html#migrate
174 and use the option --api=containers
175
176 Admins: configure the cluster to disable the 'jobs' API as described at:
177 http://doc.arvados.org/install/install-api-server.html#disable_api_methods
178 *******************************""")
179
180         self.loadingContext = ArvLoadingContext(vars(arvargs))
181         self.loadingContext.fetcher_constructor = self.fetcher_constructor
182         self.loadingContext.resolver = partial(collectionResolver, self.api, num_retries=self.num_retries)
183         self.loadingContext.construct_tool_object = self.arv_make_tool
184
185         # Add a custom logging handler to the root logger for runtime status reporting
186         # if running inside a container
187         if arvados_cwl.util.get_current_container(self.api, self.num_retries, logger):
188             root_logger = logging.getLogger('')
189
190             # Remove existing RuntimeStatusLoggingHandlers if they exist
191             handlers = [h for h in root_logger.handlers if not isinstance(h, RuntimeStatusLoggingHandler)]
192             root_logger.handlers = handlers
193
194             handler = RuntimeStatusLoggingHandler(self.runtime_status_update)
195             root_logger.addHandler(handler)
196
197         self.runtimeContext = ArvRuntimeContext(vars(arvargs))
198         self.runtimeContext.make_fs_access = partial(CollectionFsAccess,
199                                                      collection_cache=self.collection_cache)
200
201         validate_cluster_target(self, self.runtimeContext)
202
203
204     def arv_make_tool(self, toolpath_object, loadingContext):
205         if "class" in toolpath_object and toolpath_object["class"] == "CommandLineTool":
206             return ArvadosCommandTool(self, toolpath_object, loadingContext)
207         elif "class" in toolpath_object and toolpath_object["class"] == "Workflow":
208             return ArvadosWorkflow(self, toolpath_object, loadingContext)
209         elif "class" in toolpath_object and toolpath_object["class"] == "ExpressionTool":
210             return ArvadosExpressionTool(self, toolpath_object, loadingContext)
211         else:
212             raise Exception("Unknown tool %s" % toolpath_object.get("class"))
213
214     def output_callback(self, out, processStatus):
215         with self.workflow_eval_lock:
216             if processStatus == "success":
217                 logger.info("Overall process status is %s", processStatus)
218                 state = "Complete"
219             else:
220                 logger.error("Overall process status is %s", processStatus)
221                 state = "Failed"
222             if self.pipeline:
223                 self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
224                                                         body={"state": state}).execute(num_retries=self.num_retries)
225             self.final_status = processStatus
226             self.final_output = out
227             self.workflow_eval_lock.notifyAll()
228
229
230     def start_run(self, runnable, runtimeContext):
231         self.task_queue.add(partial(runnable.run, runtimeContext),
232                             self.workflow_eval_lock, self.stop_polling)
233
234     def process_submitted(self, container):
235         with self.workflow_eval_lock:
236             self.processes[container.uuid] = container
237
238     def process_done(self, uuid, record):
239         with self.workflow_eval_lock:
240             j = self.processes[uuid]
241             logger.info("%s %s is %s", self.label(j), uuid, record["state"])
242             self.task_queue.add(partial(j.done, record),
243                                 self.workflow_eval_lock, self.stop_polling)
244             del self.processes[uuid]
245
246     def runtime_status_update(self, kind, message, detail=None):
247         """
248         Updates the runtime_status field on the runner container.
249         Called when there's a need to report errors, warnings or just
250         activity statuses, for example in the RuntimeStatusLoggingHandler.
251         """
252         with self.workflow_eval_lock:
253             current = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
254             if current is None:
255                 return
256             runtime_status = current.get('runtime_status', {})
257             # In case of status being an error, only report the first one.
258             if kind == 'error':
259                 if not runtime_status.get('error'):
260                     runtime_status.update({
261                         'error': message
262                     })
263                     if detail is not None:
264                         runtime_status.update({
265                             'errorDetail': detail
266                         })
267                 # Further errors are only mentioned as a count.
268                 else:
269                     # Get anything before an optional 'and N more' string.
270                     try:
271                         error_msg = re.match(
272                             r'^(.*?)(?=\s*\(and \d+ more\)|$)', runtime_status.get('error')).groups()[0]
273                         more_failures = re.match(
274                             r'.*\(and (\d+) more\)', runtime_status.get('error'))
275                     except TypeError:
276                         # Ignore tests stubbing errors
277                         return
278                     if more_failures:
279                         failure_qty = int(more_failures.groups()[0])
280                         runtime_status.update({
281                             'error': "%s (and %d more)" % (error_msg, failure_qty+1)
282                         })
283                     else:
284                         runtime_status.update({
285                             'error': "%s (and 1 more)" % error_msg
286                         })
287             elif kind in ['warning', 'activity']:
288                 # Record the last warning/activity status without regard of
289                 # previous occurences.
290                 runtime_status.update({
291                     kind: message
292                 })
293                 if detail is not None:
294                     runtime_status.update({
295                         kind+"Detail": detail
296                     })
297             else:
298                 # Ignore any other status kind
299                 return
300             try:
301                 self.api.containers().update(uuid=current['uuid'],
302                                             body={
303                                                 'runtime_status': runtime_status,
304                                             }).execute(num_retries=self.num_retries)
305             except Exception as e:
306                 logger.info("Couldn't update runtime_status: %s", e)
307
308     def wrapped_callback(self, cb, obj, st):
309         with self.workflow_eval_lock:
310             cb(obj, st)
311             self.workflow_eval_lock.notifyAll()
312
313     def get_wrapped_callback(self, cb):
314         return partial(self.wrapped_callback, cb)
315
316     def on_message(self, event):
317         if event.get("object_uuid") in self.processes and event["event_type"] == "update":
318             uuid = event["object_uuid"]
319             if event["properties"]["new_attributes"]["state"] == "Running":
320                 with self.workflow_eval_lock:
321                     j = self.processes[uuid]
322                     if j.running is False:
323                         j.running = True
324                         j.update_pipeline_component(event["properties"]["new_attributes"])
325                         logger.info("%s %s is Running", self.label(j), uuid)
326             elif event["properties"]["new_attributes"]["state"] in ("Complete", "Failed", "Cancelled", "Final"):
327                 self.process_done(uuid, event["properties"]["new_attributes"])
328
329     def label(self, obj):
330         return "[%s %s]" % (self.work_api[0:-1], obj.name)
331
332     def poll_states(self):
333         """Poll status of jobs or containers listed in the processes dict.
334
335         Runs in a separate thread.
336         """
337
338         try:
339             remain_wait = self.poll_interval
340             while True:
341                 if remain_wait > 0:
342                     self.stop_polling.wait(remain_wait)
343                 if self.stop_polling.is_set():
344                     break
345                 with self.workflow_eval_lock:
346                     keys = list(self.processes)
347                 if not keys:
348                     remain_wait = self.poll_interval
349                     continue
350
351                 begin_poll = time.time()
352                 if self.work_api == "containers":
353                     table = self.poll_api.container_requests()
354                 elif self.work_api == "jobs":
355                     table = self.poll_api.jobs()
356
357                 pageSize = self.poll_api._rootDesc.get('maxItemsPerResponse', 1000)
358
359                 while keys:
360                     page = keys[:pageSize]
361                     keys = keys[pageSize:]
362                     try:
363                         proc_states = table.list(filters=[["uuid", "in", page]]).execute(num_retries=self.num_retries)
364                     except Exception as e:
365                         logger.warning("Error checking states on API server: %s", e)
366                         remain_wait = self.poll_interval
367                         continue
368
369                     for p in proc_states["items"]:
370                         self.on_message({
371                             "object_uuid": p["uuid"],
372                             "event_type": "update",
373                             "properties": {
374                                 "new_attributes": p
375                             }
376                         })
377                 finish_poll = time.time()
378                 remain_wait = self.poll_interval - (finish_poll - begin_poll)
379         except:
380             logger.exception("Fatal error in state polling thread.")
381             with self.workflow_eval_lock:
382                 self.processes.clear()
383                 self.workflow_eval_lock.notifyAll()
384         finally:
385             self.stop_polling.set()
386
387     def add_intermediate_output(self, uuid):
388         if uuid:
389             self.intermediate_output_collections.append(uuid)
390
391     def trash_intermediate_output(self):
392         logger.info("Cleaning up intermediate output collections")
393         for i in self.intermediate_output_collections:
394             try:
395                 self.api.collections().delete(uuid=i).execute(num_retries=self.num_retries)
396             except:
397                 logger.warning("Failed to delete intermediate output: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
398             if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
399                 break
400
401     def check_features(self, obj):
402         if isinstance(obj, dict):
403             if obj.get("writable") and self.work_api != "containers":
404                 raise SourceLine(obj, "writable", UnsupportedRequirement).makeError("InitialWorkDir feature 'writable: true' not supported with --api=jobs")
405             if obj.get("class") == "DockerRequirement":
406                 if obj.get("dockerOutputDirectory"):
407                     if self.work_api != "containers":
408                         raise SourceLine(obj, "dockerOutputDirectory", UnsupportedRequirement).makeError(
409                             "Option 'dockerOutputDirectory' of DockerRequirement not supported with --api=jobs.")
410                     if not obj.get("dockerOutputDirectory").startswith('/'):
411                         raise SourceLine(obj, "dockerOutputDirectory", validate.ValidationException).makeError(
412                             "Option 'dockerOutputDirectory' must be an absolute path.")
413             if obj.get("class") == "http://commonwl.org/cwltool#Secrets" and self.work_api != "containers":
414                 raise SourceLine(obj, "class", UnsupportedRequirement).makeError("Secrets not supported with --api=jobs")
415             for v in viewvalues(obj):
416                 self.check_features(v)
417         elif isinstance(obj, list):
418             for i,v in enumerate(obj):
419                 with SourceLine(obj, i, UnsupportedRequirement, logger.isEnabledFor(logging.DEBUG)):
420                     self.check_features(v)
421
422     def make_output_collection(self, name, storage_classes, tagsString, outputObj):
423         outputObj = copy.deepcopy(outputObj)
424
425         files = []
426         def capture(fileobj):
427             files.append(fileobj)
428
429         adjustDirObjs(outputObj, capture)
430         adjustFileObjs(outputObj, capture)
431
432         generatemapper = NoFollowPathMapper(files, "", "", separateDirs=False)
433
434         final = arvados.collection.Collection(api_client=self.api,
435                                               keep_client=self.keep_client,
436                                               num_retries=self.num_retries)
437
438         for k,v in generatemapper.items():
439             if k.startswith("_:"):
440                 if v.type == "Directory":
441                     continue
442                 if v.type == "CreateFile":
443                     with final.open(v.target, "wb") as f:
444                         f.write(v.resolved.encode("utf-8"))
445                     continue
446
447             if not k.startswith("keep:"):
448                 raise Exception("Output source is not in keep or a literal")
449             sp = k.split("/")
450             srccollection = sp[0][5:]
451             try:
452                 reader = self.collection_cache.get(srccollection)
453                 srcpath = "/".join(sp[1:]) if len(sp) > 1 else "."
454                 final.copy(srcpath, v.target, source_collection=reader, overwrite=False)
455             except arvados.errors.ArgumentError as e:
456                 logger.error("Creating CollectionReader for '%s' '%s': %s", k, v, e)
457                 raise
458             except IOError as e:
459                 logger.warning("While preparing output collection: %s", e)
460
461         def rewrite(fileobj):
462             fileobj["location"] = generatemapper.mapper(fileobj["location"]).target
463             for k in ("listing", "contents", "nameext", "nameroot", "dirname"):
464                 if k in fileobj:
465                     del fileobj[k]
466
467         adjustDirObjs(outputObj, rewrite)
468         adjustFileObjs(outputObj, rewrite)
469
470         with final.open("cwl.output.json", "w") as f:
471             res = str(json.dumps(outputObj, sort_keys=True, indent=4, separators=(',',': '), ensure_ascii=False))
472             f.write(res)           
473
474         final.save_new(name=name, owner_uuid=self.project_uuid, storage_classes=storage_classes, ensure_unique_name=True)
475
476         logger.info("Final output collection %s \"%s\" (%s)", final.portable_data_hash(),
477                     final.api_response()["name"],
478                     final.manifest_locator())
479
480         final_uuid = final.manifest_locator()
481         tags = tagsString.split(',')
482         for tag in tags:
483              self.api.links().create(body={
484                 "head_uuid": final_uuid, "link_class": "tag", "name": tag
485                 }).execute(num_retries=self.num_retries)
486
487         def finalcollection(fileobj):
488             fileobj["location"] = "keep:%s/%s" % (final.portable_data_hash(), fileobj["location"])
489
490         adjustDirObjs(outputObj, finalcollection)
491         adjustFileObjs(outputObj, finalcollection)
492
493         return (outputObj, final)
494
495     def set_crunch_output(self):
496         if self.work_api == "containers":
497             current = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
498             if current is None:
499                 return
500             try:
501                 self.api.containers().update(uuid=current['uuid'],
502                                              body={
503                                                  'output': self.final_output_collection.portable_data_hash(),
504                                              }).execute(num_retries=self.num_retries)
505                 self.api.collections().update(uuid=self.final_output_collection.manifest_locator(),
506                                               body={
507                                                   'is_trashed': True
508                                               }).execute(num_retries=self.num_retries)
509             except Exception as e:
510                 logger.info("Setting container output: %s", e)
511         elif self.work_api == "jobs" and "TASK_UUID" in os.environ:
512             self.api.job_tasks().update(uuid=os.environ["TASK_UUID"],
513                                    body={
514                                        'output': self.final_output_collection.portable_data_hash(),
515                                        'success': self.final_status == "success",
516                                        'progress':1.0
517                                    }).execute(num_retries=self.num_retries)
518
519     def arv_executor(self, tool, job_order, runtimeContext, logger=None):
520         self.debug = runtimeContext.debug
521
522         tool.visit(self.check_features)
523
524         self.project_uuid = runtimeContext.project_uuid
525         self.pipeline = None
526         self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
527         self.secret_store = runtimeContext.secret_store
528
529         self.trash_intermediate = runtimeContext.trash_intermediate
530         if self.trash_intermediate and self.work_api != "containers":
531             raise Exception("--trash-intermediate is only supported with --api=containers.")
532
533         self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
534         if self.intermediate_output_ttl and self.work_api != "containers":
535             raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
536         if self.intermediate_output_ttl < 0:
537             raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)
538
539         if runtimeContext.submit_request_uuid and self.work_api != "containers":
540             raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))
541
542         if not runtimeContext.name:
543             runtimeContext.name = self.name = tool.tool.get("label") or tool.metadata.get("label") or os.path.basename(tool.tool["id"])
544
545         # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
546         # Also uploads docker images.
547         merged_map = upload_workflow_deps(self, tool)
548
549         # Reload tool object which may have been updated by
550         # upload_workflow_deps
551         # Don't validate this time because it will just print redundant errors.
552         loadingContext = self.loadingContext.copy()
553         loadingContext.loader = tool.doc_loader
554         loadingContext.avsc_names = tool.doc_schema
555         loadingContext.metadata = tool.metadata
556         loadingContext.do_validate = False
557
558         tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
559                                   loadingContext)
560
561         # Upload local file references in the job order.
562         job_order = upload_job_order(self, "%s input" % runtimeContext.name,
563                                      tool, job_order)
564
565         existing_uuid = runtimeContext.update_workflow
566         if existing_uuid or runtimeContext.create_workflow:
567             # Create a pipeline template or workflow record and exit.
568             if self.work_api == "jobs":
569                 tmpl = RunnerTemplate(self, tool, job_order,
570                                       runtimeContext.enable_reuse,
571                                       uuid=existing_uuid,
572                                       submit_runner_ram=runtimeContext.submit_runner_ram,
573                                       name=runtimeContext.name,
574                                       merged_map=merged_map,
575                                       loadingContext=loadingContext)
576                 tmpl.save()
577                 # cwltool.main will write our return value to stdout.
578                 return (tmpl.uuid, "success")
579             elif self.work_api == "containers":
580                 return (upload_workflow(self, tool, job_order,
581                                         self.project_uuid,
582                                         uuid=existing_uuid,
583                                         submit_runner_ram=runtimeContext.submit_runner_ram,
584                                         name=runtimeContext.name,
585                                         merged_map=merged_map),
586                         "success")
587
588         self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
589         self.eval_timeout = runtimeContext.eval_timeout
590
591         runtimeContext = runtimeContext.copy()
592         runtimeContext.use_container = True
593         runtimeContext.tmpdir_prefix = "tmp"
594         runtimeContext.work_api = self.work_api
595
596         if self.work_api == "containers":
597             if self.ignore_docker_for_reuse:
598                 raise Exception("--ignore-docker-for-reuse not supported with containers API.")
599             runtimeContext.outdir = "/var/spool/cwl"
600             runtimeContext.docker_outdir = "/var/spool/cwl"
601             runtimeContext.tmpdir = "/tmp"
602             runtimeContext.docker_tmpdir = "/tmp"
603         elif self.work_api == "jobs":
604             if runtimeContext.priority != DEFAULT_PRIORITY:
605                 raise Exception("--priority not implemented for jobs API.")
606             runtimeContext.outdir = "$(task.outdir)"
607             runtimeContext.docker_outdir = "$(task.outdir)"
608             runtimeContext.tmpdir = "$(task.tmpdir)"
609
610         if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
611             raise Exception("--priority must be in the range 1..1000.")
612
613         if self.should_estimate_cache_size:
614             visited = set()
615             estimated_size = [0]
616             def estimate_collection_cache(obj):
617                 if obj.get("location", "").startswith("keep:"):
618                     m = pdh_size.match(obj["location"][5:])
619                     if m and m.group(1) not in visited:
620                         visited.add(m.group(1))
621                         estimated_size[0] += int(m.group(2))
622             visit_class(job_order, ("File", "Directory"), estimate_collection_cache)
623             runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256)
624             self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024)
625
626         logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size)
627
628         runnerjob = None
629         if runtimeContext.submit:
630             # Submit a runner job to run the workflow for us.
631             if self.work_api == "containers":
632                 if tool.tool["class"] == "CommandLineTool" and runtimeContext.wait and (not runtimeContext.always_submit_runner):
633                     runtimeContext.runnerjob = tool.tool["id"]
634                 else:
635                     tool = RunnerContainer(self, tool, loadingContext, runtimeContext.enable_reuse,
636                                                 self.output_name,
637                                                 self.output_tags,
638                                                 submit_runner_ram=runtimeContext.submit_runner_ram,
639                                                 name=runtimeContext.name,
640                                                 on_error=runtimeContext.on_error,
641                                                 submit_runner_image=runtimeContext.submit_runner_image,
642                                                 intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
643                                                 merged_map=merged_map,
644                                                 priority=runtimeContext.priority,
645                                                 secret_store=self.secret_store,
646                                                 collection_cache_size=runtimeContext.collection_cache_size,
647                                                 collection_cache_is_default=self.should_estimate_cache_size)
648             elif self.work_api == "jobs":
649                 tool = RunnerJob(self, tool, loadingContext, runtimeContext.enable_reuse,
650                                       self.output_name,
651                                       self.output_tags,
652                                       submit_runner_ram=runtimeContext.submit_runner_ram,
653                                       name=runtimeContext.name,
654                                       on_error=runtimeContext.on_error,
655                                       submit_runner_image=runtimeContext.submit_runner_image,
656                                       merged_map=merged_map)
657         elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs":
658             # Create pipeline for local run
659             self.pipeline = self.api.pipeline_instances().create(
660                 body={
661                     "owner_uuid": self.project_uuid,
662                     "name": runtimeContext.name if runtimeContext.name else shortname(tool.tool["id"]),
663                     "components": {},
664                     "state": "RunningOnClient"}).execute(num_retries=self.num_retries)
665             logger.info("Pipeline instance %s", self.pipeline["uuid"])
666
667         if runtimeContext.cwl_runner_job is not None:
668             self.uuid = runtimeContext.cwl_runner_job.get('uuid')
669
670         jobiter = tool.job(job_order,
671                            self.output_callback,
672                            runtimeContext)
673
674         if runtimeContext.submit and not runtimeContext.wait:
675             runnerjob = next(jobiter)
676             runnerjob.run(runtimeContext)
677             return (runnerjob.uuid, "success")
678
679         current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
680         if current_container:
681             logger.info("Running inside container %s", current_container.get("uuid"))
682
683         self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
684         self.polling_thread = threading.Thread(target=self.poll_states)
685         self.polling_thread.start()
686
687         self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)
688
689         try:
690             self.workflow_eval_lock.acquire()
691
692             # Holds the lock while this code runs and releases it when
693             # it is safe to do so in self.workflow_eval_lock.wait(),
694             # at which point on_message can update job state and
695             # process output callbacks.
696
697             loopperf = Perf(metrics, "jobiter")
698             loopperf.__enter__()
699             for runnable in jobiter:
700                 loopperf.__exit__()
701
702                 if self.stop_polling.is_set():
703                     break
704
705                 if self.task_queue.error is not None:
706                     raise self.task_queue.error
707
708                 if runnable:
709                     with Perf(metrics, "run"):
710                         self.start_run(runnable, runtimeContext)
711                 else:
712                     if (self.task_queue.in_flight + len(self.processes)) > 0:
713                         self.workflow_eval_lock.wait(3)
714                     else:
715                         logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.")
716                         break
717
718                 if self.stop_polling.is_set():
719                     break
720
721                 loopperf.__enter__()
722             loopperf.__exit__()
723
724             while (self.task_queue.in_flight + len(self.processes)) > 0:
725                 if self.task_queue.error is not None:
726                     raise self.task_queue.error
727                 self.workflow_eval_lock.wait(3)
728
729         except UnsupportedRequirement:
730             raise
731         except:
732             if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit:
733                 logger.error("Interrupted, workflow will be cancelled")
734             else:
735                 logger.error("Execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
736             if self.pipeline:
737                 self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
738                                                      body={"state": "Failed"}).execute(num_retries=self.num_retries)
739             if runtimeContext.submit and isinstance(tool, Runner):
740                 runnerjob = tool
741                 if runnerjob.uuid and self.work_api == "containers":
742                     self.api.container_requests().update(uuid=runnerjob.uuid,
743                                                      body={"priority": "0"}).execute(num_retries=self.num_retries)
744         finally:
745             self.workflow_eval_lock.release()
746             self.task_queue.drain()
747             self.stop_polling.set()
748             self.polling_thread.join()
749             self.task_queue.join()
750
751         if self.final_status == "UnsupportedRequirement":
752             raise UnsupportedRequirement("Check log for details.")
753
754         if self.final_output is None:
755             raise WorkflowException("Workflow did not return a result.")
756
757         if runtimeContext.submit and isinstance(tool, Runner):
758             logger.info("Final output collection %s", tool.final_output)
759         else:
760             if self.output_name is None:
761                 self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
762             if self.output_tags is None:
763                 self.output_tags = ""
764
765             storage_classes = runtimeContext.storage_classes.strip().split(",")
766             self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output)
767             self.set_crunch_output()
768
769         if runtimeContext.compute_checksum:
770             adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
771             adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))
772
773         if self.trash_intermediate and self.final_status == "success":
774             self.trash_intermediate_output()
775
776         return (self.final_output, self.final_status)