21356: Remove all Python 2/3 compatibility imports
arvados.git: sdk/cwl/arvados_cwl/runner.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 import os
6 import sys
7 import re
8 import urllib.parse
9 from functools import partial
10 import logging
11 import json
12 import copy
13 from collections import namedtuple
14 from io import StringIO
15 from typing import (
16     Any,
17     Callable,
18     Dict,
19     Iterable,
20     Iterator,
21     List,
22     Mapping,
23     MutableMapping,
24     Sequence,
25     MutableSequence,
26     Optional,
27     Set,
28     Sized,
29     Tuple,
30     Type,
31     Union,
32     cast,
33 )
34 from cwltool.utils import (
35     CWLObjectType,
36     CWLOutputAtomType,
37     CWLOutputType,
38 )
39
40 import subprocess
41
42 from schema_salad.sourceline import SourceLine, cmap
43
44 from cwltool.command_line_tool import CommandLineTool
45 import cwltool.workflow
46 from cwltool.process import (scandeps, UnsupportedRequirement, normalizeFilesDirs,
47                              shortname, Process, fill_in_defaults)
48 from cwltool.load_tool import fetch_document, jobloaderctx
49 from cwltool.utils import aslist, adjustFileObjs, adjustDirObjs, visit_class
50 from cwltool.builder import substitute
51 from cwltool.pack import pack
52 from cwltool.update import INTERNAL_VERSION
53 from cwltool.builder import Builder
54 import schema_salad.validate as validate
55 import schema_salad.ref_resolver
56
57 import arvados.collection
58 import arvados.util
59 from .util import collectionUUID
60 from ruamel.yaml import YAML
61 from ruamel.yaml.comments import CommentedMap, CommentedSeq
62
63 import arvados_cwl.arvdocker
64 from .pathmapper import ArvPathMapper, trim_listing, collection_pdh_pattern, collection_uuid_pattern, MapperEnt
65 from ._version import __version__
66 from . import done
67 from .context import ArvRuntimeContext
68 from .perf import Perf
69
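# Python 3 has no "basestring"; keep a (bytes, str) tuple so the isinstance()
# checks below continue to accept both text and byte strings.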
70 basestring = (bytes, str)
71 logger = logging.getLogger('arvados.cwl-runner')
72 metrics = logging.getLogger('arvados.cwl-runner.metrics')
73
74 def trim_anonymous_location(obj):
75     """Remove 'location' field from File and Directory literals.
76
77     To make internal handling easier, literals are assigned a random id for
78     'location'.  However, when writing the record back out, this can break
79     reproducibility.  Since it is valid for literals not to have a 'location'
80     field, remove it.
81
82     """
83
84     if obj.get("location", "").startswith("_:"):
85         del obj["location"]
86
87
88 def remove_redundant_fields(obj):
89     for field in ("path", "nameext", "nameroot", "dirname"):
90         if field in obj:
91             del obj[field]
92
93
94 def find_defaults(d, op):
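    """Recursively call op() on every mapping in d that has a "default" field."""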
95     if isinstance(d, list):
96         for i in d:
97             find_defaults(i, op)
98     elif isinstance(d, dict):
99         if "default" in d:
100             op(d)
101         else:
102             for i in d.values():
103                 find_defaults(i, op)
104
105 def make_builder(joborder, hints, requirements, runtimeContext, metadata):
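    """Construct a bare-bones cwltool Builder.

    The builder is only used to evaluate expressions (for example
    secondaryFiles patterns) outside of a running step, so most fields are
    left empty.
    """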
106     return Builder(
107                  job=joborder,
108                  files=[],               # type: List[Dict[Text, Text]]
109                  bindings=[],            # type: List[Dict[Text, Any]]
110                  schemaDefs={},          # type: Dict[Text, Dict[Text, Any]]
111                  names=None,               # type: Names
112                  requirements=requirements,        # type: List[Dict[Text, Any]]
113                  hints=hints,               # type: List[Dict[Text, Any]]
114                  resources={},           # type: Dict[str, int]
115                  mutation_manager=None,    # type: Optional[MutationManager]
116                  formatgraph=None,         # type: Optional[Graph]
117                  make_fs_access=None,      # type: Type[StdFsAccess]
118                  fs_access=None,           # type: StdFsAccess
119                  job_script_provider=runtimeContext.job_script_provider, # type: Optional[Any]
120                  timeout=runtimeContext.eval_timeout,             # type: float
121                  debug=runtimeContext.debug,               # type: bool
122                  js_console=runtimeContext.js_console,          # type: bool
123                  force_docker_pull=runtimeContext.force_docker_pull,   # type: bool
124                  loadListing="",         # type: Text
125                  outdir="",              # type: Text
126                  tmpdir="",              # type: Text
127                  stagedir="",            # type: Text
128                  cwlVersion=metadata.get("http://commonwl.org/cwltool#original_cwlVersion") or metadata.get("cwlVersion"),
129                  container_engine="docker"
130                 )
131
132 def search_schemadef(name, reqs):
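    """Return the type named "name" from any SchemaDefRequirement in reqs, or None if absent."""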
133     for r in reqs:
134         if r["class"] == "SchemaDefRequirement":
135             for sd in r["types"]:
136                 if sd["name"] == name:
137                     return sd
138     return None
139
140 primitive_types_set = frozenset(("null", "boolean", "int", "long",
141                                  "float", "double", "string", "record",
142                                  "array", "enum"))
143
144 def set_secondary(fsaccess, builder, inputschema, secondaryspec, primary, discovered):
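    """Discover and attach secondaryFiles for the input value "primary".

    Recurses through union, array and record schemas.  For File values it
    evaluates the secondaryFiles patterns, checks that required files exist,
    and records newly found files in "discovered", keyed by the primary
    file's location.
    """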
145     if isinstance(inputschema, Sequence) and not isinstance(inputschema, basestring):
146         # union type, collect all possible secondaryFiles
147         for i in inputschema:
148             set_secondary(fsaccess, builder, i, secondaryspec, primary, discovered)
149         return
150
151     if inputschema == "File":
152         inputschema = {"type": "File"}
153
154     if isinstance(inputschema, basestring):
155         sd = search_schemadef(inputschema, reversed(builder.hints+builder.requirements))
156         if sd:
157             inputschema = sd
158         else:
159             return
160
161     if "secondaryFiles" in inputschema:
162         # set secondaryFiles, may be inherited by compound types.
163         secondaryspec = inputschema["secondaryFiles"]
164
165     if (isinstance(inputschema["type"], (Mapping, Sequence)) and
166         not isinstance(inputschema["type"], basestring)):
167         # compound type (union, array, record)
168         set_secondary(fsaccess, builder, inputschema["type"], secondaryspec, primary, discovered)
169
170     elif (inputschema["type"] == "record" and
171           isinstance(primary, Mapping)):
172         #
173         # record type, find secondary files associated with fields.
174         #
175         for f in inputschema["fields"]:
176             p = primary.get(shortname(f["name"]))
177             if p:
178                 set_secondary(fsaccess, builder, f, secondaryspec, p, discovered)
179
180     elif (inputschema["type"] == "array" and
181           isinstance(primary, Sequence)):
182         #
183         # array type, find secondary files of elements
184         #
185         for p in primary:
186             set_secondary(fsaccess, builder, {"type": inputschema["items"]}, secondaryspec, p, discovered)
187
188     elif (inputschema["type"] == "File" and
189           isinstance(primary, Mapping) and
190           primary.get("class") == "File"):
191
192         if "secondaryFiles" in primary or not secondaryspec:
193             # Nothing to do.
194             return
195
196         #
197         # Found a file, check for secondaryFiles
198         #
199         specs = []
200         primary["secondaryFiles"] = secondaryspec
201         for i, sf in enumerate(aslist(secondaryspec)):
202             if builder.cwlVersion == "v1.0":
203                 pattern = sf
204             else:
205                 pattern = sf["pattern"]
206             if pattern is None:
207                 continue
208             if isinstance(pattern, list):
209                 specs.extend(pattern)
210             elif isinstance(pattern, dict):
211                 specs.append(pattern)
212             elif isinstance(pattern, str):
213                 if builder.cwlVersion == "v1.0":
214                     specs.append({"pattern": pattern, "required": True})
215                 else:
216                     specs.append({"pattern": pattern, "required": sf.get("required")})
217             else:
218                 raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
219                     "Expression must return list, object, string or null")
220
221         found = []
222         for i, sf in enumerate(specs):
223             if isinstance(sf, dict):
224                 if sf.get("class") == "File":
225                     pattern = None
226                     if sf.get("location") is None:
227                         raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
228                             "File object is missing 'location': %s" % sf)
229                     sfpath = sf["location"]
230                     required = True
231                 else:
232                     pattern = sf["pattern"]
233                     required = sf.get("required")
234             elif isinstance(sf, str):
235                 pattern = sf
236                 required = True
237             else:
238                 raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
239                     "Expression must return list, object, string or null")
240
241             if pattern is not None:
242                 if "${" in pattern or "$(" in pattern:
243                     sfname = builder.do_eval(pattern, context=primary)
244                 else:
245                     sfname = substitute(primary["basename"], pattern)
246
247                 if sfname is None:
248                     continue
249
250                 if isinstance(sfname, str):
251                     p_location = primary["location"]
252                     if "/" in p_location:
253                         sfpath = (
254                             p_location[0 : p_location.rindex("/") + 1]
255                             + sfname
256                         )
257
258             required = builder.do_eval(required, context=primary)
259
260             if isinstance(sfname, list) or isinstance(sfname, dict):
261                 each = aslist(sfname)
262                 for e in each:
263                     if required and not fsaccess.exists(e.get("location")):
264                         raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
265                             "Required secondary file '%s' does not exist" % e.get("location"))
266                 found.extend(each)
267
268             if isinstance(sfname, str):
269                 if fsaccess.exists(sfpath):
270                     if pattern is not None:
271                         found.append({"location": sfpath, "class": "File"})
272                     else:
273                         found.append(sf)
274                 elif required:
275                     raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
276                         "Required secondary file '%s' does not exist" % sfpath)
277
278         primary["secondaryFiles"] = cmap(found)
279         if discovered is not None:
280             discovered[primary["location"]] = primary["secondaryFiles"]
281     elif inputschema["type"] not in primitive_types_set and inputschema["type"] not in ("File", "Directory"):
282         set_secondary(fsaccess, builder, inputschema["type"], secondaryspec, primary, discovered)
283
284 def discover_secondary_files(fsaccess, builder, inputs, job_order, discovered=None):
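    """Run set_secondary() on each workflow input whose submitted value is a map or list (File/Directory objects, records, or arrays)."""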
285     for inputschema in inputs:
286         primary = job_order.get(shortname(inputschema["id"]))
287         if isinstance(primary, (Mapping, Sequence)):
288             set_secondary(fsaccess, builder, inputschema, None, primary, discovered)
289
290 def upload_dependencies(arvrunner, name, document_loader,
291                         workflowobj, uri, runtimeContext,
292                         include_primary=True, discovered_secondaryfiles=None,
293                         cache=None):
294     """Upload the dependencies of the workflowobj document to Keep.
295
296     Returns a pathmapper object mapping local paths to keep references.  Also
297     does an in-place update of references in "workflowobj".
298
299     Use scandeps to find $schemas, File and Directory
300     fields that represent external references.
301
302     If workflowobj has an "id" field, this will reload the document to ensure
303     it is scanning the raw document prior to preprocessing.
304     """
305
306     scanobj = workflowobj
307     metadata = scanobj
308
309     with Perf(metrics, "scandeps"):
310         sc_result = scandeps(uri, scanobj,
311                              set(),
312                              set(("location",)),
313                              None, urljoin=document_loader.fetcher.urljoin,
314                              nestdirs=False)
315         optional_deps = scandeps(uri, scanobj,
316                              set(),
317                              set(("$schemas",)),
318                              None, urljoin=document_loader.fetcher.urljoin,
319                              nestdirs=False)
320
321     if sc_result is None:
322         sc_result = []
323
324     if optional_deps is None:
325         optional_deps = []
326
327     if optional_deps:
328         sc_result.extend(optional_deps)
329
330     sc = []
331     uuids = {}
332
333     def collect_uuids(obj):
334         loc = obj.get("location", "")
335         sp = loc.split(":")
336         if sp[0] == "keep":
337             # Collect collection uuids that need to be resolved to
338             # portable data hashes
339             gp = collection_uuid_pattern.match(loc)
340             if gp:
341                 uuids[gp.groups()[0]] = obj
342             if collectionUUID in obj:
343                 uuids[obj[collectionUUID]] = obj
344
345     def collect_uploads(obj):
346         loc = obj.get("location", "")
347         sp = loc.split(":")
348         if len(sp) < 1:
349             return
350         if sp[0] in ("file", "http", "https"):
351         # Record local files that need to be uploaded,
352             # don't include file literals, keep references, etc.
353             sc.append(obj)
354         collect_uuids(obj)
355
356     with Perf(metrics, "collect uuids"):
357         visit_class(workflowobj, ("File", "Directory"), collect_uuids)
358
359     with Perf(metrics, "collect uploads"):
360         visit_class(sc_result, ("File", "Directory"), collect_uploads)
361
362     # Resolve any collection uuids we found to portable data hashes
363     # and assign them to uuid_map
364     uuid_map = {}
365     fetch_uuids = list(uuids.keys())
366     with Perf(metrics, "fetch_uuids"):
367         while fetch_uuids:
368             # For a large number of fetch_uuids, the API server may limit
369             # the response size, so keep fetching until the API server has
370             # nothing more to give us.
371             lookups = arvrunner.api.collections().list(
372                 filters=[["uuid", "in", fetch_uuids]],
373                 count="none",
374                 select=["uuid", "portable_data_hash"]).execute(
375                     num_retries=arvrunner.num_retries)
376
377             if not lookups["items"]:
378                 break
379
380             for l in lookups["items"]:
381                 uuid_map[l["uuid"]] = l["portable_data_hash"]
382
383             fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]
384
385     normalizeFilesDirs(sc)
386
387     if "id" in workflowobj:
388         defrg, _ = urllib.parse.urldefrag(workflowobj["id"])
389         if include_primary:
390             # make sure it's included
391             sc.append({"class": "File", "location": defrg})
392         else:
393             # make sure it's excluded
394             sc = [d for d in sc if d.get("location") != defrg]
395
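    # File/Directory literals appearing in "default" fields are collected as
    # optional dependencies: they may be overridden by the submitted job
    # order, so a missing default is not treated as a hard error.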
396     def visit_default(obj):
397         def defaults_are_optional(f):
398             if "location" not in f and "path" in f:
399                 f["location"] = f["path"]
400                 del f["path"]
401             normalizeFilesDirs(f)
402             optional_deps.append(f)
403         visit_class(obj["default"], ("File", "Directory"), defaults_are_optional)
404
405     find_defaults(workflowobj, visit_default)
406
407     discovered = {}
408     def discover_default_secondary_files(obj):
409         builder_job_order = {}
410         for t in obj["inputs"]:
411             builder_job_order[shortname(t["id"])] = t["default"] if "default" in t else None
412         # Need to create a builder object to evaluate expressions.
413         builder = make_builder(builder_job_order,
414                                obj.get("hints", []),
415                                obj.get("requirements", []),
416                                ArvRuntimeContext(),
417                                metadata)
418         discover_secondary_files(arvrunner.fs_access,
419                                  builder,
420                                  obj["inputs"],
421                                  builder_job_order,
422                                  discovered)
423
424     copied, _ = document_loader.resolve_all(copy.deepcopy(cmap(workflowobj)), base_url=uri, checklinks=False)
425     visit_class(copied, ("CommandLineTool", "Workflow"), discover_default_secondary_files)
426
427     for d in list(discovered):
428         # Only interested in discovered secondaryFiles which are local
429         # files that need to be uploaded.
430         if d.startswith("file:"):
431             sc.extend(discovered[d])
432         else:
433             del discovered[d]
434
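    # Upload the collected dependencies, rewriting local references to
    # keep: URIs ("keep:<pdh>" for whole collections, "keep:<pdh>/<path>" for
    # files within a collection).  single_collection=True uploads all local
    # files into one new collection.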
435     with Perf(metrics, "mapper"):
436         mapper = ArvPathMapper(arvrunner, sc, "",
437                                "keep:%s",
438                                "keep:%s/%s",
439                                name=name,
440                                single_collection=True,
441                                optional_deps=optional_deps)
442
443     for k, v in uuid_map.items():
444         mapper._pathmap["keep:"+k] = MapperEnt(v, "", "", False)
445
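    # Portable data hashes of every collection referenced by the workflow.
    # Used below to copy those collections into the destination project when
    # copy_deps is enabled.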
446     keeprefs = set()
447     def addkeepref(k):
448         if k.startswith("keep:"):
449             keeprefs.add(collection_pdh_pattern.match(k).group(1))
450
451
452     def collectloc(p):
453         loc = p.get("location")
454         if loc and (not loc.startswith("_:")) and (not loc.startswith("keep:")):
455             addkeepref(p["location"])
456             return
457
458         if not loc:
459             return
460
461         if collectionUUID in p:
462             uuid = p[collectionUUID]
463             if uuid not in uuid_map:
464                 raise SourceLine(p, collectionUUID, validate.ValidationException).makeError(
465                     "Collection uuid %s not found" % uuid)
466             gp = collection_pdh_pattern.match(loc)
467             if gp and uuid_map[uuid] != gp.groups()[0]:
468                 # This file entry has both collectionUUID and a PDH
469                 # location. If the PDH doesn't match the one returned
470                 # by the API server, raise an error.
471                 raise SourceLine(p, "location", validate.ValidationException).makeError(
472                     "Expected collection uuid %s to be %s but API server reported %s" % (
473                         uuid, gp.groups()[0], uuid_map[p[collectionUUID]]))
474
475         gp = collection_uuid_pattern.match(loc)
476         if not gp:
477             # Not a uuid pattern (must be a pdh pattern)
478             addkeepref(p["location"])
479             return
480
481         uuid = gp.groups()[0]
482         if uuid not in uuid_map:
483             raise SourceLine(p, "location", validate.ValidationException).makeError(
484                 "Collection uuid %s not found" % uuid)
485
486     with Perf(metrics, "collectloc"):
487         visit_class(workflowobj, ("File", "Directory"), collectloc)
488         visit_class(discovered, ("File", "Directory"), collectloc)
489
490     if discovered_secondaryfiles is not None:
491         for d in discovered:
492             discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]
493
494     if runtimeContext.copy_deps:
495         # Find referenced collections and copy them into the
496         # destination project, for easy sharing.
497         already_present = list(arvados.util.keyset_list_all(arvrunner.api.collections().list,
498                                      filters=[["portable_data_hash", "in", list(keeprefs)],
499                                               ["owner_uuid", "=", runtimeContext.project_uuid]],
500                                      select=["uuid", "portable_data_hash", "created_at"]))
501
502         keeprefs = keeprefs - set(a["portable_data_hash"] for a in already_present)
503         for kr in keeprefs:
504             col = arvrunner.api.collections().list(filters=[["portable_data_hash", "=", kr]],
505                                                   order="created_at desc",
506                                                    select=["name", "description", "properties", "portable_data_hash", "manifest_text", "storage_classes_desired", "trash_at"],
507                                                    limit=1).execute()
508             if len(col["items"]) == 0:
509                 logger.warning("Cannot find collection with portable data hash %s", kr)
510                 continue
511             col = col["items"][0]
512             col["name"] = arvados.util.trim_name(col["name"])
513             try:
514                 arvrunner.api.collections().create(body={"collection": {
515                     "owner_uuid": runtimeContext.project_uuid,
516                     "name": col["name"],
517                     "description": col["description"],
518                     "properties": col["properties"],
519                     "portable_data_hash": col["portable_data_hash"],
520                     "manifest_text": col["manifest_text"],
521                     "storage_classes_desired": col["storage_classes_desired"],
522                     "trash_at": col["trash_at"]
523                 }}, ensure_unique_name=True).execute()
524             except Exception as e:
525                 logger.warning("Unable to copy collection to destination: %s", e)
526
527     if "$schemas" in workflowobj:
528         sch = CommentedSeq()
529         for s in workflowobj["$schemas"]:
530             if s in mapper:
531                 sch.append(mapper.mapper(s).resolved)
532         workflowobj["$schemas"] = sch
533
534     return mapper
535
536
537 def upload_docker(arvrunner, tool, runtimeContext):
538     """Uploads Docker images used in CommandLineTool objects."""
539
540     if isinstance(tool, CommandLineTool):
541         (docker_req, docker_is_req) = tool.get_requirement("DockerRequirement")
542         if docker_req:
543             if docker_req.get("dockerOutputDirectory") and arvrunner.work_api != "containers":
544                 raise SourceLine(docker_req, "dockerOutputDirectory", UnsupportedRequirement).makeError(
545                     "Option 'dockerOutputDirectory' of DockerRequirement not supported.")
546
547             arvados_cwl.arvdocker.arv_docker_get_image(arvrunner.api, docker_req, True, runtimeContext)
548         else:
549             arvados_cwl.arvdocker.arv_docker_get_image(arvrunner.api, {"dockerPull": "arvados/jobs:"+__version__},
550                                                        True, runtimeContext)
551     elif isinstance(tool, cwltool.workflow.Workflow):
552         for s in tool.steps:
553             upload_docker(arvrunner, s.embedded_tool, runtimeContext)
554
555
556 def packed_workflow(arvrunner, tool, merged_map, runtimeContext, git_info):
557     """Create a packed workflow.
558
559     A "packed" workflow is one where all the components have been combined into a single document."""
560
561     rewrites = {}
562     packed = pack(arvrunner.loadingContext, tool.tool["id"],
563                   rewrite_out=rewrites,
564                   loader=tool.doc_loader)
565
566     rewrite_to_orig = {v: k for k,v in rewrites.items()}
567
568     def visit(v, cur_id):
569         if isinstance(v, dict):
570             if v.get("class") in ("CommandLineTool", "Workflow", "ExpressionTool"):
571                 if tool.metadata["cwlVersion"] == "v1.0" and "id" not in v:
572                     raise SourceLine(v, None, Exception).makeError("Embedded process object is missing required 'id' field, add an 'id' or upgrade to cwlVersion: v1.1")
573                 if "id" in v:
574                     cur_id = rewrite_to_orig.get(v["id"], v["id"])
575             if "path" in v and "location" not in v:
576                 v["location"] = v["path"]
577                 del v["path"]
578             if "location" in v and cur_id in merged_map:
579                 if v["location"] in merged_map[cur_id].resolved:
580                     v["location"] = merged_map[cur_id].resolved[v["location"]]
581                 if v["location"] in merged_map[cur_id].secondaryFiles:
582                     v["secondaryFiles"] = merged_map[cur_id].secondaryFiles[v["location"]]
583             if v.get("class") == "DockerRequirement":
584                 v["http://arvados.org/cwl#dockerCollectionPDH"] = arvados_cwl.arvdocker.arv_docker_get_image(arvrunner.api, v, True,
585                                                                                                              runtimeContext)
586             for l in v:
587                 visit(v[l], cur_id)
588         if isinstance(v, list):
589             for l in v:
590                 visit(l, cur_id)
591     visit(packed, None)
592
593     if git_info:
594         for g in git_info:
595             packed[g] = git_info[g]
596
597     return packed
598
599
600 def tag_git_version(packed, tool):
601     if tool.tool["id"].startswith("file://"):
602         path = os.path.dirname(tool.tool["id"][7:])
603         try:
604             githash = subprocess.check_output(['git', 'log', '--first-parent', '--max-count=1', '--format=%H'], stderr=subprocess.STDOUT, cwd=path).decode().strip()
605         except (OSError, subprocess.CalledProcessError):
606             pass
607         else:
608             packed["http://schema.org/version"] = githash
609
610 def setloc(mapper, p):
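    """Rewrite a File/Directory "location" to its mapped keep: reference.

    Resolves collectionUUID hints to portable data hashes and raises an error
    if a UUID and an explicit PDH location disagree.
    """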
611     loc = p.get("location")
612     if loc and (not loc.startswith("_:")) and (not loc.startswith("keep:")):
613         p["location"] = mapper.mapper(p["location"]).resolved
614         return
615
616     if not loc:
617         return
618
619     if collectionUUID in p:
620         uuid = p[collectionUUID]
621         keepuuid = "keep:"+uuid
622         if keepuuid not in mapper:
623             raise SourceLine(p, collectionUUID, validate.ValidationException).makeError(
624                 "Collection uuid %s not found" % uuid)
625         gp = collection_pdh_pattern.match(loc)
626         if gp and mapper.mapper(keepuuid).resolved != gp.groups()[0]:
627             # This file entry has both collectionUUID and a PDH
628             # location. If the PDH doesn't match the one returned
629             # by the API server, raise an error.
630             raise SourceLine(p, "location", validate.ValidationException).makeError(
631                 "Expected collection uuid %s to be %s but API server reported %s" % (
632                     uuid, gp.groups()[0], mapper.mapper(keepuuid).resolved))
633
634     gp = collection_uuid_pattern.match(loc)
635     if not gp:
636         # Not a uuid pattern (must be a pdh pattern)
637         return
638
639     uuid = gp.groups()[0]
640     keepuuid = "keep:"+uuid
641     if keepuuid not in mapper:
642         raise SourceLine(p, "location", validate.ValidationException).makeError(
643             "Collection uuid %s not found" % uuid)
644     p["location"] = "keep:%s%s" % (mapper.mapper(keepuuid).resolved, gp.groups()[1] if gp.groups()[1] else "")
645     p[collectionUUID] = uuid
646
647 def update_from_mapper(workflowobj, mapper):
648     with Perf(metrics, "setloc"):
649         visit_class(workflowobj, ("File", "Directory"), partial(setloc, mapper))
650
651 def apply_merged_map(merged_map, workflowobj):
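    """Rewrite "location" and "secondaryFiles" fields in workflowobj using merged_map, mirroring the rewriting done in packed_workflow()."""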
652     def visit(v, cur_id):
653         if isinstance(v, dict):
654             if v.get("class") in ("CommandLineTool", "Workflow", "ExpressionTool"):
655                 if "id" in v:
656                     cur_id = v["id"]
657             if "path" in v and "location" not in v:
658                 v["location"] = v["path"]
659                 del v["path"]
660             if "location" in v and cur_id in merged_map:
661                 if v["location"] in merged_map[cur_id].resolved:
662                     v["location"] = merged_map[cur_id].resolved[v["location"]]
663                 if v["location"] in merged_map[cur_id].secondaryFiles:
664                     v["secondaryFiles"] = merged_map[cur_id].secondaryFiles[v["location"]]
665             #if v.get("class") == "DockerRequirement":
666             #    v["http://arvados.org/cwl#dockerCollectionPDH"] = arvados_cwl.arvdocker.arv_docker_get_image(arvrunner.api, v, True,
667             #                                                                                                 runtimeContext)
668             for l in v:
669                 visit(v[l], cur_id)
670         if isinstance(v, list):
671             for l in v:
672                 visit(l, cur_id)
673     visit(workflowobj, None)
674
675 def update_from_merged_map(tool, merged_map):
676     tool.visit(partial(apply_merged_map, merged_map))
677
678 def upload_job_order(arvrunner, name, tool, job_order, runtimeContext):
679     """Upload local files referenced in the input object and return updated input
680     object with 'location' updated to the proper keep references.
681     """
682
683     # Make a copy of the job order and set defaults.
684     builder_job_order = copy.copy(job_order)
685
686     # fill_in_defaults throws an error if there are any
687     # missing required parameters, we don't want it to do that
688     # so make them all optional.
689     inputs_copy = copy.deepcopy(tool.tool["inputs"])
690     for i in inputs_copy:
691         if "null" not in i["type"]:
692             i["type"] = ["null"] + aslist(i["type"])
693
694     fill_in_defaults(inputs_copy,
695                      builder_job_order,
696                      arvrunner.fs_access)
697     # Need to create a builder object to evaluate expressions.
698     builder = make_builder(builder_job_order,
699                            tool.hints,
700                            tool.requirements,
701                            ArvRuntimeContext(),
702                            tool.metadata)
703     # Now update job_order with secondaryFiles
704     discover_secondary_files(arvrunner.fs_access,
705                              builder,
706                              tool.tool["inputs"],
707                              job_order)
708
709     _jobloaderctx = jobloaderctx.copy()
710     jobloader = schema_salad.ref_resolver.Loader(_jobloaderctx, fetcher_constructor=tool.doc_loader.fetcher_constructor)
711
712     jobmapper = upload_dependencies(arvrunner,
713                                     name,
714                                     jobloader,
715                                     job_order,
716                                     job_order.get("id", "#"),
717                                     runtimeContext)
718
719     if "id" in job_order:
720         del job_order["id"]
721
722     # Need to filter this out, gets added by cwltool when providing
723     # parameters on the command line.
724     if "job_order" in job_order:
725         del job_order["job_order"]
726
727     update_from_mapper(job_order, jobmapper)
728
729     return job_order, jobmapper
730
731 FileUpdates = namedtuple("FileUpdates", ["resolved", "secondaryFiles"])
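# merged_map (built by upload_workflow_deps below) holds one FileUpdates per
# tool document id: "resolved" maps original file locations to their keep:
# references, and "secondaryFiles" records the secondaryFiles discovered for
# each file.  Illustrative shape only; actual values depend on the workflow:
#   resolved = {"file:///home/me/ref.fa": "keep:<pdh>/ref.fa"}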
732
733 def upload_workflow_deps(arvrunner, tool, runtimeContext):
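    """Upload the file dependencies of every tool document in the workflow.

    Returns a dict mapping each tool document id to a FileUpdates tuple,
    which is used later to rewrite file locations to keep: references.
    """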
734     # Ensure that Docker images needed by this workflow are available
735
736     with Perf(metrics, "upload_docker"):
737         upload_docker(arvrunner, tool, runtimeContext)
738
739     document_loader = tool.doc_loader
740
741     merged_map = {}
742     tool_dep_cache = {}
743
744     todo = []
745
746     # Standard traversal is top down, but we want to go bottom up, so use
747     # the visitor to accumulate a list of nodes to visit, then
748     # visit them in reverse order.
749     def upload_tool_deps(deptool):
750         if "id" in deptool:
751             todo.append(deptool)
752
753     tool.visit(upload_tool_deps)
754
755     for deptool in reversed(todo):
756         discovered_secondaryfiles = {}
757         with Perf(metrics, "upload_dependencies %s" % shortname(deptool["id"])):
758             pm = upload_dependencies(arvrunner,
759                                      "%s dependencies" % (shortname(deptool["id"])),
760                                      document_loader,
761                                      deptool,
762                                      deptool["id"],
763                                      runtimeContext,
764                                      include_primary=False,
765                                      discovered_secondaryfiles=discovered_secondaryfiles,
766                                      cache=tool_dep_cache)
767
768         document_loader.idx[deptool["id"]] = deptool
769         toolmap = {}
770         for k,v in pm.items():
771             toolmap[k] = v.resolved
772
773         merged_map[deptool["id"]] = FileUpdates(toolmap, discovered_secondaryfiles)
774
775     return merged_map
776
777 def arvados_jobs_image(arvrunner, img, runtimeContext):
778     """Determine if the right arvados/jobs image version is available.  If not, try to pull and upload it."""
779
780     try:
781         return arvados_cwl.arvdocker.arv_docker_get_image(arvrunner.api, {"dockerPull": img},
782                                                           True, runtimeContext)
783     except Exception as e:
784         raise Exception("Docker image %s is not available\n%s" % (img, e) )
785
786
787 def upload_workflow_collection(arvrunner, name, packed, runtimeContext):
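    """Save the packed workflow as "workflow.cwl" in a Keep collection.

    Reuses an existing collection with identical content when one is found,
    otherwise creates a new one.  Returns the collection's portable data hash.
    """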
788     collection = arvados.collection.Collection(api_client=arvrunner.api,
789                                                keep_client=arvrunner.keep_client,
790                                                num_retries=arvrunner.num_retries)
791     with collection.open("workflow.cwl", "w") as f:
792         f.write(json.dumps(packed, indent=2, sort_keys=True, separators=(',',': ')))
793
794     filters = [["portable_data_hash", "=", collection.portable_data_hash()],
795                ["name", "like", name+"%"]]
796     if runtimeContext.project_uuid:
797         filters.append(["owner_uuid", "=", runtimeContext.project_uuid])
798     exists = arvrunner.api.collections().list(filters=filters).execute(num_retries=arvrunner.num_retries)
799
800     if exists["items"]:
801         logger.info("Using collection %s", exists["items"][0]["uuid"])
802     else:
803         collection.save_new(name=name,
804                             owner_uuid=runtimeContext.project_uuid,
805                             ensure_unique_name=True,
806                             num_retries=arvrunner.num_retries)
807         logger.info("Uploaded to %s", collection.manifest_locator())
808
809     return collection.portable_data_hash()
810
811
812 class Runner(Process):
813     """Base class for runner processes, which submit an instance of
814     arvados-cwl-runner and wait for the final result."""
815
816     def __init__(self, runner,
817                  tool, loadingContext, enable_reuse,
818                  output_name, output_tags, submit_runner_ram=0,
819                  name=None, on_error=None, submit_runner_image=None,
820                  intermediate_output_ttl=0, merged_map=None,
821                  priority=None, secret_store=None,
822                  collection_cache_size=256,
823                  collection_cache_is_default=True,
824                  git_info=None,
825                  reuse_runner=False):
826
827         self.loadingContext = loadingContext.copy()
828
829         super(Runner, self).__init__(tool.tool, loadingContext)
830
831         self.arvrunner = runner
832         self.embedded_tool = tool
833         self.job_order = None
834         self.running = False
835         if enable_reuse:
836             # If reuse is permitted by command line arguments but
837             # disabled by the workflow itself, disable it.
838             reuse_req, _ = self.embedded_tool.get_requirement("http://arvados.org/cwl#ReuseRequirement")
839             if reuse_req:
840                 enable_reuse = reuse_req["enableReuse"]
841             reuse_req, _ = self.embedded_tool.get_requirement("WorkReuse")
842             if reuse_req:
843                 enable_reuse = reuse_req["enableReuse"]
844         self.enable_reuse = enable_reuse
845         self.uuid = None
846         self.final_output = None
847         self.output_name = output_name
848         self.output_tags = output_tags
849         self.name = name
850         self.on_error = on_error
851         self.jobs_image = submit_runner_image or "arvados/jobs:"+__version__
852         self.intermediate_output_ttl = intermediate_output_ttl
853         self.priority = priority
854         self.secret_store = secret_store
855         self.enable_dev = self.loadingContext.enable_dev
856         self.git_info = git_info
857         self.fast_parser = self.loadingContext.fast_parser
858         self.reuse_runner = reuse_runner
859
860         self.submit_runner_cores = 1
861         self.submit_runner_ram = 1024  # default 1 GiB
862         self.collection_cache_size = collection_cache_size
863
864         runner_resource_req, _ = self.embedded_tool.get_requirement("http://arvados.org/cwl#WorkflowRunnerResources")
865         if runner_resource_req:
866             if runner_resource_req.get("coresMin"):
867                 self.submit_runner_cores = runner_resource_req["coresMin"]
868             if runner_resource_req.get("ramMin"):
869                 self.submit_runner_ram = runner_resource_req["ramMin"]
870             if runner_resource_req.get("keep_cache") and collection_cache_is_default:
871                 self.collection_cache_size = runner_resource_req["keep_cache"]
872
873         if submit_runner_ram:
874             # Command line / initializer overrides default and/or spec from workflow
875             self.submit_runner_ram = submit_runner_ram
876
877         if self.submit_runner_ram <= 0:
878             raise Exception("Value of submit-runner-ram must be greater than zero")
879
880         if self.submit_runner_cores <= 0:
881             raise Exception("Value of submit-runner-cores must be greater than zero")
882
883         self.merged_map = merged_map or {}
884
885     def job(self,
886             job_order,         # type: Mapping[Text, Text]
887             output_callbacks,  # type: Callable[[Any, Any], Any]
888             runtimeContext     # type: RuntimeContext
889            ):  # type: (...) -> Generator[Any, None, None]
890         self.job_order = job_order
891         self._init_job(job_order, runtimeContext)
892         yield self
893
894     def update_pipeline_component(self, record):
895         pass
896
897     def done(self, record):
898         """Base method for handling a completed runner."""
899
900         try:
901             if record["state"] == "Complete":
902                 if record.get("exit_code") is not None:
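                    # Exit code 33 is the cwltool/arvados-cwl-runner convention
                    # for "workflow failed with UnsupportedRequirement".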
903                     if record["exit_code"] == 33:
904                         processStatus = "UnsupportedRequirement"
905                     elif record["exit_code"] == 0:
906                         processStatus = "success"
907                     else:
908                         processStatus = "permanentFail"
909                 else:
910                     processStatus = "success"
911             else:
912                 processStatus = "permanentFail"
913
914             outputs = {}
915
916             if processStatus == "permanentFail":
917                 logc = arvados.collection.CollectionReader(record["log"],
918                                                            api_client=self.arvrunner.api,
919                                                            keep_client=self.arvrunner.keep_client,
920                                                            num_retries=self.arvrunner.num_retries)
921                 done.logtail(logc, logger.error, "%s (%s) error log:" % (self.arvrunner.label(self), record["uuid"]), maxlen=40,
922                              include_crunchrun=(record.get("exit_code") is None or record.get("exit_code") > 127))
923
924             self.final_output = record["output"]
925             outc = arvados.collection.CollectionReader(self.final_output,
926                                                        api_client=self.arvrunner.api,
927                                                        keep_client=self.arvrunner.keep_client,
928                                                        num_retries=self.arvrunner.num_retries)
929             if "cwl.output.json" in outc:
930                 with outc.open("cwl.output.json", "rb") as f:
931                     if f.size() > 0:
932                         outputs = json.loads(f.read().decode())
933             def keepify(fileobj):
934                 path = fileobj["location"]
935                 if not path.startswith("keep:"):
936                     fileobj["location"] = "keep:%s/%s" % (record["output"], path)
937             adjustFileObjs(outputs, keepify)
938             adjustDirObjs(outputs, keepify)
939         except Exception:
940             logger.exception("[%s] While getting final output object", self.name)
941             self.arvrunner.output_callback({}, "permanentFail")
942         else:
943             self.arvrunner.output_callback(outputs, processStatus)
944
945
946 def print_keep_deps_visitor(api, runtimeContext, references, doc_loader, tool):
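    """Add every Keep reference mentioned by one tool document (file locations, Docker images, acrContainerImage) to the "references" set."""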
947     def collect_locators(obj):
948         loc = obj.get("location", "")
949
950         g = arvados.util.keepuri_pattern.match(loc)
951         if g:
952             references.add(g[1])
953
954         if obj.get("class") == "http://arvados.org/cwl#WorkflowRunnerResources" and "acrContainerImage" in obj:
955             references.add(obj["acrContainerImage"])
956
957         if obj.get("class") == "DockerRequirement":
958             references.add(arvados_cwl.arvdocker.arv_docker_get_image(api, obj, False, runtimeContext))
959
960     sc_result = scandeps(tool["id"], tool,
961                          set(),
962                          set(("location", "id")),
963                          None, urljoin=doc_loader.fetcher.urljoin,
964                          nestdirs=False)
965
966     visit_class(sc_result, ("File", "Directory"), collect_locators)
967     visit_class(tool, ("DockerRequirement", "http://arvados.org/cwl#WorkflowRunnerResources"), collect_locators)
968
969
970 def print_keep_deps(arvRunner, runtimeContext, merged_map, tool):
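    """Write a sorted JSON list of the Keep collections (data and Docker images) this workflow depends on to arvRunner.stdout."""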
971     references = set()
972
973     tool.visit(partial(print_keep_deps_visitor, arvRunner.api, runtimeContext, references, tool.doc_loader))
974
975     for mm in merged_map:
976         for k, v in merged_map[mm].resolved.items():
977             g = arvados.util.keepuri_pattern.match(v)
978             if g:
979                 references.add(g[1])
980
981     json.dump(sorted(references), arvRunner.stdout)
982     print(file=arvRunner.stdout)