Merge branch '19712-remove-perl'
[arvados.git] / sdk / cwl / arvados_cwl / runner.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 from future import standard_library
6 standard_library.install_aliases()
7 from future.utils import  viewvalues, viewitems
8 from past.builtins import basestring
9
10 import os
11 import sys
12 import re
13 import urllib.parse
14 from functools import partial
15 import logging
16 import json
17 import copy
18 from collections import namedtuple
19 from io import StringIO
20 from typing import (
21     Any,
22     Callable,
23     Dict,
24     Iterable,
25     Iterator,
26     List,
27     Mapping,
28     MutableMapping,
29     Sequence,
30     MutableSequence,
31     Optional,
32     Set,
33     Sized,
34     Tuple,
35     Type,
36     Union,
37     cast,
38 )
39 from cwltool.utils import (
40     CWLObjectType,
41     CWLOutputAtomType,
42     CWLOutputType,
43 )
44
45 if os.name == "posix" and sys.version_info[0] < 3:
46     import subprocess32 as subprocess
47 else:
48     import subprocess
49
50 from schema_salad.sourceline import SourceLine, cmap
51
52 from cwltool.command_line_tool import CommandLineTool
53 import cwltool.workflow
54 from cwltool.process import (scandeps, UnsupportedRequirement, normalizeFilesDirs,
55                              shortname, Process, fill_in_defaults)
56 from cwltool.load_tool import fetch_document, jobloaderctx
57 from cwltool.utils import aslist, adjustFileObjs, adjustDirObjs, visit_class
58 from cwltool.builder import substitute
59 from cwltool.pack import pack
60 from cwltool.update import INTERNAL_VERSION
61 from cwltool.builder import Builder
62 import schema_salad.validate as validate
63 import schema_salad.ref_resolver
64
65 import arvados.collection
66 import arvados.util
67 from .util import collectionUUID
68 from ruamel.yaml import YAML
69 from ruamel.yaml.comments import CommentedMap, CommentedSeq
70
71 import arvados_cwl.arvdocker
72 from .pathmapper import ArvPathMapper, trim_listing, collection_pdh_pattern, collection_uuid_pattern
73 from ._version import __version__
74 from . import done
75 from . context import ArvRuntimeContext
76 from .perf import Perf
77
# Module-wide loggers: 'logger' carries general runner messages, while
# 'metrics' is the logger handed to the Perf blocks in this file.
logger = logging.getLogger("arvados.cwl-runner")
metrics = logging.getLogger("arvados.cwl-runner.metrics")
80
def trim_anonymous_location(obj):
    """Drop the 'location' of a File or Directory literal.

    Literals are given a random identifier (prefixed with '_:') as their
    'location' to simplify internal bookkeeping.  Writing that identifier
    back out can break reproducibility, and a literal is valid without a
    'location', so the field is deleted here.  Objects whose location does
    not start with '_:' (or that have no location) are left untouched.

    The object is modified in place; nothing is returned.
    """

    location = obj.get("location", "")
    if not location.startswith("_:"):
        return
    del obj["location"]

93
94
def remove_redundant_fields(obj):
    """Delete the 'path', 'nameext', 'nameroot' and 'dirname' keys from obj.

    Keys that are absent are ignored.  The object is modified in place.
    """
    present = [key for key in ("path", "nameext", "nameroot", "dirname") if key in obj]
    for key in present:
        del obj[key]
99
100
def find_defaults(d, op):
    """Call op on every dict inside d that has a "default" key.

    Lists and dicts are walked recursively.  A dict that holds "default" is
    passed to op as-is and its contents are not searched any further.
    Anything that is neither a list nor a dict is ignored.
    """
    if isinstance(d, list):
        children = d
    elif isinstance(d, dict):
        if "default" in d:
            op(d)
            return
        children = viewvalues(d)
    else:
        return

    for child in children:
        find_defaults(child, op)
111
def make_builder(joborder, hints, requirements, runtimeContext, metadata):
    """Create a cwltool Builder for the given job order.

    Only the job order, hints, requirements, a handful of runtimeContext
    settings (job_script_provider, eval_timeout, debug, js_console,
    force_docker_pull) and the CWL version are filled in; the remaining
    Builder arguments get empty or None placeholders.

    The CWL version is taken from metadata, preferring the
    "http://commonwl.org/cwltool#original_cwlVersion" entry and falling
    back to "cwlVersion".
    """
    builder_args = {
        "job": joborder,
        "files": [],
        "bindings": [],
        "schemaDefs": {},
        "names": None,
        "requirements": requirements,
        "hints": hints,
        "resources": {},
        "mutation_manager": None,
        "formatgraph": None,
        "make_fs_access": None,
        "fs_access": None,
        "job_script_provider": runtimeContext.job_script_provider,
        "timeout": runtimeContext.eval_timeout,
        "debug": runtimeContext.debug,
        "js_console": runtimeContext.js_console,
        "force_docker_pull": runtimeContext.force_docker_pull,
        "loadListing": "",
        "outdir": "",
        "tmpdir": "",
        "stagedir": "",
        "cwlVersion": (metadata.get("http://commonwl.org/cwltool#original_cwlVersion")
                       or metadata.get("cwlVersion")),
        "container_engine": "docker",
    }
    return Builder(**builder_args)
138
def search_schemadef(name, reqs):
    """Look up a type definition by name among SchemaDefRequirement entries.

    reqs is iterated once, in order.  For each entry whose "class" is
    "SchemaDefRequirement", its "types" list is searched for an item whose
    "name" equals name.  The first match is returned; None is returned when
    nothing matches.
    """
    for req in reqs:
        if req["class"] != "SchemaDefRequirement":
            continue
        match = next((typedef for typedef in req["types"] if typedef["name"] == name), None)
        if match is not None:
            return match
    return None
146
# Type names that set_secondary does not attempt to resolve any further.
primitive_types_set = frozenset([
    "null", "boolean", "int", "long",
    "float", "double", "string", "record",
    "array", "enum",
])
150
def set_secondary(fsaccess, builder, inputschema, secondaryspec, primary, discovered):
    """Fill in 'secondaryFiles' on File values according to the input schema.

    Walks inputschema and the value 'primary' in parallel.  When it reaches
    a File value that has no 'secondaryFiles' of its own and a secondaryFiles
    spec applies, the spec is evaluated, each resulting file is checked with
    fsaccess.exists(), and the files found are stored on the File object.

    Parameters:
      fsaccess -- object whose exists(location) is used to test for files.
      builder -- supplies hints, requirements, cwlVersion and do_eval().
      inputschema -- a type name, a list of alternatives (union), or a
        mapping with a "type" entry (and possibly "secondaryFiles",
        "fields" or "items").
      secondaryspec -- secondaryFiles spec inherited from an enclosing
        schema, or None.
      primary -- the input value matching inputschema; modified in place.
      discovered -- optional dict; when not None, maps the location of each
        processed File to the secondaryFiles assigned to it.

    Raises a validation error (via SourceLine.makeError) when a spec has an
    unsupported form, a literal File spec has no 'location', or a required
    secondary file does not exist.
    """
    if isinstance(inputschema, Sequence) and not isinstance(inputschema, basestring):
        # union type, collect all possible secondaryFiles
        for i in inputschema:
            set_secondary(fsaccess, builder, i, secondaryspec, primary, discovered)
        return

    if inputschema == "File":
        inputschema = {"type": "File"}

    if isinstance(inputschema, basestring):
        # A bare type name: only continue if it names a schema definition.
        sd = search_schemadef(inputschema, reversed(builder.hints+builder.requirements))
        if sd:
            inputschema = sd
        else:
            return

    if "secondaryFiles" in inputschema:
        # set secondaryFiles, may be inherited by compound types.
        secondaryspec = inputschema["secondaryFiles"]

    if (isinstance(inputschema["type"], (Mapping, Sequence)) and
        not isinstance(inputschema["type"], basestring)):
        # compound type (union, array, record)
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec, primary, discovered)

    elif (inputschema["type"] == "record" and
          isinstance(primary, Mapping)):
        #
        # record type, find secondary files associated with fields.
        #
        for f in inputschema["fields"]:
            p = primary.get(shortname(f["name"]))
            if p:
                set_secondary(fsaccess, builder, f, secondaryspec, p, discovered)

    elif (inputschema["type"] == "array" and
          isinstance(primary, Sequence)):
        #
        # array type, find secondary files of elements
        #
        for p in primary:
            set_secondary(fsaccess, builder, {"type": inputschema["items"]}, secondaryspec, p, discovered)

    elif (inputschema["type"] == "File" and
          isinstance(primary, Mapping) and
          primary.get("class") == "File"):

        if "secondaryFiles" in primary or not secondaryspec:
            # Nothing to do.
            return

        #
        # Found a file, check for secondaryFiles
        #
        specs = []
        # Stored temporarily so SourceLine can point at the spec in error
        # messages; replaced with the files actually found further down.
        primary["secondaryFiles"] = secondaryspec
        for i, sf in enumerate(aslist(secondaryspec)):
            if builder.cwlVersion == "v1.0":
                pattern = sf
            else:
                pattern = sf["pattern"]
            if pattern is None:
                continue
            if isinstance(pattern, list):
                specs.extend(pattern)
            elif isinstance(pattern, dict):
                specs.append(pattern)
            elif isinstance(pattern, str):
                if builder.cwlVersion == "v1.0":
                    specs.append({"pattern": pattern, "required": True})
                else:
                    specs.append({"pattern": pattern, "required": sf.get("required")})
            else:
                raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
                    "Expression must return list, object, string or null")

        found = []
        for i, sf in enumerate(specs):
            # Reset per-spec state so that nothing leaks in from the
            # previous iteration (and so the names are always bound).
            sfname = None
            sfpath = None

            if isinstance(sf, dict):
                if sf.get("class") == "File":
                    pattern = None
                    if sf.get("location") is None:
                        raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
                            "File object is missing 'location': %s" % sf)
                    sfpath = sf["location"]
                    # A literal File has no pattern to evaluate; use its
                    # location as the name so that the existence check
                    # below runs and the object itself is recorded.
                    sfname = sfpath
                    required = True
                else:
                    pattern = sf["pattern"]
                    required = sf.get("required")
            elif isinstance(sf, str):
                pattern = sf
                required = True
            else:
                raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
                    "Expression must return list, object, string or null")

            if pattern is not None:
                if "${" in pattern or "$(" in pattern:
                    sfname = builder.do_eval(pattern, context=primary)
                else:
                    sfname = substitute(primary["basename"], pattern)

                if sfname is None:
                    continue

                if isinstance(sfname, str):
                    p_location = primary["location"]
                    if "/" in p_location:
                        # Sibling of the primary file: same prefix up to
                        # and including the last "/".
                        sfpath = (
                            p_location[0 : p_location.rindex("/") + 1]
                            + sfname
                        )
                    else:
                        # No directory part to reuse; append the name to
                        # the location so sfpath is always defined.
                        sfpath = p_location + sfname

            required = builder.do_eval(required, context=primary)

            if isinstance(sfname, list) or isinstance(sfname, dict):
                # The expression returned File object(s) directly.
                each = aslist(sfname)
                for e in each:
                    if required and not fsaccess.exists(e.get("location")):
                        raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
                            "Required secondary file '%s' does not exist" % e.get("location"))
                found.extend(each)

            if isinstance(sfname, str):
                if fsaccess.exists(sfpath):
                    if pattern is not None:
                        found.append({"location": sfpath, "class": "File"})
                    else:
                        # Literal File spec: keep the object as given.
                        found.append(sf)
                elif required:
                    raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
                        "Required secondary file '%s' does not exist" % sfpath)

        primary["secondaryFiles"] = cmap(found)
        if discovered is not None:
            discovered[primary["location"]] = primary["secondaryFiles"]
    elif inputschema["type"] not in primitive_types_set and inputschema["type"] not in ("File", "Directory"):
        # Anything else is treated as a named type and looked up again.
        set_secondary(fsaccess, builder, inputschema["type"], secondaryspec, primary, discovered)
290
def discover_secondary_files(fsaccess, builder, inputs, job_order, discovered=None):
    """Run set_secondary over every input that has a compound value.

    For each schema in inputs, the value is looked up in job_order under
    the short name of the schema's "id".  Values that are a Mapping or a
    Sequence are passed to set_secondary (with no inherited secondaryFiles
    spec); all other values, including missing ones, are skipped.
    """
    for schema in inputs:
        value = job_order.get(shortname(schema["id"]))
        if not isinstance(value, (Mapping, Sequence)):
            continue
        set_secondary(fsaccess, builder, schema, None, value, discovered)
296
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run, runtimeContext,
                        include_primary=True, discovered_secondaryfiles=None,
                        cache=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field (not starting with "_:") and a cache is
    supplied that has not seen that document yet, the document is reloaded
    so that the raw text, prior to preprocessing, is what gets scanned.

    Parameters:
      arvrunner -- provides the API client (api), num_retries and fs_access.
      name -- name passed on to ArvPathMapper.
      document_loader -- loader used to fetch and resolve documents.
      workflowobj -- document to scan; updated in place.
      uri -- base URI of workflowobj.
      loadref_run -- when true, "run" references are followed in addition
        to "$import".
      runtimeContext -- copy_deps and project_uuid are read from it.
      include_primary -- whether the document named by workflowobj["id"]
        is itself forced into (True) or out of (False) the upload set.
      discovered_secondaryfiles -- optional dict that receives the
        secondaryFiles discovered for input defaults, keyed by resolved
        location.
      cache -- optional dict of already loaded raw documents, keyed by
        defragmented URL.
    """

    loaded = set()
    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urllib.parse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            if cache is not None and defrg in cache:
                return cache[defrg]
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            yamlloader = YAML(typ='safe', pure=True)
            result = yamlloader.load(textIO)
            if cache is not None:
                cache[defrg] = result
            return result
        else:
            # Already loaded during this call; nothing new to scan.
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj and not workflowobj["id"].startswith("_:"):
        defrg, _ = urllib.parse.urldefrag(workflowobj["id"])
        if cache is not None and defrg not in cache:
            # if we haven't seen this file before, want raw file
            # content (before preprocessing) to ensure that external
            # references like $include haven't already been inlined.
            scanobj = loadref("", workflowobj["id"])

    metadata = scanobj

    with Perf(metrics, "scandeps include, location"):
        sc_result = scandeps(uri, scanobj,
                             loadref_fields,
                             set(("$include", "location")),
                             loadref, urljoin=document_loader.fetcher.urljoin,
                             nestdirs=False)

    with Perf(metrics, "scandeps $schemas"):
        optional_deps = scandeps(uri, scanobj,
                                      loadref_fields,
                                      set(("$schemas",)),
                                      loadref, urljoin=document_loader.fetcher.urljoin,
                                      nestdirs=False)

    if sc_result is None:
        sc_result = []

    if optional_deps is None:
        optional_deps = []

    if optional_deps:
        sc_result.extend(optional_deps)

    sc = []
    uuids = {}

    def collect_uuids(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if sp[0] == "keep":
            # Collect collection uuids that need to be resolved to
            # portable data hashes
            gp = collection_uuid_pattern.match(loc)
            if gp:
                uuids[gp.groups()[0]] = obj
            if collectionUUID in obj:
                uuids[obj[collectionUUID]] = obj

    def collect_uploads(obj):
        loc = obj.get("location", "")
        # str.split always yields at least one element, so sp[0] is safe.
        sp = loc.split(":")
        if sp[0] in ("file", "http", "https"):
            # Record local files than need to be uploaded,
            # don't include file literals, keep references, etc.
            sc.append(obj)
        collect_uuids(obj)

    with Perf(metrics, "collect uuids"):
        visit_class(workflowobj, ("File", "Directory"), collect_uuids)

    with Perf(metrics, "collect uploads"):
        visit_class(sc_result, ("File", "Directory"), collect_uploads)

    # Resolve any collection uuids we found to portable data hashes
    # and assign them to uuid_map
    uuid_map = {}
    fetch_uuids = list(uuids.keys())
    with Perf(metrics, "fetch_uuids"):
        while fetch_uuids:
            # For a large number of fetch_uuids, API server may limit
            # response size, so keep fetching until API server has
            # nothing more to give us.
            lookups = arvrunner.api.collections().list(
                filters=[["uuid", "in", fetch_uuids]],
                count="none",
                select=["uuid", "portable_data_hash"]).execute(
                    num_retries=arvrunner.num_retries)

            if not lookups["items"]:
                break

            for item in lookups["items"]:
                uuid_map[item["uuid"]] = item["portable_data_hash"]

            fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]

    normalizeFilesDirs(sc)

    if "id" in workflowobj:
        defrg, _ = urllib.parse.urldefrag(workflowobj["id"])
        if include_primary:
            # make sure it's included
            sc.append({"class": "File", "location": defrg})
        else:
            # make sure it's excluded
            sc = [d for d in sc if d.get("location") != defrg]

    def visit_default(obj):
        def defaults_are_optional(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            normalizeFilesDirs(f)
            optional_deps.append(f)
        visit_class(obj["default"], ("File", "Directory"), defaults_are_optional)

    find_defaults(workflowobj, visit_default)

    discovered = {}
    def discover_default_secondary_files(obj):
        builder_job_order = {}
        for t in obj["inputs"]:
            builder_job_order[shortname(t["id"])] = t["default"] if "default" in t else None
        # Need to create a builder object to evaluate expressions.
        builder = make_builder(builder_job_order,
                               obj.get("hints", []),
                               obj.get("requirements", []),
                               ArvRuntimeContext(),
                               metadata)
        discover_secondary_files(arvrunner.fs_access,
                                 builder,
                                 obj["inputs"],
                                 builder_job_order,
                                 discovered)

    # Work on a resolved deep copy so that secondaryFiles discovery does
    # not modify workflowobj itself.
    copied, _ = document_loader.resolve_all(copy.deepcopy(cmap(workflowobj)), base_url=uri, checklinks=False)
    visit_class(copied, ("CommandLineTool", "Workflow"), discover_default_secondary_files)

    for d in list(discovered):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if d.startswith("file:"):
            sc.extend(discovered[d])
        else:
            del discovered[d]

    with Perf(metrics, "mapper"):
        mapper = ArvPathMapper(arvrunner, sc, "",
                               "keep:%s",
                               "keep:%s/%s",
                               name=name,
                               single_collection=True,
                               optional_deps=optional_deps)

    keeprefs = set()
    def addkeepref(k):
        if k.startswith("keep:"):
            gp = collection_pdh_pattern.match(k)
            # A keep: reference that is not in portable data hash form
            # cannot be looked up by PDH further down, so skip it rather
            # than fail on the missing match.
            if gp:
                keeprefs.add(gp.group(1))

    def setloc(p):
        loc = p.get("location")
        if loc and (not loc.startswith("_:")) and (not loc.startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved
            addkeepref(p["location"])
            return

        if not loc:
            return

        if collectionUUID in p:
            uuid = p[collectionUUID]
            if uuid not in uuid_map:
                raise SourceLine(p, collectionUUID, validate.ValidationException).makeError(
                    "Collection uuid %s not found" % uuid)
            gp = collection_pdh_pattern.match(loc)
            if gp and uuid_map[uuid] != gp.groups()[0]:
                # This file entry has both collectionUUID and a PDH
                # location. If the PDH doesn't match the one returned
                # the API server, raise an error.
                raise SourceLine(p, "location", validate.ValidationException).makeError(
                    "Expected collection uuid %s to be %s but API server reported %s" % (
                        uuid, gp.groups()[0], uuid_map[p[collectionUUID]]))

        gp = collection_uuid_pattern.match(loc)
        if not gp:
            # Not a uuid pattern (must be a pdh pattern)
            addkeepref(p["location"])
            return

        uuid = gp.groups()[0]
        if uuid not in uuid_map:
            raise SourceLine(p, "location", validate.ValidationException).makeError(
                "Collection uuid %s not found" % uuid)
        p["location"] = "keep:%s%s" % (uuid_map[uuid], gp.groups()[1] if gp.groups()[1] else "")
        p[collectionUUID] = uuid

    with Perf(metrics, "setloc"):
        visit_class(workflowobj, ("File", "Directory"), setloc)
        visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]

    if runtimeContext.copy_deps and keeprefs:
        # Find referenced collections and copy them into the
        # destination project, for easy sharing.
        already_present = list(arvados.util.keyset_list_all(arvrunner.api.collections().list,
                                     filters=[["portable_data_hash", "in", list(keeprefs)],
                                              ["owner_uuid", "=", runtimeContext.project_uuid]],
                                     select=["uuid", "portable_data_hash", "created_at"]))

        keeprefs = keeprefs - set(a["portable_data_hash"] for a in already_present)
        for kr in keeprefs:
            col = arvrunner.api.collections().list(filters=[["portable_data_hash", "=", kr]],
                                                   order="created_at desc",
                                                   select=["name", "description", "properties", "portable_data_hash", "manifest_text", "storage_classes_desired", "trash_at"],
                                                   limit=1).execute(num_retries=arvrunner.num_retries)
            if len(col["items"]) == 0:
                logger.warning("Cannot find collection with portable data hash %s", kr)
                continue
            col = col["items"][0]
            try:
                arvrunner.api.collections().create(body={"collection": {
                    "owner_uuid": runtimeContext.project_uuid,
                    "name": col["name"],
                    "description": col["description"],
                    "properties": col["properties"],
                    "portable_data_hash": col["portable_data_hash"],
                    "manifest_text": col["manifest_text"],
                    "storage_classes_desired": col["storage_classes_desired"],
                    "trash_at": col["trash_at"]
                }}, ensure_unique_name=True).execute(num_retries=arvrunner.num_retries)
            except Exception as e:
                # Copying is best effort; a failure is logged and the
                # remaining collections are still attempted.
                logger.warning("Unable copy collection to destination: %s", e)

    if "$schemas" in workflowobj:
        # Keep only the schemas known to the mapper, rewritten to their
        # resolved locations.
        sch = CommentedSeq()
        for s in workflowobj["$schemas"]:
            if s in mapper:
                sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper

577
578
def upload_docker(arvrunner, tool, runtimeContext):
    """Uploads Docker images used in CommandLineTool objects.

    For a CommandLineTool, the image named by its DockerRequirement is
    fetched through arv_docker_get_image; a tool without one falls back to
    the "arvados/jobs" image tagged with this package's version.  For a
    Workflow, every step's embedded tool is processed recursively.  Other
    kinds of tool are ignored.

    Raises UnsupportedRequirement (via SourceLine.makeError) when the
    requirement sets 'dockerOutputDirectory' and arvrunner.work_api is not
    "containers".
    """

    if isinstance(tool, CommandLineTool):
        docker_req, _ = tool.get_requirement("DockerRequirement")
        if docker_req:
            if docker_req.get("dockerOutputDirectory") and arvrunner.work_api != "containers":
                raise SourceLine(docker_req, "dockerOutputDirectory", UnsupportedRequirement).makeError(
                    "Option 'dockerOutputDirectory' of DockerRequirement not supported.")
            image_req = docker_req
        else:
            image_req = {"dockerPull": "arvados/jobs:"+__version__}

        arvados_cwl.arvdocker.arv_docker_get_image(arvrunner.api, image_req, True,
                                                   runtimeContext.project_uuid,
                                                   runtimeContext.force_docker_pull,
                                                   runtimeContext.tmp_outdir_prefix,
                                                   runtimeContext.match_local_docker,
                                                   runtimeContext.copy_deps)
        return

    if isinstance(tool, cwltool.workflow.Workflow):
        for step in tool.steps:
            upload_docker(arvrunner, step.embedded_tool, runtimeContext)

606
607
def packed_workflow(arvrunner, tool, merged_map, runtimeContext, git_info):
    """Create a packed workflow.

    A "packed" workflow is one where all the components have been combined
    into a single document.

    After packing, the document is walked to:
      * turn a bare 'path' into 'location',
      * apply the location and secondaryFiles updates recorded in
        merged_map for the process the node belongs to,
      * annotate every DockerRequirement with the value returned by
        arv_docker_get_image.
    Finally the entries of git_info, if any, are copied onto the packed
    document, which is returned.
    """

    process_classes = ("CommandLineTool", "Workflow", "ExpressionTool")

    rewrites = {}
    packed = pack(arvrunner.loadingContext, tool.tool["id"],
                  rewrite_out=rewrites,
                  loader=tool.doc_loader)

    # pack() reports original id -> rewritten id; invert it so that the
    # original id (the key used by merged_map) can be recovered.
    rewrite_to_orig = {new_id: orig_id for orig_id, new_id in viewitems(rewrites)}

    def visit(node, cur_id):
        if isinstance(node, list):
            for item in node:
                visit(item, cur_id)
            return
        if not isinstance(node, dict):
            return

        if node.get("class") in process_classes:
            if tool.metadata["cwlVersion"] == "v1.0" and "id" not in node:
                raise SourceLine(node, None, Exception).makeError("Embedded process object is missing required 'id' field, add an 'id' or use to cwlVersion: v1.1")
            if "id" in node:
                cur_id = rewrite_to_orig.get(node["id"], node["id"])

        if "path" in node and "location" not in node:
            node["location"] = node["path"]
            del node["path"]

        if "location" in node and cur_id in merged_map:
            updates = merged_map[cur_id]
            if node["location"] in updates.resolved:
                node["location"] = updates.resolved[node["location"]]
            # Looked up with the (possibly just rewritten) location.
            if node["location"] in updates.secondaryFiles:
                node["secondaryFiles"] = updates.secondaryFiles[node["location"]]

        if node.get("class") == "DockerRequirement":
            node["http://arvados.org/cwl#dockerCollectionPDH"] = arvados_cwl.arvdocker.arv_docker_get_image(
                arvrunner.api, node, True,
                runtimeContext.project_uuid,
                runtimeContext.force_docker_pull,
                runtimeContext.tmp_outdir_prefix,
                runtimeContext.match_local_docker,
                runtimeContext.copy_deps)

        for key in node:
            visit(node[key], cur_id)

    visit(packed, None)

    if not git_info:
        return packed

    for key in git_info:
        packed[key] = git_info[key]

    return packed

654
655
def tag_git_version(packed, tool=None):
    """Record the git commit of the workflow's directory on a packed document.

    Parameters:
      packed -- mapping to annotate; modified in place.
      tool -- object whose tool["id"] gives the workflow location.  It is
        optional so that existing one-argument calls keep working; without
        it there is nothing to inspect and packed is left unchanged.

    When the id starts with "file://", 'git log' is run in the directory
    containing the file and the resulting commit hash is stored under
    "http://schema.org/version".  If git cannot be run or exits with an
    error, packed is left unchanged.
    """
    if tool is None:
        return

    if tool.tool["id"].startswith("file://"):
        # Strip the "file://" prefix to get a filesystem path.
        path = os.path.dirname(tool.tool["id"][7:])
        try:
            githash = subprocess.check_output(['git', 'log', '--first-parent', '--max-count=1', '--format=%H'], stderr=subprocess.STDOUT, cwd=path).strip()
        except (OSError, subprocess.CalledProcessError):
            # Best effort: not a git checkout, or git is unavailable.
            pass
        else:
            # check_output returns bytes unless text mode is requested;
            # store the hash as text.
            if isinstance(githash, bytes):
                githash = githash.decode("utf-8")
            packed["http://schema.org/version"] = githash

665
666
def upload_job_order(arvrunner, name, tool, job_order, runtimeContext):
    """Upload local files referenced by the input object.

    Secondary files are discovered for the inputs first, then
    upload_dependencies is run over job_order.  The same job_order object,
    with its "id" and "job_order" entries removed, is returned.
    """

    # Defaults are filled into a shallow copy, which is what the builder
    # sees; job_order itself only receives the secondaryFiles.
    builder_job_order = copy.copy(job_order)

    # fill_in_defaults throws an error if there are any missing required
    # parameters; that is not wanted here, so every input is made optional
    # on a private copy of the input schemas.
    optional_inputs = copy.deepcopy(tool.tool["inputs"])
    for inp in optional_inputs:
        if "null" in inp["type"]:
            continue
        inp["type"] = ["null"] + aslist(inp["type"])

    fill_in_defaults(optional_inputs, builder_job_order, arvrunner.fs_access)

    # A builder object is needed to evaluate expressions.
    builder = make_builder(builder_job_order,
                           tool.hints,
                           tool.requirements,
                           ArvRuntimeContext(),
                           tool.metadata)

    # Now update job_order with secondaryFiles
    discover_secondary_files(arvrunner.fs_access,
                             builder,
                             tool.tool["inputs"],
                             job_order)

    jobloader = schema_salad.ref_resolver.Loader(
        jobloaderctx.copy(),
        fetcher_constructor=tool.doc_loader.fetcher_constructor)

    # Called for its effect on job_order; the returned mapper is not used.
    upload_dependencies(arvrunner,
                        name,
                        jobloader,
                        job_order,
                        job_order.get("id", "#"),
                        False,
                        runtimeContext)

    # "job_order" needs to be filtered out as well: it gets added by
    # cwltool when providing parameters on the command line.
    for key in ("id", "job_order"):
        if key in job_order:
            del job_order[key]

    return job_order

718
# Per-tool record of file reference updates, as built by upload_workflow_deps:
# ``resolved`` is a dict taken from the ``resolved`` attribute of each entry
# returned by upload_dependencies, and ``secondaryFiles`` is the dict of
# secondary files discovered while uploading that tool's dependencies.
FileUpdates = namedtuple("FileUpdates", ["resolved", "secondaryFiles"])
720
def upload_workflow_deps(arvrunner, tool, runtimeContext):
    """Upload Docker images and file dependencies for a workflow.

    Every node visited by ``tool.visit`` that has an ``id`` gets its
    dependencies uploaded, in the reverse of visiting order.

    :returns: dict mapping each such node's ``id`` to a ``FileUpdates``
        tuple for that node.
    """
    # Ensure that Docker images needed by this workflow are available
    with Perf(metrics, "upload_docker"):
        upload_docker(arvrunner, tool, runtimeContext)

    document_loader = tool.doc_loader
    tool_dep_cache = {}
    merged_map = {}

    # Standard traversal is top down, we want to go bottom up, so use
    # the visitor to accumulate a list of nodes to visit, then
    # visit them in reverse order.
    pending = []

    def collect_identified(node):
        if "id" in node:
            pending.append(node)

    tool.visit(collect_identified)

    for deptool in reversed(pending):
        short_id = shortname(deptool["id"])
        discovered_secondaryfiles = {}
        with Perf(metrics, "upload_dependencies %s" % short_id):
            pm = upload_dependencies(arvrunner,
                                     "%s dependencies" % (short_id),
                                     document_loader,
                                     deptool,
                                     deptool["id"],
                                     False,
                                     runtimeContext,
                                     include_primary=False,
                                     discovered_secondaryfiles=discovered_secondaryfiles,
                                     cache=tool_dep_cache)
        # Re-read the id only after the upload, as the original code did.
        tool_id = deptool["id"]
        document_loader.idx[tool_id] = deptool
        resolved_by_key = {key: entry.resolved for key, entry in pm.items()}
        merged_map[tool_id] = FileUpdates(resolved_by_key, discovered_secondaryfiles)

    return merged_map
763
def arvados_jobs_image(arvrunner, img, runtimeContext):
    """Determine if the right arvados/jobs image version is available.  If not, try to pull and upload it.

    :param arvrunner: runner object supplying the ``api`` client.
    :param img: Docker image name, used as the ``dockerPull`` value.
    :param runtimeContext: supplies ``project_uuid``, ``force_docker_pull``,
        ``tmp_outdir_prefix``, ``match_local_docker`` and ``copy_deps``.
    :returns: whatever ``arv_docker_get_image`` returns for the image.
    :raises Exception: if looking up the image fails for any reason; the
        message names the image and includes the original error text.
    """
    try:
        docker_requirement = {"dockerPull": img}
        return arvados_cwl.arvdocker.arv_docker_get_image(
            arvrunner.api,
            docker_requirement,
            True,
            runtimeContext.project_uuid,
            runtimeContext.force_docker_pull,
            runtimeContext.tmp_outdir_prefix,
            runtimeContext.match_local_docker,
            runtimeContext.copy_deps)
    except Exception as e:
        message = "Docker image %s is not available\n%s" % (img, e)
        raise Exception(message)
777
778
def upload_workflow_collection(arvrunner, name, packed, runtimeContext):
    """Store a packed workflow as ``workflow.cwl`` in a collection.

    The collection is only saved when no existing collection has the same
    portable data hash and a name starting with ``name`` (restricted to
    ``runtimeContext.project_uuid`` when that is set).

    :returns: the portable data hash of the workflow collection.
    """
    collection = arvados.collection.Collection(api_client=arvrunner.api,
                                               keep_client=arvrunner.keep_client,
                                               num_retries=arvrunner.num_retries)
    with collection.open("workflow.cwl", "w") as f:
        workflow_text = json.dumps(packed, indent=2, sort_keys=True, separators=(',',': '))
        f.write(workflow_text)

    # Look for a collection that already holds exactly this content.
    filters = [
        ["portable_data_hash", "=", collection.portable_data_hash()],
        ["name", "like", name+"%"],
    ]
    if runtimeContext.project_uuid:
        filters.append(["owner_uuid", "=", runtimeContext.project_uuid])
    exists = arvrunner.api.collections().list(filters=filters).execute(num_retries=arvrunner.num_retries)

    matches = exists["items"]
    if not matches:
        collection.save_new(name=name,
                            owner_uuid=runtimeContext.project_uuid,
                            ensure_unique_name=True,
                            num_retries=arvrunner.num_retries)
        logger.info("Uploaded to %s", collection.manifest_locator())
    else:
        logger.info("Using collection %s", matches[0]["uuid"])

    return collection.portable_data_hash()
802
803
804 class Runner(Process):
805     """Base class for runner processes, which submit an instance of
806     arvados-cwl-runner and wait for the final result."""
807
808     def __init__(self, runner, updated_tool,
809                  tool, loadingContext, enable_reuse,
810                  output_name, output_tags, submit_runner_ram=0,
811                  name=None, on_error=None, submit_runner_image=None,
812                  intermediate_output_ttl=0, merged_map=None,
813                  priority=None, secret_store=None,
814                  collection_cache_size=256,
815                  collection_cache_is_default=True,
816                  git_info=None):
817
818         loadingContext = loadingContext.copy()
819         loadingContext.metadata = updated_tool.metadata.copy()
820
821         super(Runner, self).__init__(updated_tool.tool, loadingContext)
822
823         self.arvrunner = runner
824         self.embedded_tool = tool
825         self.job_order = None
826         self.running = False
827         if enable_reuse:
828             # If reuse is permitted by command line arguments but
829             # disabled by the workflow itself, disable it.
830             reuse_req, _ = self.embedded_tool.get_requirement("http://arvados.org/cwl#ReuseRequirement")
831             if reuse_req:
832                 enable_reuse = reuse_req["enableReuse"]
833         self.enable_reuse = enable_reuse
834         self.uuid = None
835         self.final_output = None
836         self.output_name = output_name
837         self.output_tags = output_tags
838         self.name = name
839         self.on_error = on_error
840         self.jobs_image = submit_runner_image or "arvados/jobs:"+__version__
841         self.intermediate_output_ttl = intermediate_output_ttl
842         self.priority = priority
843         self.secret_store = secret_store
844         self.enable_dev = loadingContext.enable_dev
845         self.git_info = git_info
846
847         self.submit_runner_cores = 1
848         self.submit_runner_ram = 1024  # defaut 1 GiB
849         self.collection_cache_size = collection_cache_size
850
851         runner_resource_req, _ = self.embedded_tool.get_requirement("http://arvados.org/cwl#WorkflowRunnerResources")
852         if runner_resource_req:
853             if runner_resource_req.get("coresMin"):
854                 self.submit_runner_cores = runner_resource_req["coresMin"]
855             if runner_resource_req.get("ramMin"):
856                 self.submit_runner_ram = runner_resource_req["ramMin"]
857             if runner_resource_req.get("keep_cache") and collection_cache_is_default:
858                 self.collection_cache_size = runner_resource_req["keep_cache"]
859
860         if submit_runner_ram:
861             # Command line / initializer overrides default and/or spec from workflow
862             self.submit_runner_ram = submit_runner_ram
863
864         if self.submit_runner_ram <= 0:
865             raise Exception("Value of submit-runner-ram must be greater than zero")
866
867         if self.submit_runner_cores <= 0:
868             raise Exception("Value of submit-runner-cores must be greater than zero")
869
870         self.merged_map = merged_map or {}
871
872     def job(self,
873             job_order,         # type: Mapping[Text, Text]
874             output_callbacks,  # type: Callable[[Any, Any], Any]
875             runtimeContext     # type: RuntimeContext
876            ):  # type: (...) -> Generator[Any, None, None]
877         self.job_order = job_order
878         self._init_job(job_order, runtimeContext)
879         yield self
880
881     def update_pipeline_component(self, record):
882         pass
883
884     def done(self, record):
885         """Base method for handling a completed runner."""
886
887         try:
888             if record["state"] == "Complete":
889                 if record.get("exit_code") is not None:
890                     if record["exit_code"] == 33:
891                         processStatus = "UnsupportedRequirement"
892                     elif record["exit_code"] == 0:
893                         processStatus = "success"
894                     else:
895                         processStatus = "permanentFail"
896                 else:
897                     processStatus = "success"
898             else:
899                 processStatus = "permanentFail"
900
901             outputs = {}
902
903             if processStatus == "permanentFail":
904                 logc = arvados.collection.CollectionReader(record["log"],
905                                                            api_client=self.arvrunner.api,
906                                                            keep_client=self.arvrunner.keep_client,
907                                                            num_retries=self.arvrunner.num_retries)
908                 done.logtail(logc, logger.error, "%s (%s) error log:" % (self.arvrunner.label(self), record["uuid"]), maxlen=40)
909
910             self.final_output = record["output"]
911             outc = arvados.collection.CollectionReader(self.final_output,
912                                                        api_client=self.arvrunner.api,
913                                                        keep_client=self.arvrunner.keep_client,
914                                                        num_retries=self.arvrunner.num_retries)
915             if "cwl.output.json" in outc:
916                 with outc.open("cwl.output.json", "rb") as f:
917                     if f.size() > 0:
918                         outputs = json.loads(f.read().decode())
919             def keepify(fileobj):
920                 path = fileobj["location"]
921                 if not path.startswith("keep:"):
922                     fileobj["location"] = "keep:%s/%s" % (record["output"], path)
923             adjustFileObjs(outputs, keepify)
924             adjustDirObjs(outputs, keepify)
925         except Exception:
926             logger.exception("[%s] While getting final output object", self.name)
927             self.arvrunner.output_callback({}, "permanentFail")
928         else:
929             self.arvrunner.output_callback(outputs, processStatus)