1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 import os
6 import sys
7 import re
8 import urllib.parse
9 from functools import partial
10 import logging
11 import json
12 import copy
13 from collections import namedtuple
14 from io import StringIO
15 from typing import (
16     Any,
17     Callable,
18     Dict,
19     Iterable,
20     Iterator,
21     List,
22     Mapping,
23     MutableMapping,
24     Sequence,
25     MutableSequence,
26     Optional,
27     Set,
28     Sized,
29     Tuple,
30     Type,
31     Union,
32     cast,
33 )
34
35 import subprocess
36
37 from schema_salad.sourceline import SourceLine, cmap
38
39 from cwltool.command_line_tool import CommandLineTool
40 import cwltool.workflow
41 from cwltool.process import (scandeps, UnsupportedRequirement, normalizeFilesDirs,
42                              shortname, Process, fill_in_defaults)
43 from cwltool.load_tool import fetch_document, jobloaderctx
44 from cwltool.utils import aslist, adjustFileObjs, adjustDirObjs, visit_class
45 from cwltool.builder import substitute
46 from cwltool.pack import pack
47 from cwltool.update import INTERNAL_VERSION
48 from cwltool.builder import Builder
49 import schema_salad.validate as validate
50 import schema_salad.ref_resolver
51 from cwltool.secrets import SecretStore
52
53 import arvados.collection
54 import arvados.util
55 from .util import collectionUUID
56 from ruamel.yaml import YAML
57 from ruamel.yaml.comments import CommentedMap, CommentedSeq
58
59 import arvados_cwl.arvdocker
60 from .pathmapper import ArvPathMapper, trim_listing, collection_pdh_pattern, collection_uuid_pattern, MapperEnt
61 from ._version import __version__
62 from . import done
63 from . context import ArvRuntimeContext
64 from .perf import Perf
65
66 basestring = (bytes, str)
67 logger = logging.getLogger('arvados.cwl-runner')
68 metrics = logging.getLogger('arvados.cwl-runner.metrics')
69
70 def trim_anonymous_location(obj):
71     """Remove 'location' field from File and Directory literals.
72
73     To make internal handling easier, literals are assigned a random id for
74     'location'.  However, when writing the record back out, this can break
75     reproducibility.  Since it is valid for literals not have a 'location'
76     reproducibility.  Since it is valid for literals not to have a 'location'
77
78     """
79
80     if obj.get("location", "").startswith("_:"):
81         del obj["location"]
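
# Illustrative sketch (hypothetical values): a File literal that was given an
# anonymous "_:" id has that id removed so the record round-trips cleanly.
#
#     obj = {"class": "File", "location": "_:b0a1c2d3", "basename": "a.txt", "contents": "hi"}
#     trim_anonymous_location(obj)
#     # obj -> {"class": "File", "basename": "a.txt", "contents": "hi"}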
82
83
84 def remove_redundant_fields(obj):
85     for field in ("path", "nameext", "nameroot", "dirname"):
86         if field in obj:
87             del obj[field]
88
89
90 def find_defaults(d, op):
91     if isinstance(d, list):
92         for i in d:
93             find_defaults(i, op)
94     elif isinstance(d, dict):
95         if "default" in d:
96             op(d)
97         else:
98             for i in d.values():
99                 find_defaults(i, op)
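
# For example (hypothetical input), find_defaults walks nested lists and dicts
# and applies `op` to every mapping that carries a "default" key:
#
#     seen = []
#     find_defaults({"inputs": [{"id": "x", "default": 5}, {"id": "y"}]}, seen.append)
#     # seen -> [{"id": "x", "default": 5}]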
100
101 def make_builder(joborder, hints, requirements, runtimeContext, metadata):
102     return Builder(
103                  job=joborder,
104                  files=[],               # type: List[Dict[Text, Text]]
105                  bindings=[],            # type: List[Dict[Text, Any]]
106                  schemaDefs={},          # type: Dict[Text, Dict[Text, Any]]
107                  names=None,               # type: Names
108                  requirements=requirements,        # type: List[Dict[Text, Any]]
109                  hints=hints,               # type: List[Dict[Text, Any]]
110                  resources={},           # type: Dict[str, int]
111                  mutation_manager=None,    # type: Optional[MutationManager]
112                  formatgraph=None,         # type: Optional[Graph]
113                  make_fs_access=None,      # type: Type[StdFsAccess]
114                  fs_access=None,           # type: StdFsAccess
115                  job_script_provider=runtimeContext.job_script_provider, # type: Optional[Any]
116                  timeout=runtimeContext.eval_timeout,             # type: float
117                  debug=runtimeContext.debug,               # type: bool
118                  js_console=runtimeContext.js_console,          # type: bool
119                  force_docker_pull=runtimeContext.force_docker_pull,   # type: bool
120                  loadListing="",         # type: Text
121                  outdir="",              # type: Text
122                  tmpdir="",              # type: Text
123                  stagedir="",            # type: Text
124                  cwlVersion=metadata.get("http://commonwl.org/cwltool#original_cwlVersion") or metadata.get("cwlVersion"),
125                  container_engine="docker"
126                 )
127
128 def search_schemadef(name, reqs):
129     for r in reqs:
130         if r["class"] == "SchemaDefRequirement":
131             for sd in r["types"]:
132                 if sd["name"] == name:
133                     return sd
134     return None
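
# A minimal sketch (hypothetical type name): given hints+requirements,
# search_schemadef returns the matching entry from a SchemaDefRequirement,
# or None if the name is not defined there.
#
#     reqs = [{"class": "SchemaDefRequirement",
#              "types": [{"name": "#sample_record", "type": "record", "fields": []}]}]
#     search_schemadef("#sample_record", reqs)
#     # -> {"name": "#sample_record", "type": "record", "fields": []}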
135
136 primitive_types_set = frozenset(("null", "boolean", "int", "long",
137                                  "float", "double", "string", "record",
138                                  "array", "enum"))
139
140 def set_secondary(fsaccess, builder, inputschema, secondaryspec, primary, discovered):
141     if isinstance(inputschema, Sequence) and not isinstance(inputschema, basestring):
142         # union type, collect all possible secondaryFiles
143         for i in inputschema:
144             set_secondary(fsaccess, builder, i, secondaryspec, primary, discovered)
145         return
146
147     if inputschema == "File":
148         inputschema = {"type": "File"}
149
150     if isinstance(inputschema, basestring):
151         sd = search_schemadef(inputschema, reversed(builder.hints+builder.requirements))
152         if sd:
153             inputschema = sd
154         else:
155             return
156
157     if "secondaryFiles" in inputschema:
158         # set secondaryFiles, may be inherited by compound types.
159         secondaryspec = inputschema["secondaryFiles"]
160
161     if (isinstance(inputschema["type"], (Mapping, Sequence)) and
162         not isinstance(inputschema["type"], basestring)):
163         # compound type (union, array, record)
164         set_secondary(fsaccess, builder, inputschema["type"], secondaryspec, primary, discovered)
165
166     elif (inputschema["type"] == "record" and
167           isinstance(primary, Mapping)):
168         #
169         # record type, find secondary files associated with fields.
170         #
171         for f in inputschema["fields"]:
172             p = primary.get(shortname(f["name"]))
173             if p:
174                 set_secondary(fsaccess, builder, f, secondaryspec, p, discovered)
175
176     elif (inputschema["type"] == "array" and
177           isinstance(primary, Sequence)):
178         #
179         # array type, find secondary files of elements
180         #
181         for p in primary:
182             set_secondary(fsaccess, builder, {"type": inputschema["items"]}, secondaryspec, p, discovered)
183
184     elif (inputschema["type"] == "File" and
185           isinstance(primary, Mapping) and
186           primary.get("class") == "File"):
187
188         if "secondaryFiles" in primary or not secondaryspec:
189             # Nothing to do.
190             return
191
192         #
193         # Found a file, check for secondaryFiles
194         #
195         specs = []
196         primary["secondaryFiles"] = secondaryspec
197         for i, sf in enumerate(aslist(secondaryspec)):
198             if builder.cwlVersion == "v1.0":
199                 pattern = sf
200             else:
201                 pattern = sf["pattern"]
202             if pattern is None:
203                 continue
204             if isinstance(pattern, list):
205                 specs.extend(pattern)
206             elif isinstance(pattern, dict):
207                 specs.append(pattern)
208             elif isinstance(pattern, str):
209                 if builder.cwlVersion == "v1.0":
210                     specs.append({"pattern": pattern, "required": True})
211                 else:
212                     specs.append({"pattern": pattern, "required": sf.get("required")})
213             else:
214                 raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
215                     "Expression must return list, object, string or null")
216
217         found = []
218         for i, sf in enumerate(specs):
219             if isinstance(sf, dict):
220                 if sf.get("class") == "File":
221                     pattern = None
222                     if sf.get("location") is None:
223                         raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
224                             "File object is missing 'location': %s" % sf)
225                     sfpath = sf["location"]
226                     required = True
227                 else:
228                     pattern = sf["pattern"]
229                     required = sf.get("required")
230             elif isinstance(sf, str):
231                 pattern = sf
232                 required = True
233             else:
234                 raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
235                     "Expression must return list, object, string or null")
236
237             if pattern is not None:
238                 if "${" in pattern or "$(" in pattern:
239                     sfname = builder.do_eval(pattern, context=primary)
240                 else:
241                     sfname = substitute(primary["basename"], pattern)
242
243                 if sfname is None:
244                     continue
245
246                 if isinstance(sfname, str):
247                     p_location = primary["location"]
248                     if "/" in p_location:
249                         sfpath = (
250                             p_location[0 : p_location.rindex("/") + 1]
251                             + sfname
252                         )
253
254             required = builder.do_eval(required, context=primary)
255
256             if isinstance(sfname, list) or isinstance(sfname, dict):
257                 each = aslist(sfname)
258                 for e in each:
259                     if required and not fsaccess.exists(e.get("location")):
260                         raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
261                             "Required secondary file '%s' does not exist" % e.get("location"))
262                 found.extend(each)
263
264             if isinstance(sfname, str):
265                 if fsaccess.exists(sfpath):
266                     if pattern is not None:
267                         found.append({"location": sfpath, "class": "File"})
268                     else:
269                         found.append(sf)
270                 elif required:
271                     raise SourceLine(primary["secondaryFiles"], i, validate.ValidationException).makeError(
272                         "Required secondary file '%s' does not exist" % sfpath)
273
274         primary["secondaryFiles"] = cmap(found)
275         if discovered is not None:
276             discovered[primary["location"]] = primary["secondaryFiles"]
277     elif inputschema["type"] not in primitive_types_set and inputschema["type"] not in ("File", "Directory"):
278         set_secondary(fsaccess, builder, inputschema["type"], secondaryspec, primary, discovered)
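
# Illustrative sketch (hypothetical paths): for a primary File
#     {"class": "File", "location": "keep:9999.../reads.bam", "basename": "reads.bam"}
# a secondaryFiles pattern ".bai" resolves to the sibling "keep:9999.../reads.bam.bai",
# while "^.bai" strips the extension first and looks for "keep:9999.../reads.bai".
# Secondary files that exist are appended to primary["secondaryFiles"]; required
# ones that are missing raise a ValidationException.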
279
280 def discover_secondary_files(fsaccess, builder, inputs, job_order, discovered=None):
281     for inputschema in inputs:
282         primary = job_order.get(shortname(inputschema["id"]))
283         if isinstance(primary, (Mapping, Sequence)):
284             set_secondary(fsaccess, builder, inputschema, None, primary, discovered)
285
286 def upload_dependencies(arvrunner, name, document_loader,
287                         workflowobj, uri, runtimeContext,
288                         include_primary=True, discovered_secondaryfiles=None,
289                         cache=None):
290     """Upload the dependencies of the workflowobj document to Keep.
291
292     Returns a pathmapper object mapping local paths to keep references.  Also
293     does an in-place update of references in "workflowobj".
294
295     Use scandeps to find $schemas, File and Directory
296     fields that represent external references.
297
298     If workflowobj has an "id" field, this will reload the document to ensure
299     it is scanning the raw document prior to preprocessing.
300     """
301
302     scanobj = workflowobj
303     metadata = scanobj
304
305     with Perf(metrics, "scandeps"):
306         sc_result = scandeps(uri, scanobj,
307                              set(),
308                              set(("location",)),
309                              None, urljoin=document_loader.fetcher.urljoin,
310                              nestdirs=False)
311         optional_deps = scandeps(uri, scanobj,
312                              set(),
313                              set(("$schemas",)),
314                              None, urljoin=document_loader.fetcher.urljoin,
315                              nestdirs=False)
316
317     if sc_result is None:
318         sc_result = []
319
320     if optional_deps is None:
321         optional_deps = []
322
323     if optional_deps:
324         sc_result.extend(optional_deps)
325
326     sc = []
327     uuids = {}
328
329     def collect_uuids(obj):
330         loc = obj.get("location", "")
331         sp = loc.split(":")
332         if sp[0] == "keep":
333             # Collect collection uuids that need to be resolved to
334             # portable data hashes
335             gp = collection_uuid_pattern.match(loc)
336             if gp:
337                 uuids[gp.groups()[0]] = obj
338             if collectionUUID in obj:
339                 uuids[obj[collectionUUID]] = obj
340
341     def collect_uploads(obj):
342         loc = obj.get("location", "")
343         sp = loc.split(":")
344         if len(sp) < 1:
345             return
346         if sp[0] in ("file", "http", "https"):
347             # Record local files that need to be uploaded,
348             # don't include file literals, keep references, etc.
349             sc.append(obj)
350         collect_uuids(obj)
351
352     with Perf(metrics, "collect uuids"):
353         visit_class(workflowobj, ("File", "Directory"), collect_uuids)
354
355     with Perf(metrics, "collect uploads"):
356         visit_class(sc_result, ("File", "Directory"), collect_uploads)
357
358     # Resolve any collection uuids we found to portable data hashes
359     # and assign them to uuid_map
360     uuid_map = {}
361     fetch_uuids = list(uuids.keys())
362     with Perf(metrics, "fetch_uuids"):
363         while fetch_uuids:
364             # For a large number of fetch_uuids, the API server may limit
365             # the response size, so keep fetching until the API server has
366             # nothing more to give us.
367             lookups = arvrunner.api.collections().list(
368                 filters=[["uuid", "in", fetch_uuids]],
369                 count="none",
370                 select=["uuid", "portable_data_hash"]).execute(
371                     num_retries=arvrunner.num_retries)
372
373             if not lookups["items"]:
374                 break
375
376             for l in lookups["items"]:
377                 uuid_map[l["uuid"]] = l["portable_data_hash"]
378
379             fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]
380
381     normalizeFilesDirs(sc)
382
383     if "id" in workflowobj:
384         defrg, _ = urllib.parse.urldefrag(workflowobj["id"])
385         if include_primary:
386             # make sure it's included
387             sc.append({"class": "File", "location": defrg})
388         else:
389             # make sure it's excluded
390             sc = [d for d in sc if d.get("location") != defrg]
391
392     def visit_default(obj):
393         def defaults_are_optional(f):
394             if "location" not in f and "path" in f:
395                 f["location"] = f["path"]
396                 del f["path"]
397             normalizeFilesDirs(f)
398             optional_deps.append(f)
399         visit_class(obj["default"], ("File", "Directory"), defaults_are_optional)
400
401     find_defaults(workflowobj, visit_default)
402
403     discovered = {}
404     def discover_default_secondary_files(obj):
405         builder_job_order = {}
406         for t in obj["inputs"]:
407             builder_job_order[shortname(t["id"])] = t["default"] if "default" in t else None
408         # Need to create a builder object to evaluate expressions.
409         builder = make_builder(builder_job_order,
410                                obj.get("hints", []),
411                                obj.get("requirements", []),
412                                ArvRuntimeContext(),
413                                metadata)
414         discover_secondary_files(arvrunner.fs_access,
415                                  builder,
416                                  obj["inputs"],
417                                  builder_job_order,
418                                  discovered)
419
420     copied, _ = document_loader.resolve_all(copy.deepcopy(cmap(workflowobj)), base_url=uri, checklinks=False)
421     visit_class(copied, ("CommandLineTool", "Workflow"), discover_default_secondary_files)
422
423     for d in list(discovered):
424         # Only interested in discovered secondaryFiles which are local
425         # files that need to be uploaded.
426         if d.startswith("file:"):
427             sc.extend(discovered[d])
428         else:
429             del discovered[d]
430
431     with Perf(metrics, "mapper"):
432         mapper = ArvPathMapper(arvrunner, sc, "",
433                                "keep:%s",
434                                "keep:%s/%s",
435                                name=name,
436                                single_collection=True,
437                                optional_deps=optional_deps)
438
439     for k, v in uuid_map.items():
440         mapper._pathmap["keep:"+k] = MapperEnt(v, "", "", False)
441
442     keeprefs = set()
443     def addkeepref(k):
444         if k.startswith("keep:"):
445             keeprefs.add(collection_pdh_pattern.match(k).group(1))
446
447
448     def collectloc(p):
449         loc = p.get("location")
450         if loc and (not loc.startswith("_:")) and (not loc.startswith("keep:")):
451             addkeepref(p["location"])
452             return
453
454         if not loc:
455             return
456
457         if collectionUUID in p:
458             uuid = p[collectionUUID]
459             if uuid not in uuid_map:
460                 raise SourceLine(p, collectionUUID, validate.ValidationException).makeError(
461                     "Collection uuid %s not found" % uuid)
462             gp = collection_pdh_pattern.match(loc)
463             if gp and uuid_map[uuid] != gp.groups()[0]:
464                 # This file entry has both collectionUUID and a PDH
465                 # location. If the PDH doesn't match the one returned
466                 # by the API server, raise an error.
467                 raise SourceLine(p, "location", validate.ValidationException).makeError(
468                     "Expected collection uuid %s to be %s but API server reported %s" % (
469                         uuid, gp.groups()[0], uuid_map[p[collectionUUID]]))
470
471         gp = collection_uuid_pattern.match(loc)
472         if not gp:
473             # Not a uuid pattern (must be a pdh pattern)
474             addkeepref(p["location"])
475             return
476
477         uuid = gp.groups()[0]
478         if uuid not in uuid_map:
479             raise SourceLine(p, "location", validate.ValidationException).makeError(
480                 "Collection uuid %s not found" % uuid)
481
482     with Perf(metrics, "collectloc"):
483         visit_class(workflowobj, ("File", "Directory"), collectloc)
484         visit_class(discovered, ("File", "Directory"), collectloc)
485
486     if discovered_secondaryfiles is not None:
487         for d in discovered:
488             discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]
489
490     if runtimeContext.copy_deps:
491         # Find referenced collections and copy them into the
492         # destination project, for easy sharing.
493         already_present = list(arvados.util.keyset_list_all(arvrunner.api.collections().list,
494                                      filters=[["portable_data_hash", "in", list(keeprefs)],
495                                               ["owner_uuid", "=", runtimeContext.project_uuid]],
496                                      select=["uuid", "portable_data_hash", "created_at"]))
497
498         keeprefs = keeprefs - set(a["portable_data_hash"] for a in already_present)
499         for kr in keeprefs:
500             col = arvrunner.api.collections().list(filters=[["portable_data_hash", "=", kr]],
501                                                   order="created_at desc",
502                                                    select=["name", "description", "properties", "portable_data_hash", "manifest_text", "storage_classes_desired", "trash_at"],
503                                                    limit=1).execute()
504             if len(col["items"]) == 0:
505                 logger.warning("Cannot find collection with portable data hash %s", kr)
506                 continue
507             col = col["items"][0]
508             col["name"] = arvados.util.trim_name(col["name"])
509             try:
510                 arvrunner.api.collections().create(body={"collection": {
511                     "owner_uuid": runtimeContext.project_uuid,
512                     "name": col["name"],
513                     "description": col["description"],
514                     "properties": col["properties"],
515                     "portable_data_hash": col["portable_data_hash"],
516                     "manifest_text": col["manifest_text"],
517                     "storage_classes_desired": col["storage_classes_desired"],
518                     "trash_at": col["trash_at"]
519                 }}, ensure_unique_name=True).execute()
520             except Exception as e:
521                 logger.warning("Unable to copy collection to destination: %s", e)
522
523     if "$schemas" in workflowobj:
524         sch = CommentedSeq()
525         for s in workflowobj["$schemas"]:
526             if s in mapper:
527                 sch.append(mapper.mapper(s).resolved)
528         workflowobj["$schemas"] = sch
529
530     return mapper
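
# Rough sketch of the result (hypothetical path and hash): after
# upload_dependencies, local references resolve to Keep locations, e.g.
#
#     mapper.mapper("file:///home/me/data/input.fastq").resolved
#     # -> "keep:99999999999999999999999999999999+118/input.fastq"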
531
532
533 def upload_docker(arvrunner, tool, runtimeContext):
534     """Uploads Docker images used in CommandLineTool objects."""
535
536     if isinstance(tool, CommandLineTool):
537         (docker_req, docker_is_req) = tool.get_requirement("DockerRequirement")
538         if docker_req:
539             if docker_req.get("dockerOutputDirectory") and arvrunner.work_api != "containers":
540                 raise SourceLine(docker_req, "dockerOutputDirectory", UnsupportedRequirement).makeError(
541                     "Option 'dockerOutputDirectory' of DockerRequirement not supported.")
542
543             arvados_cwl.arvdocker.arv_docker_get_image(arvrunner.api, docker_req, True, runtimeContext)
544         else:
545             arvados_cwl.arvdocker.arv_docker_get_image(arvrunner.api, {"dockerPull": "arvados/jobs:"+__version__},
546                                                        True, runtimeContext)
547     elif isinstance(tool, cwltool.workflow.Workflow):
548         for s in tool.steps:
549             upload_docker(arvrunner, s.embedded_tool, runtimeContext)
550
551
552 def packed_workflow(arvrunner, tool, merged_map, runtimeContext, git_info):
553     """Create a packed workflow.
554
555     A "packed" workflow is one where all the components have been combined into a single document."""
556
557     rewrites = {}
558     packed = pack(arvrunner.loadingContext, tool.tool["id"],
559                   rewrite_out=rewrites,
560                   loader=tool.doc_loader)
561
562     rewrite_to_orig = {v: k for k,v in rewrites.items()}
563
564     def visit(v, cur_id):
565         if isinstance(v, dict):
566             if v.get("class") in ("CommandLineTool", "Workflow", "ExpressionTool"):
567                 if tool.metadata["cwlVersion"] == "v1.0" and "id" not in v:
568                     raise SourceLine(v, None, Exception).makeError("Embedded process object is missing required 'id' field, add an 'id' or update to cwlVersion: v1.1")
569                 if "id" in v:
570                     cur_id = rewrite_to_orig.get(v["id"], v["id"])
571             if "path" in v and "location" not in v:
572                 v["location"] = v["path"]
573                 del v["path"]
574             if "location" in v and cur_id in merged_map:
575                 if v["location"] in merged_map[cur_id].resolved:
576                     v["location"] = merged_map[cur_id].resolved[v["location"]]
577                 if v["location"] in merged_map[cur_id].secondaryFiles:
578                     v["secondaryFiles"] = merged_map[cur_id].secondaryFiles[v["location"]]
579             if v.get("class") == "DockerRequirement":
580                 v["http://arvados.org/cwl#dockerCollectionPDH"] = arvados_cwl.arvdocker.arv_docker_get_image(arvrunner.api, v, True,
581                                                                                                              runtimeContext)
582             for l in v:
583                 visit(v[l], cur_id)
584         if isinstance(v, list):
585             for l in v:
586                 visit(l, cur_id)
587     visit(packed, None)
588
589     if git_info:
590         for g in git_info:
591             packed[g] = git_info[g]
592
593     return packed
594
595
596 def tag_git_version(packed, tool):
597     if tool.tool["id"].startswith("file://"):
598         path = os.path.dirname(tool.tool["id"][7:])
599         try:
600             githash = subprocess.check_output(['git', 'log', '--first-parent', '--max-count=1', '--format=%H'], stderr=subprocess.STDOUT, cwd=path).strip()
601         except (OSError, subprocess.CalledProcessError):
602             pass
603         else:
604             packed["http://schema.org/version"] = githash
605
606 def setloc(mapper, p):
607     loc = p.get("location")
608     if loc and (not loc.startswith("_:")) and (not loc.startswith("keep:")):
609         p["location"] = mapper.mapper(p["location"]).resolved
610         return
611
612     if not loc:
613         return
614
615     if collectionUUID in p:
616         uuid = p[collectionUUID]
617         keepuuid = "keep:"+uuid
618         if keepuuid not in mapper:
619             raise SourceLine(p, collectionUUID, validate.ValidationException).makeError(
620                 "Collection uuid %s not found" % uuid)
621         gp = collection_pdh_pattern.match(loc)
622         if gp and mapper.mapper(keepuuid).resolved != gp.groups()[0]:
623             # This file entry has both collectionUUID and a PDH
624             # location. If the PDH doesn't match the one returned
625             # by the API server, raise an error.
626             raise SourceLine(p, "location", validate.ValidationException).makeError(
627                 "Expected collection uuid %s to be %s but API server reported %s" % (
628                     uuid, gp.groups()[0], mapper.mapper(keepuuid).resolved))
629
630     gp = collection_uuid_pattern.match(loc)
631     if not gp:
632         # Not a uuid pattern (must be a pdh pattern)
633         return
634
635     uuid = gp.groups()[0]
636     keepuuid = "keep:"+uuid
637     if keepuuid not in mapper:
638         raise SourceLine(p, "location", validate.ValidationException).makeError(
639             "Collection uuid %s not found" % uuid)
640     p["location"] = "keep:%s%s" % (mapper.mapper(keepuuid).resolved, gp.groups()[1] if gp.groups()[1] else "")
641     p[collectionUUID] = uuid
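
# Illustrative sketch (hypothetical identifiers): setloc rewrites a
# collection-uuid location to the portable data hash known to the mapper,
# keeping the path within the collection and recording the original uuid:
#
#     {"class": "File", "location": "keep:zzzzz-4zz18-zzzzzzzzzzzzzzz/a.txt"}
#     # becomes
#     {"class": "File",
#      "location": "keep:99999999999999999999999999999999+118/a.txt",
#      "http://arvados.org/cwl#collectionUUID": "zzzzz-4zz18-zzzzzzzzzzzzzzz"}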
642
643 def update_from_mapper(workflowobj, mapper):
644     with Perf(metrics, "setloc"):
645         visit_class(workflowobj, ("File", "Directory"), partial(setloc, mapper))
646
647 def apply_merged_map(merged_map, workflowobj):
648     def visit(v, cur_id):
649         if isinstance(v, dict):
650             if v.get("class") in ("CommandLineTool", "Workflow", "ExpressionTool"):
651                 if "id" in v:
652                     cur_id = v["id"]
653             if "path" in v and "location" not in v:
654                 v["location"] = v["path"]
655                 del v["path"]
656             if "location" in v and cur_id in merged_map:
657                 if v["location"] in merged_map[cur_id].resolved:
658                     v["location"] = merged_map[cur_id].resolved[v["location"]]
659                 if v["location"] in merged_map[cur_id].secondaryFiles:
660                     v["secondaryFiles"] = merged_map[cur_id].secondaryFiles[v["location"]]
661             #if v.get("class") == "DockerRequirement":
662             #    v["http://arvados.org/cwl#dockerCollectionPDH"] = arvados_cwl.arvdocker.arv_docker_get_image(arvrunner.api, v, True,
663             #                                                                                                 runtimeContext)
664             for l in v:
665                 visit(v[l], cur_id)
666         if isinstance(v, list):
667             for l in v:
668                 visit(l, cur_id)
669     visit(workflowobj, None)
670
671 def update_from_merged_map(tool, merged_map):
672     tool.visit(partial(apply_merged_map, merged_map))
673
674 def upload_job_order(arvrunner, name, tool, job_order, runtimeContext):
675     """Upload local files referenced in the input object and return updated input
676     object with 'location' updated to the proper keep references.
677     """
678
679     # Make a copy of the job order and set defaults.
680     builder_job_order = copy.copy(job_order)
681
682     # fill_in_defaults throws an error if any required parameters
683     # are missing.  We don't want it to do that here, so make them
684     # all optional.
685     inputs_copy = copy.deepcopy(tool.tool["inputs"])
686     for i in inputs_copy:
687         if "null" not in i["type"]:
688             i["type"] = ["null"] + aslist(i["type"])
689
690     fill_in_defaults(inputs_copy,
691                      builder_job_order,
692                      arvrunner.fs_access)
693     # Need to create a builder object to evaluate expressions.
694     builder = make_builder(builder_job_order,
695                            tool.hints,
696                            tool.requirements,
697                            ArvRuntimeContext(),
698                            tool.metadata)
699     # Now update job_order with secondaryFiles
700     discover_secondary_files(arvrunner.fs_access,
701                              builder,
702                              tool.tool["inputs"],
703                              job_order)
704
705     _jobloaderctx = jobloaderctx.copy()
706     jobloader = schema_salad.ref_resolver.Loader(_jobloaderctx, fetcher_constructor=tool.doc_loader.fetcher_constructor)
707
708     jobmapper = upload_dependencies(arvrunner,
709                                     name,
710                                     jobloader,
711                                     job_order,
712                                     job_order.get("id", "#"),
713                                     runtimeContext)
714
715     if "id" in job_order:
716         del job_order["id"]
717
718     # Need to filter this out; it gets added by cwltool when providing
719     # parameters on the command line.
720     if "job_order" in job_order:
721         del job_order["job_order"]
722
723     update_from_mapper(job_order, jobmapper)
724
725     return job_order, jobmapper
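
# Sketch of the effect on a job order (hypothetical values): a local input like
#
#     {"fastq": {"class": "File", "location": "file:///home/me/reads.fastq"}}
#
# comes back with its location rewritten to a Keep reference such as
#
#     {"fastq": {"class": "File",
#                "location": "keep:99999999999999999999999999999999+118/reads.fastq"}}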
726
727 FileUpdates = namedtuple("FileUpdates", ["resolved", "secondaryFiles"])
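
# merged_map (built by upload_workflow_deps below) is keyed by tool/workflow id;
# each FileUpdates pairs resolved locations with discovered secondary files,
# roughly (hypothetical values):
#
#     {"file:///home/me/wf.cwl": FileUpdates(
#         resolved={"file:///home/me/script.py": "keep:99999999999999999999999999999999+99/script.py"},
#         secondaryFiles={})}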
728
729 def upload_workflow_deps(arvrunner, tool, runtimeContext):
730     # Ensure that Docker images needed by this workflow are available
731
732     with Perf(metrics, "upload_docker"):
733         upload_docker(arvrunner, tool, runtimeContext)
734
735     document_loader = tool.doc_loader
736
737     merged_map = {}
738     tool_dep_cache = {}
739
740     todo = []
741
742     # Standard traversal is top down, but we want to go bottom up, so
743     # use the visitor to accumulate a list of nodes to visit, then
744     # visit them in reverse order.
745     def upload_tool_deps(deptool):
746         if "id" in deptool:
747             todo.append(deptool)
748
749     tool.visit(upload_tool_deps)
750
751     for deptool in reversed(todo):
752         discovered_secondaryfiles = {}
753         with Perf(metrics, "upload_dependencies %s" % shortname(deptool["id"])):
754             pm = upload_dependencies(arvrunner,
755                                      "%s dependencies" % (shortname(deptool["id"])),
756                                      document_loader,
757                                      deptool,
758                                      deptool["id"],
759                                      runtimeContext,
760                                      include_primary=False,
761                                      discovered_secondaryfiles=discovered_secondaryfiles,
762                                      cache=tool_dep_cache)
763
764         document_loader.idx[deptool["id"]] = deptool
765         toolmap = {}
766         for k,v in pm.items():
767             toolmap[k] = v.resolved
768
769         merged_map[deptool["id"]] = FileUpdates(toolmap, discovered_secondaryfiles)
770
771     return merged_map
772
773 def arvados_jobs_image(arvrunner, img, runtimeContext):
774     """Determine if the right arvados/jobs image version is available.  If not, try to pull and upload it."""
775
776     try:
777         return arvados_cwl.arvdocker.arv_docker_get_image(arvrunner.api, {"dockerPull": img},
778                                                           True, runtimeContext)
779     except Exception as e:
780         raise Exception("Docker image %s is not available\n%s" % (img, e) )
781
782
783 def upload_workflow_collection(arvrunner, name, packed, runtimeContext):
784     collection = arvados.collection.Collection(api_client=arvrunner.api,
785                                                keep_client=arvrunner.keep_client,
786                                                num_retries=arvrunner.num_retries)
787     with collection.open("workflow.cwl", "w") as f:
788         f.write(json.dumps(packed, indent=2, sort_keys=True, separators=(',',': ')))
789
790     filters = [["portable_data_hash", "=", collection.portable_data_hash()],
791                ["name", "like", name+"%"]]
792     if runtimeContext.project_uuid:
793         filters.append(["owner_uuid", "=", runtimeContext.project_uuid])
794     exists = arvrunner.api.collections().list(filters=filters).execute(num_retries=arvrunner.num_retries)
795
796     if exists["items"]:
797         logger.info("Using collection %s", exists["items"][0]["uuid"])
798     else:
799         collection.save_new(name=name,
800                             owner_uuid=runtimeContext.project_uuid,
801                             ensure_unique_name=True,
802                             num_retries=arvrunner.num_retries)
803         logger.info("Uploaded to %s", collection.manifest_locator())
804
805     return collection.portable_data_hash()
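
# The returned portable data hash has the usual Keep form of an MD5 digest plus
# manifest size, e.g. (hypothetical) "99999999999999999999999999999999+2932".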
806
807
808 class Runner(Process):
809     """Base class for runner processes, which submit an instance of
810     arvados-cwl-runner and wait for the final result."""
811
812     def __init__(self, runner,
813                  tool, loadingContext, enable_reuse,
814                  output_name, output_tags, submit_runner_ram=0,
815                  name=None, on_error=None, submit_runner_image=None,
816                  intermediate_output_ttl=0, merged_map=None,
817                  priority=None, secret_store=None,
818                  collection_cache_size=256,
819                  collection_cache_is_default=True,
820                  git_info=None,
821                  reuse_runner=False):
822
823         self.loadingContext = loadingContext.copy()
824
825         super(Runner, self).__init__(tool.tool, loadingContext)
826
827         self.arvrunner = runner
828         self.embedded_tool = tool
829         self.job_order = None
830         self.running = False
831         if enable_reuse:
832             # If reuse is permitted by command line arguments but
833             # disabled by the workflow itself, disable it.
834             reuse_req, _ = self.embedded_tool.get_requirement("http://arvados.org/cwl#ReuseRequirement")
835             if reuse_req:
836                 enable_reuse = reuse_req["enableReuse"]
837             reuse_req, _ = self.embedded_tool.get_requirement("WorkReuse")
838             if reuse_req:
839                 enable_reuse = reuse_req["enableReuse"]
840         self.enable_reuse = enable_reuse
841         self.uuid = None
842         self.final_output = None
843         self.output_name = output_name
844         self.output_tags = output_tags
845         self.name = name
846         self.on_error = on_error
847         self.jobs_image = submit_runner_image or "arvados/jobs:"+__version__
848         self.intermediate_output_ttl = intermediate_output_ttl
849         self.priority = priority
850         self.secret_store = secret_store
851         self.enable_dev = self.loadingContext.enable_dev
852         self.git_info = git_info
853         self.fast_parser = self.loadingContext.fast_parser
854         self.reuse_runner = reuse_runner
855
856         self.submit_runner_cores = 1
857         self.submit_runner_ram = 1024  # default 1 GiB
858         self.collection_cache_size = collection_cache_size
859
860         runner_resource_req, _ = self.embedded_tool.get_requirement("http://arvados.org/cwl#WorkflowRunnerResources")
861         if runner_resource_req:
862             if runner_resource_req.get("coresMin"):
863                 self.submit_runner_cores = runner_resource_req["coresMin"]
864             if runner_resource_req.get("ramMin"):
865                 self.submit_runner_ram = runner_resource_req["ramMin"]
866             if runner_resource_req.get("keep_cache") and collection_cache_is_default:
867                 self.collection_cache_size = runner_resource_req["keep_cache"]
868
869         if submit_runner_ram:
870             # Command line / initializer overrides default and/or spec from workflow
871             self.submit_runner_ram = submit_runner_ram
872
873         if self.submit_runner_ram <= 0:
874             raise Exception("Value of submit-runner-ram must be greater than zero")
875
876         if self.submit_runner_cores <= 0:
877             raise Exception("Value of submit-runner-cores must be greater than zero")
878
879         self.merged_map = merged_map or {}
880
881     def job(self,
882             job_order,         # type: Mapping[Text, Text]
883             output_callbacks,  # type: Callable[[Any, Any], Any]
884             runtimeContext     # type: RuntimeContext
885            ):  # type: (...) -> Generator[Any, None, None]
886         self.job_order = job_order
887         self._init_job(job_order, runtimeContext)
888         yield self
889
890     def update_pipeline_component(self, record):
891         pass
892
893     def done(self, record):
894         """Base method for handling a completed runner."""
895
896         try:
897             if record["state"] == "Complete":
898                 if record.get("exit_code") is not None:
899                     if record["exit_code"] == 33:
900                         processStatus = "UnsupportedRequirement"
901                     elif record["exit_code"] == 0:
902                         processStatus = "success"
903                     else:
904                         processStatus = "permanentFail"
905                 else:
906                     processStatus = "success"
907             else:
908                 processStatus = "permanentFail"
909
910             outputs = {}
911
912             if processStatus == "permanentFail":
913                 logc = arvados.collection.CollectionReader(record["log"],
914                                                            api_client=self.arvrunner.api,
915                                                            keep_client=self.arvrunner.keep_client,
916                                                            num_retries=self.arvrunner.num_retries)
917                 done.logtail(logc, logger.error, "%s (%s) error log:" % (self.arvrunner.label(self), record["uuid"]), maxlen=40,
918                              include_crunchrun=(record.get("exit_code") is None or record.get("exit_code") > 127))
919
920             self.final_output = record["output"]
921             outc = arvados.collection.CollectionReader(self.final_output,
922                                                        api_client=self.arvrunner.api,
923                                                        keep_client=self.arvrunner.keep_client,
924                                                        num_retries=self.arvrunner.num_retries)
925             if "cwl.output.json" in outc:
926                 with outc.open("cwl.output.json", "rb") as f:
927                     if f.size() > 0:
928                         outputs = json.loads(str(f.read(), 'utf-8'))
929             def keepify(fileobj):
930                 path = fileobj["location"]
931                 if not path.startswith("keep:"):
932                     fileobj["location"] = "keep:%s/%s" % (record["output"], path)
933             adjustFileObjs(outputs, keepify)
934             adjustDirObjs(outputs, keepify)
935         except Exception:
936             logger.exception("[%s] While getting final output object", self.name)
937             self.arvrunner.output_callback({}, "permanentFail")
938         else:
939             self.arvrunner.output_callback(outputs, processStatus)
940
941
942 def print_keep_deps_visitor(api, runtimeContext, references, doc_loader, tool):
943     def collect_locators(obj):
944         loc = obj.get("location", "")
945
946         g = arvados.util.keepuri_pattern.match(loc)
947         if g:
948             references.add(g[1])
949
950         if obj.get("class") == "http://arvados.org/cwl#WorkflowRunnerResources" and "acrContainerImage" in obj:
951             references.add(obj["acrContainerImage"])
952
953         if obj.get("class") == "DockerRequirement":
954             references.add(arvados_cwl.arvdocker.arv_docker_get_image(api, obj, False, runtimeContext))
955
956     sc_result = scandeps(tool["id"], tool,
957                          set(),
958                          set(("location", "id")),
959                          None, urljoin=doc_loader.fetcher.urljoin,
960                          nestdirs=False)
961
962     visit_class(sc_result, ("File", "Directory"), collect_locators)
963     visit_class(tool, ("DockerRequirement", "http://arvados.org/cwl#WorkflowRunnerResources"), collect_locators)
964
965
966 def print_keep_deps(arvRunner, runtimeContext, merged_map, tool):
967     references = set()
968
969     tool.visit(partial(print_keep_deps_visitor, arvRunner.api, runtimeContext, references, tool.doc_loader))
970
971     for mm in merged_map:
972         for k, v in merged_map[mm].resolved.items():
973             g = arvados.util.keepuri_pattern.match(v)
974             if g:
975                 references.add(g[1])
976
977     json.dump(sorted(references), arvRunner.stdout)
978     print(file=arvRunner.stdout)
979
980 class ArvSecretStore(SecretStore):
981     def add(self, value):
982         if value is None:
983             return None
984         return super().add(value)
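
# A minimal usage sketch (hypothetical secret value): unlike the base
# SecretStore, ArvSecretStore lets None pass through untouched instead of
# wrapping it in a placeholder.
#
#     store = ArvSecretStore()
#     store.add(None)       # -> None
#     store.add("s3cret")   # -> opaque placeholder string managed by SecretStore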