# Default priority for submitted work.  The jobs API has no priority support,
# so arv_executor rejects any value other than this default for --api=jobs.
DEFAULT_PRIORITY = 500
class ArvCwlRunner(object):
    """Execute a CWL tool or workflow, submit work (using either jobs or
    containers API), wait for them to complete, and report output.

    """

    def __init__(self, api_client, work_api=None, keep_client=None, output_name=None, output_tags=None, num_retries=4):
        """Set up runner state and detect which work API the server supports.

        :param api_client: Arvados API client used for all server calls.
        :param work_api: "jobs" or "containers" to pin a specific API, or
            None to auto-select whichever the server advertises.
        :param keep_client: optional Keep client; built from api_client when
            not supplied.
        :param output_name: name for the final output collection, or None.
        :param output_tags: comma-separated tag string for the final output
            collection, or None.
        :param num_retries: retry count passed to API and Keep operations.
        :raises Exception: if the server advertises neither supported API,
            or does not advertise the explicitly requested one.
        """
        self.api = api_client
        self.processes = {}  # uuid -> in-flight job/container wrapper
        self.lock = threading.Lock()
        # cond shares self.lock; notified by on_message when a process ends.
        self.cond = threading.Condition(self.lock)
        self.final_output = None
        self.final_status = None
        self.uploaded = {}  # source path -> uploaded pair (see add_uploaded)
        self.num_retries = num_retries
        self.uuid = None
        self.stop_polling = threading.Event()  # set to stop poll_states thread
        self.poll_api = None
        self.pipeline = None
        self.final_output_collection = None
        self.output_name = output_name
        self.output_tags = output_tags
        self.project_uuid = None
        self.intermediate_output_ttl = 0
        self.intermediate_output_collections = []
        self.trash_intermediate = False

        if keep_client is not None:
            self.keep_client = keep_client
        else:
            self.keep_client = arvados.keep.KeepClient(api_client=self.api, num_retries=self.num_retries)

        self.collection_cache = CollectionCache(self.api, self.keep_client, self.num_retries)

        # Detect the work API from the server's discovery document: take the
        # first listed resource whose "create" method is present, restricted
        # to the caller's choice when work_api was given.
        self.work_api = None
        expected_api = ["jobs", "containers"]
        for api in expected_api:
            try:
                methods = self.api._rootDesc.get('resources')[api]['methods']
                if ('httpMethod' in methods['create'] and
                    (work_api == api or work_api is None)):
                    self.work_api = api
                    break
            except KeyError:
                pass

        if not self.work_api:
            if work_api is None:
                raise Exception("No supported APIs")
            else:
                raise Exception("Unsupported API '%s', expected one of %s" % (work_api, expected_api))
-
- def arv_make_tool(self, toolpath_object, **kwargs):
- kwargs["work_api"] = self.work_api
- kwargs["fetcher_constructor"] = partial(CollectionFetcher,
- api_client=self.api,
- fs_access=CollectionFsAccess("", collection_cache=self.collection_cache),
- num_retries=self.num_retries)
- kwargs["resolver"] = partial(collectionResolver, self.api, num_retries=self.num_retries)
- if "class" in toolpath_object and toolpath_object["class"] == "CommandLineTool":
- return ArvadosCommandTool(self, toolpath_object, **kwargs)
- elif "class" in toolpath_object and toolpath_object["class"] == "Workflow":
- return ArvadosWorkflow(self, toolpath_object, **kwargs)
- else:
- return cwltool.workflow.defaultMakeTool(toolpath_object, **kwargs)
-
- def output_callback(self, out, processStatus):
- if processStatus == "success":
- logger.info("Overall process status is %s", processStatus)
- if self.pipeline:
- self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
- body={"state": "Complete"}).execute(num_retries=self.num_retries)
- else:
- logger.warn("Overall process status is %s", processStatus)
- if self.pipeline:
- self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
- body={"state": "Failed"}).execute(num_retries=self.num_retries)
- self.final_status = processStatus
- self.final_output = out
-
    def on_message(self, event):
        """Handle an update event for a tracked job or container.

        Called both from the websocket event stream and from poll_states.
        On the first transition to "Running" the process wrapper is marked
        running; on a terminal state its done() handler runs and the
        condition variable wakes arv_executor's wait loop.
        """
        if "object_uuid" in event:
            if event["object_uuid"] in self.processes and event["event_type"] == "update":
                # NOTE(review): self.processes and j.running are read here
                # before the lock is taken below — confirm concurrent mutation
                # by other threads cannot bite between the check and the lock.
                if event["properties"]["new_attributes"]["state"] == "Running" and self.processes[event["object_uuid"]].running is False:
                    uuid = event["object_uuid"]
                    with self.lock:
                        j = self.processes[uuid]
                        logger.info("%s %s is Running", self.label(j), uuid)
                        j.running = True
                        j.update_pipeline_component(event["properties"]["new_attributes"])
                elif event["properties"]["new_attributes"]["state"] in ("Complete", "Failed", "Cancelled", "Final"):
                    uuid = event["object_uuid"]
                    try:
                        # cond wraps the same lock, so this also guards the
                        # self.processes lookup and the done() call.
                        self.cond.acquire()
                        j = self.processes[uuid]
                        logger.info("%s %s is %s", self.label(j), uuid, event["properties"]["new_attributes"]["state"])
                        with Perf(metrics, "done %s" % j.name):
                            j.done(event["properties"]["new_attributes"])
                        # Wake arv_executor, which waits on this condition.
                        self.cond.notify()
                    finally:
                        self.cond.release()
-
- def label(self, obj):
- return "[%s %s]" % (self.work_api[0:-1], obj.name)
-
    def poll_states(self):
        """Poll status of jobs or containers listed in the processes dict.

        Runs in a separate thread.
        """

        try:
            while True:
                # Wake every 15 seconds, or immediately on shutdown request.
                self.stop_polling.wait(15)
                if self.stop_polling.is_set():
                    break
                with self.lock:
                    keys = self.processes.keys()
                if not keys:
                    continue

                if self.work_api == "containers":
                    table = self.poll_api.container_requests()
                elif self.work_api == "jobs":
                    table = self.poll_api.jobs()

                try:
                    proc_states = table.list(filters=[["uuid", "in", keys]]).execute(num_retries=self.num_retries)
                except Exception as e:
                    # Transient API failure: log and retry on the next tick.
                    logger.warn("Error checking states on API server: %s", e)
                    continue

                # Feed each fetched record through the same handler used for
                # websocket events.
                for p in proc_states["items"]:
                    self.on_message({
                        "object_uuid": p["uuid"],
                        "event_type": "update",
                        "properties": {
                            "new_attributes": p
                        }
                    })
        except:
            # Deliberately catch everything (including KeyboardInterrupt in
            # this thread): a dead poller would leave arv_executor waiting
            # forever, so clear the process table and wake it up.
            logger.error("Fatal error in state polling thread.", exc_info=(sys.exc_info()[1] if self.debug else False))
            self.cond.acquire()
            self.processes.clear()
            self.cond.notify()
            self.cond.release()
        finally:
            # Signal shutdown so arv_executor's loop can notice the poller
            # has stopped.
            self.stop_polling.set()
-
- def get_uploaded(self):
- return self.uploaded.copy()
-
- def add_uploaded(self, src, pair):
- self.uploaded[src] = pair
-
- def add_intermediate_output(self, uuid):
- if uuid:
- self.intermediate_output_collections.append(uuid)
-
    def trash_intermediate_output(self):
        """Delete all recorded intermediate output collections (best effort).

        Individual delete failures are logged and skipped; KeyboardInterrupt
        stops the cleanup loop instead of being swallowed.
        """
        logger.info("Cleaning up intermediate output collections")
        for i in self.intermediate_output_collections:
            try:
                self.api.collections().delete(uuid=i).execute(num_retries=self.num_retries)
            except:
                # Deliberate bare except: cleanup is best effort.
                logger.warn("Failed to delete intermediate output: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
                if sys.exc_info()[0] is KeyboardInterrupt:
                    break
-
- def check_features(self, obj):
- if isinstance(obj, dict):
- if obj.get("writable") and self.work_api != "containers":
- raise SourceLine(obj, "writable", UnsupportedRequirement).makeError("InitialWorkDir feature 'writable: true' not supported with --api=jobs")
- if obj.get("class") == "DockerRequirement":
- if obj.get("dockerOutputDirectory"):
- if self.work_api != "containers":
- raise SourceLine(obj, "dockerOutputDirectory", UnsupportedRequirement).makeError(
- "Option 'dockerOutputDirectory' of DockerRequirement not supported with --api=jobs.")
- if not obj.get("dockerOutputDirectory").startswith('/'):
- raise SourceLine(obj, "dockerOutputDirectory", validate.ValidationException).makeError(
- "Option 'dockerOutputDirectory' must be an absolute path.")
- for v in obj.itervalues():
- self.check_features(v)
- elif isinstance(obj, list):
- for i,v in enumerate(obj):
- with SourceLine(obj, i, UnsupportedRequirement, logger.isEnabledFor(logging.DEBUG)):
- self.check_features(v)
-
    def make_output_collection(self, name, tagsString, outputObj):
        """Copy the workflow's output files into a new Keep collection.

        :param name: name for the new collection record.
        :param tagsString: comma-separated tags, each attached as a tag link.
        :param outputObj: CWL output object; the original is not modified —
            a deep copy is rewritten so file locations point into the new
            collection.
        :returns: tuple of (rewritten output object, saved Collection).
        :raises Exception: if an output source is neither in Keep nor a
            literal.
        """
        outputObj = copy.deepcopy(outputObj)

        # Gather every File and Directory object referenced by the output.
        files = []
        def capture(fileobj):
            files.append(fileobj)

        adjustDirObjs(outputObj, capture)
        adjustFileObjs(outputObj, capture)

        generatemapper = NoFollowPathMapper(files, "", "", separateDirs=False)

        final = arvados.collection.Collection(api_client=self.api,
                                              keep_client=self.keep_client,
                                              num_retries=self.num_retries)

        for k,v in generatemapper.items():
            if k.startswith("_:"):
                # Literal (non-Keep) entries: directories need no copy;
                # CreateFile literals are written into the collection.
                if v.type == "Directory":
                    continue
                if v.type == "CreateFile":
                    with final.open(v.target, "wb") as f:
                        f.write(v.resolved.encode("utf-8"))
                    continue

            if not k.startswith("keep:"):
                raise Exception("Output source is not in keep or a literal")
            # k is "keep:<collection>/path...": strip the scheme to get the
            # source collection id, the rest is the path within it.
            sp = k.split("/")
            srccollection = sp[0][5:]
            try:
                reader = self.collection_cache.get(srccollection)
                srcpath = "/".join(sp[1:]) if len(sp) > 1 else "."
                final.copy(srcpath, v.target, source_collection=reader, overwrite=False)
            except arvados.errors.ArgumentError as e:
                logger.error("Creating CollectionReader for '%s' '%s': %s", k, v, e)
                raise
            except IOError as e:
                # Missing/unreadable source file: warn and keep going.
                logger.warn("While preparing output collection: %s", e)

        def rewrite(fileobj):
            # Point each location at its spot in the new collection and drop
            # fields derived from the old location.
            fileobj["location"] = generatemapper.mapper(fileobj["location"]).target
            for k in ("listing", "contents", "nameext", "nameroot", "dirname"):
                if k in fileobj:
                    del fileobj[k]

        adjustDirObjs(outputObj, rewrite)
        adjustFileObjs(outputObj, rewrite)

        # Store the rewritten CWL output object alongside the data.
        with final.open("cwl.output.json", "w") as f:
            json.dump(outputObj, f, sort_keys=True, indent=4, separators=(',',': '))

        final.save_new(name=name, owner_uuid=self.project_uuid, ensure_unique_name=True)

        logger.info("Final output collection %s \"%s\" (%s)", final.portable_data_hash(),
                    final.api_response()["name"],
                    final.manifest_locator())

        final_uuid = final.manifest_locator()
        tags = tagsString.split(',')
        for tag in tags:
            self.api.links().create(body={
                "head_uuid": final_uuid, "link_class": "tag", "name": tag
            }).execute(num_retries=self.num_retries)

        def finalcollection(fileobj):
            fileobj["location"] = "keep:%s/%s" % (final.portable_data_hash(), fileobj["location"])

        adjustDirObjs(outputObj, finalcollection)
        adjustFileObjs(outputObj, finalcollection)

        return (outputObj, final)
-
    def set_crunch_output(self):
        """Record the final output on the enclosing container or job task.

        Only has an effect when this runner itself executes inside crunch;
        otherwise (404 from containers().current(), or no TASK_UUID in the
        environment) it does nothing.
        """
        if self.work_api == "containers":
            try:
                current = self.api.containers().current().execute(num_retries=self.num_retries)
            except ApiError as e:
                # Status code 404 just means we're not running in a container.
                if e.resp.status != 404:
                    logger.info("Getting current container: %s", e)
                return
            try:
                self.api.containers().update(uuid=current['uuid'],
                                             body={
                                                 'output': self.final_output_collection.portable_data_hash(),
                                             }).execute(num_retries=self.num_retries)
                # The output collection record is trashed after its hash is
                # recorded on the container.  NOTE(review): presumably the
                # data remains reachable via the container's output attribute
                # — confirm against the Arvados data model.
                self.api.collections().update(uuid=self.final_output_collection.manifest_locator(),
                                              body={
                                                  'is_trashed': True
                                              }).execute(num_retries=self.num_retries)
            except Exception as e:
                # Best effort: failing to record the output is logged only.
                logger.info("Setting container output: %s", e)
        elif self.work_api == "jobs" and "TASK_UUID" in os.environ:
            self.api.job_tasks().update(uuid=os.environ["TASK_UUID"],
                                        body={
                                            'output': self.final_output_collection.portable_data_hash(),
                                            'success': self.final_status == "success",
                                            'progress':1.0
                                        }).execute(num_retries=self.num_retries)
-
    def arv_executor(self, tool, job_order, **kwargs):
        """Top-level entry point: execute *tool* with *job_order*.

        Depending on kwargs this either (a) creates a pipeline template or
        workflow record and returns, (b) submits a runner job/container and
        returns its uuid without waiting, or (c) runs the workflow, waiting
        for all processes, and returns (final_output, final_status).

        :raises Exception: for option/API combinations that are not
            supported (see the individual checks below).
        :raises UnsupportedRequirement: when a requirement cannot be met.
        :raises WorkflowException: when the workflow produced no result.
        """
        self.debug = kwargs.get("debug")

        # Reject features the selected work API cannot support.
        tool.visit(self.check_features)

        self.project_uuid = kwargs.get("project_uuid")
        self.pipeline = None
        make_fs_access = kwargs.get("make_fs_access") or partial(CollectionFsAccess,
                                                                 collection_cache=self.collection_cache)
        self.fs_access = make_fs_access(kwargs["basedir"])


        # --trash-intermediate / --intermediate-output-ttl only work with
        # the containers API.
        self.trash_intermediate = kwargs["trash_intermediate"]
        if self.trash_intermediate and self.work_api != "containers":
            raise Exception("--trash-intermediate is only supported with --api=containers.")

        self.intermediate_output_ttl = kwargs["intermediate_output_ttl"]
        if self.intermediate_output_ttl and self.work_api != "containers":
            raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
        if self.intermediate_output_ttl < 0:
            raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)

        if not kwargs.get("name"):
            kwargs["name"] = self.name = tool.tool.get("label") or tool.metadata.get("label") or os.path.basename(tool.tool["id"])

        # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
        # Also uploads docker images.
        merged_map = upload_workflow_deps(self, tool)

        # Reload tool object which may have been updated by
        # upload_workflow_deps
        tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                                  makeTool=self.arv_make_tool,
                                  loader=tool.doc_loader,
                                  avsc_names=tool.doc_schema,
                                  metadata=tool.metadata)

        # Upload local file references in the job order.
        job_order = upload_job_order(self, "%s input" % kwargs["name"],
                                     tool, job_order)

        existing_uuid = kwargs.get("update_workflow")
        if existing_uuid or kwargs.get("create_workflow"):
            # Create a pipeline template or workflow record and exit.
            if self.work_api == "jobs":
                tmpl = RunnerTemplate(self, tool, job_order,
                                      kwargs.get("enable_reuse"),
                                      uuid=existing_uuid,
                                      submit_runner_ram=kwargs.get("submit_runner_ram"),
                                      name=kwargs["name"],
                                      merged_map=merged_map)
                tmpl.save()
                # cwltool.main will write our return value to stdout.
                return (tmpl.uuid, "success")
            elif self.work_api == "containers":
                return (upload_workflow(self, tool, job_order,
                                        self.project_uuid,
                                        uuid=existing_uuid,
                                        submit_runner_ram=kwargs.get("submit_runner_ram"),
                                        name=kwargs["name"],
                                        merged_map=merged_map),
                        "success")

        self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse")
        self.eval_timeout = kwargs.get("eval_timeout")

        kwargs["make_fs_access"] = make_fs_access
        kwargs["enable_reuse"] = kwargs.get("enable_reuse")
        kwargs["use_container"] = True
        kwargs["tmpdir_prefix"] = "tmp"
        kwargs["compute_checksum"] = kwargs.get("compute_checksum")

        # Working directories differ per API: fixed container mount points
        # vs. crunch task substitution variables.
        if self.work_api == "containers":
            if self.ignore_docker_for_reuse:
                raise Exception("--ignore-docker-for-reuse not supported with containers API.")
            kwargs["outdir"] = "/var/spool/cwl"
            kwargs["docker_outdir"] = "/var/spool/cwl"
            kwargs["tmpdir"] = "/tmp"
            kwargs["docker_tmpdir"] = "/tmp"
        elif self.work_api == "jobs":
            # Jobs API has no priority support; only the default is allowed.
            if kwargs["priority"] != DEFAULT_PRIORITY:
                raise Exception("--priority not implemented for jobs API.")
            kwargs["outdir"] = "$(task.outdir)"
            kwargs["docker_outdir"] = "$(task.outdir)"
            kwargs["tmpdir"] = "$(task.tmpdir)"

        if kwargs["priority"] < 1 or kwargs["priority"] > 1000:
            raise Exception("--priority must be in the range 1..1000.")

        runnerjob = None
        if kwargs.get("submit"):
            # Submit a runner job to run the workflow for us.
            if self.work_api == "containers":
                if tool.tool["class"] == "CommandLineTool" and kwargs.get("wait"):
                    # Single tool, and we are waiting: submit its container
                    # request directly rather than wrapping it in a runner.
                    kwargs["runnerjob"] = tool.tool["id"]
                    runnerjob = tool.job(job_order,
                                         self.output_callback,
                                         **kwargs).next()
                else:
                    runnerjob = RunnerContainer(self, tool, job_order, kwargs.get("enable_reuse"),
                                                self.output_name,
                                                self.output_tags,
                                                submit_runner_ram=kwargs.get("submit_runner_ram"),
                                                name=kwargs.get("name"),
                                                on_error=kwargs.get("on_error"),
                                                submit_runner_image=kwargs.get("submit_runner_image"),
                                                intermediate_output_ttl=kwargs.get("intermediate_output_ttl"),
                                                merged_map=merged_map,
                                                priority=kwargs.get("priority"))
            elif self.work_api == "jobs":
                runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse"),
                                      self.output_name,
                                      self.output_tags,
                                      submit_runner_ram=kwargs.get("submit_runner_ram"),
                                      name=kwargs.get("name"),
                                      on_error=kwargs.get("on_error"),
                                      submit_runner_image=kwargs.get("submit_runner_image"),
                                      merged_map=merged_map)
        elif "cwl_runner_job" not in kwargs and self.work_api == "jobs":
            # Create pipeline for local run
            self.pipeline = self.api.pipeline_instances().create(
                body={
                    "owner_uuid": self.project_uuid,
                    "name": kwargs["name"] if kwargs.get("name") else shortname(tool.tool["id"]),
                    "components": {},
                    "state": "RunningOnClient"}).execute(num_retries=self.num_retries)
            logger.info("Pipeline instance %s", self.pipeline["uuid"])

        if runnerjob and not kwargs.get("wait"):
            # Fire-and-forget submission: return the uuid immediately.
            runnerjob.run(wait=kwargs.get("wait"))
            return (runnerjob.uuid, "success")

        # Start the background thread that polls process states.
        self.poll_api = arvados.api('v1')
        self.polling_thread = threading.Thread(target=self.poll_states)
        self.polling_thread.start()

        if runnerjob:
            jobiter = iter((runnerjob,))
        else:
            if "cwl_runner_job" in kwargs:
                self.uuid = kwargs.get("cwl_runner_job").get('uuid')
            jobiter = tool.job(job_order,
                               self.output_callback,
                               **kwargs)

        try:
            self.cond.acquire()
            # Will continue to hold the lock for the duration of this code
            # except when in cond.wait(), at which point on_message can update
            # job state and process output callbacks.

            loopperf = Perf(metrics, "jobiter")
            loopperf.__enter__()
            for runnable in jobiter:
                loopperf.__exit__()

                if self.stop_polling.is_set():
                    # The polling thread died; bail out rather than hang.
                    break

                if runnable:
                    with Perf(metrics, "run"):
                        runnable.run(**kwargs)
                else:
                    # Nothing runnable yet: wait for a pending process to
                    # change state, unless there are none (deadlock).
                    if self.processes:
                        self.cond.wait(1)
                    else:
                        logger.error("Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs.")
                        break
                loopperf.__enter__()
            loopperf.__exit__()

            # All jobs issued; wait for the remaining ones to finish.
            while self.processes:
                self.cond.wait(1)

        except UnsupportedRequirement:
            raise
        except:
            if sys.exc_info()[0] is KeyboardInterrupt:
                logger.error("Interrupted, marking pipeline as failed")
            else:
                logger.error("Execution failed: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False))
            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)
            if runnerjob and runnerjob.uuid and self.work_api == "containers":
                # Cancel the submitted runner container by zeroing priority.
                self.api.container_requests().update(uuid=runnerjob.uuid,
                                                     body={"priority": "0"}).execute(num_retries=self.num_retries)
        finally:
            self.cond.release()
            self.stop_polling.set()
            self.polling_thread.join()

        if self.final_status == "UnsupportedRequirement":
            raise UnsupportedRequirement("Check log for details.")

        if self.final_output is None:
            raise WorkflowException("Workflow did not return a result.")

        if kwargs.get("submit") and isinstance(runnerjob, Runner):
            logger.info("Final output collection %s", runnerjob.final_output)
        else:
            if self.output_name is None:
                self.output_name = "Output of %s" % (shortname(tool.tool["id"]))
            if self.output_tags is None:
                self.output_tags = ""
            self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, self.output_tags, self.final_output)
            self.set_crunch_output()

        if kwargs.get("compute_checksum"):
            adjustDirObjs(self.final_output, partial(get_listing, self.fs_access))
            adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access))

        if self.trash_intermediate and self.final_status == "success":
            self.trash_intermediate_output()

        return (self.final_output, self.final_status)
-
-