import fnmatch
import functools
import json
import logging
import os
import re
import sys
import threading

import httplib2

import arvados
import arvados.collection
import arvados.commands.keepdocker
import arvados.commands.run
import arvados.errors
import arvados.events
import arvados.util

import cwltool.docker
import cwltool.draft2tool
import cwltool.main
import cwltool.pathmapper
import cwltool.process
import cwltool.workflow
from cwltool.errors import WorkflowException
from cwltool.process import shortname, get_feature, adjustFiles, scandeps

from arvados.api import OrderedJsonModel
logger = logging.getLogger('arvados.cwl-runner')
logger.setLevel(logging.INFO)
crunchrunner_pdh = "83db29f08544e1c319572a6bd971088a+140"
crunchrunner_download = "https://cloud.curoverse.com/collections/download/qr1hi-4zz18-n3m1yxd0vx78jic/1i1u2qtq66k1atziv4ocfgsg5nu5tj11n4r6e0bhvjg03rix4m/crunchrunner"
certs_download = "https://cloud.curoverse.com/collections/download/qr1hi-4zz18-n3m1yxd0vx78jic/1i1u2qtq66k1atziv4ocfgsg5nu5tj11n4r6e0bhvjg03rix4m/ca-certificates.crt"
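# crunchrunner echoes its task work directories to stderr, and the regexes
# below recover them from the Crunch log stream.  A matching log line looks
# roughly like this (hypothetical example; the two fields before
# "crunchrunner:" vary):
#
#   2016-02-18_19:57:47 qr1hi-8i9sb-xxxxxxxxxxxxxxx 27898 0 stderr run-command crunchrunner crunchrunner: $(task.tmpdir)=/tmp/crunch-job-task-work/compute0.1/tmpdir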
tmpdirre = re.compile(r"^\S+ \S+ \d+ \d+ stderr \S+ \S+ crunchrunner: \$\(task\.tmpdir\)=(.*)")
outdirre = re.compile(r"^\S+ \S+ \d+ \d+ stderr \S+ \S+ crunchrunner: \$\(task\.outdir\)=(.*)")
keepre = re.compile(r"^\S+ \S+ \d+ \d+ stderr \S+ \S+ crunchrunner: \$\(task\.keep\)=(.*)")
def arv_docker_get_image(api_client, dockerRequirement, pull_image, project_uuid):
    if "dockerImageId" not in dockerRequirement and "dockerPull" in dockerRequirement:
        dockerRequirement["dockerImageId"] = dockerRequirement["dockerPull"]

    sp = dockerRequirement["dockerImageId"].split(":")
    image_name = sp[0]
    image_tag = sp[1] if len(sp) > 1 else None

    images = arvados.commands.keepdocker.list_images_in_arv(api_client, 3,
                                                            image_name=image_name,
                                                            image_tag=image_tag)

    if not images:
        # Not found in Keep; pull it locally and upload with arv-keepdocker.
        imageId = cwltool.docker.get_image(dockerRequirement, pull_image)
        args = ["--project-uuid="+project_uuid, image_name]
        if image_tag:
            args.append(image_tag)
        logger.info("Uploading Docker image %s", ":".join(args[1:]))
        arvados.commands.keepdocker.main(args)

    return dockerRequirement["dockerImageId"]
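# Hypothetical usage sketch (identifiers are illustrative, not part of this
# module):
#
#   docker_req = {"class": "DockerRequirement", "dockerPull": "ubuntu:14.04"}
#   image_id = arv_docker_get_image(api, docker_req, pull_image=True,
#                                   project_uuid="qr1hi-j7d0g-xxxxxxxxxxxxxxx")
#   # image_id == "ubuntu:14.04", now findable in Keep via arv-keepdocker.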
class CollectionFsAccess(cwltool.process.StdFsAccess):
    """Implement the cwltool FsAccess interface for files in Keep collections."""

    def __init__(self, basedir):
        self.collections = {}
        self.basedir = basedir
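    # get_collection() splits a path such as (hypothetical example)
    # "keep:99999999999999999999999999999999+99/dir/file.txt" into a
    # (CollectionReader, "dir/file.txt") pair, caching readers by portable
    # data hash.  Non-keep paths yield (None, path) so callers fall back to
    # the local filesystem.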
    def get_collection(self, path):
        p = path.split("/")
        if p[0].startswith("keep:") and arvados.util.keep_locator_pattern.match(p[0][5:]):
            pdh = p[0][5:]
            if pdh not in self.collections:
                self.collections[pdh] = arvados.collection.CollectionReader(pdh)
            return (self.collections[pdh], "/".join(p[1:]))
        else:
            return (None, path)
    def _match(self, collection, patternsegments, parent):
        if not patternsegments:
            return []

        if not isinstance(collection, arvados.collection.RichCollectionBase):
            return []

        ret = []
        # iterate over the files and subcollections in 'collection'
        for filename in collection:
            if patternsegments[0] == '.':
                # Pattern contains something like "./foo" so just shift
                # past the "./"
                ret.extend(self._match(collection, patternsegments[1:], parent))
            elif fnmatch.fnmatch(filename, patternsegments[0]):
                cur = os.path.join(parent, filename)
                if len(patternsegments) == 1:
                    ret.append(cur)
                else:
                    ret.extend(self._match(collection[filename], patternsegments[1:], cur))
        return ret
    def glob(self, pattern):
        collection, rest = self.get_collection(pattern)
        patternsegments = rest.split("/")
        return self._match(collection, patternsegments, "keep:" + collection.manifest_locator())
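    # Hypothetical example: fs.glob("keep:<pdh>/*.txt") returns something like
    # ["keep:<pdh>/a.txt", "keep:<pdh>/b.txt"], where <pdh> stands in for a
    # real portable data hash.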
    def open(self, fn, mode):
        collection, rest = self.get_collection(fn)
        if collection:
            return collection.open(rest, mode)
        else:
            return open(self._abs(fn), mode)
    def exists(self, fn):
        collection, rest = self.get_collection(fn)
        if collection:
            return collection.exists(rest)
        else:
            return os.path.exists(self._abs(fn))
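# ArvadosJob runs a single CWL CommandLineTool invocation as a Crunch job.
# The script_parameters it builds for crunchrunner look roughly like this
# (hypothetical example):
#
#   {"command": ["echo", "hello"],
#    "task.env": {"TMPDIR": "$(task.tmpdir)"},
#    "task.stdout": "out.txt",
#    "task.vwd": {"generated.txt": "$(task.keep)/<pdh>/generated.txt"}}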
class ArvadosJob(object):
    def __init__(self, runner):
        self.arvrunner = runner
        self.running = False
    def run(self, dry_run=False, pull_image=True, **kwargs):
        script_parameters = {
            "command": self.command_line
        }
        runtime_constraints = {}

        if self.generatefiles:
            vwd = arvados.collection.Collection()
            script_parameters["task.vwd"] = {}
            for t in self.generatefiles:
                if isinstance(self.generatefiles[t], dict):
                    src, rest = self.arvrunner.fs_access.get_collection(self.generatefiles[t]["path"].replace("$(task.keep)/", "keep:"))
                    vwd.copy(rest, t, source_collection=src)
                else:
                    with vwd.open(t, "w") as f:
                        f.write(self.generatefiles[t])
            # Save the virtual working directory to Keep so tasks can mount
            # its contents by portable data hash.
            vwd.save_new()
            for t in self.generatefiles:
                script_parameters["task.vwd"][t] = "$(task.keep)/%s/%s" % (vwd.portable_data_hash(), t)
        script_parameters["task.env"] = {"TMPDIR": "$(task.tmpdir)"}
        if self.environment:
            script_parameters["task.env"].update(self.environment)

        if self.stdin:
            script_parameters["task.stdin"] = self.pathmapper.mapper(self.stdin)[1]

        if self.stdout:
            script_parameters["task.stdout"] = self.stdout

        (docker_req, docker_is_req) = get_feature(self, "DockerRequirement")
        if docker_req and kwargs.get("use_container") is not False:
            runtime_constraints["docker_image"] = arv_docker_get_image(self.arvrunner.api, docker_req, pull_image, self.arvrunner.project_uuid)

        resources = self.builder.resources
        if resources is not None:
            runtime_constraints["min_cores_per_node"] = resources.get("cores", 1)
            runtime_constraints["min_ram_mb_per_node"] = resources.get("ram")
            runtime_constraints["min_scratch_mb_per_node"] = resources.get("tmpdirSize", 0) + resources.get("outdirSize", 0)
        try:
            response = self.arvrunner.api.jobs().create(body={
                "owner_uuid": self.arvrunner.project_uuid,
                "script": "crunchrunner",
                "repository": "arvados",
                "script_version": "master",
                "minimum_script_version": "9e5b98e8f5f4727856b53447191f9c06e3da2ba6",
                "script_parameters": {"tasks": [script_parameters], "crunchrunner": crunchrunner_pdh+"/crunchrunner"},
                "runtime_constraints": runtime_constraints
            }, find_or_create=kwargs.get("enable_reuse", True)).execute(num_retries=self.arvrunner.num_retries)

            self.arvrunner.jobs[response["uuid"]] = self

            self.arvrunner.pipeline["components"][self.name] = {"job": response}
            self.arvrunner.pipeline = self.arvrunner.api.pipeline_instances().update(
                uuid=self.arvrunner.pipeline["uuid"],
                body={
                    "components": self.arvrunner.pipeline["components"]
                }).execute(num_retries=self.arvrunner.num_retries)

            logger.info("Job %s (%s) is %s", self.name, response["uuid"], response["state"])

            if response["state"] in ("Complete", "Failed", "Cancelled"):
                self.done(response)
        except Exception as e:
            logger.error("Got error %s", e)
            self.output_callback({}, "permanentFail")
    def update_pipeline_component(self, record):
        self.arvrunner.pipeline["components"][self.name] = {"job": record}
        self.arvrunner.pipeline = self.arvrunner.api.pipeline_instances().update(
            uuid=self.arvrunner.pipeline["uuid"],
            body={
                "components": self.arvrunner.pipeline["components"]
            }).execute(num_retries=self.arvrunner.num_retries)
    def done(self, record):
        try:
            self.update_pipeline_component(record)
        except:
            pass

        try:
            if record["state"] == "Complete":
                processStatus = "success"
            else:
                processStatus = "permanentFail"

            outputs = {}
            try:
                if record["output"]:
                    logc = arvados.collection.Collection(record["log"])
                    log = logc.open(logc.keys()[0])
                    tmpdir = None
                    outdir = None
                    keepdir = None
                    for l in log.readlines():
                        g = tmpdirre.match(l)
                        if g:
                            tmpdir = g.group(1)
                        g = outdirre.match(l)
                        if g:
                            outdir = g.group(1)
                        g = keepre.match(l)
                        if g:
                            keepdir = g.group(1)
                        # It turns out if the job fails and restarts it can
                        # come up on a different compute node, so we have to
                        # read the log to the end to be sure instead of
                        # taking the first match.
                        #if tmpdir and outdir and keepdir:
                        #    break

                    self.builder.outdir = outdir
                    self.builder.pathmapper.keepdir = keepdir
                    outputs = self.collect_outputs("keep:" + record["output"])
            except WorkflowException as e:
                logger.error("Error while collecting job outputs:\n%s", e, exc_info=(e if self.arvrunner.debug else False))
                processStatus = "permanentFail"
            except Exception as e:
                logger.exception("Got unknown exception while collecting job outputs:")
                processStatus = "permanentFail"

            self.output_callback(outputs, processStatus)
        finally:
            del self.arvrunner.jobs[record["uuid"]]
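# A job record passed to done() includes, among other fields, roughly
# (hypothetical example):
#
#   {"uuid": "qr1hi-8i9sb-xxxxxxxxxxxxxxx",
#    "state": "Complete",
#    "output": "<portable data hash of the output collection>",
#    "log": "<portable data hash of the log collection>"}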
class RunnerJob(object):
    """Submit the whole workflow as a single Crunch job running cwl-runner,
    rather than running each step as its own job."""

    def __init__(self, runner, tool, job_order, enable_reuse):
        self.arvrunner = runner
        self.tool = tool
        self.job_order = job_order
        self.running = False
        self.enable_reuse = enable_reuse
    def update_pipeline_component(self, record):
        pass
    def upload_docker(self, tool):
        if isinstance(tool, cwltool.draft2tool.CommandLineTool):
            (docker_req, docker_is_req) = get_feature(tool, "DockerRequirement")
            if docker_req:
                arv_docker_get_image(self.arvrunner.api, docker_req, True, self.arvrunner.project_uuid)
        elif isinstance(tool, cwltool.workflow.Workflow):
            for s in tool.steps:
                self.upload_docker(s.embedded_tool)
    def run(self, dry_run=False, pull_image=True, **kwargs):
        self.upload_docker(self.tool)

        workflowfiles = set()
        jobfiles = set()
        workflowfiles.add(self.tool.tool["id"])

        self.name = os.path.basename(self.tool.tool["id"])

        def visitFiles(files, path):
            files.add(path)
            return path

        document_loader, _, _ = cwltool.process.get_schema()
        def loadref(b, u):
            return document_loader.resolve_ref(u, base_url=b)[0]

        sc = scandeps("", self.tool.tool,
                      set(("$import", "run")),
                      set(("$include", "$schemas", "path")),
                      loadref)
        adjustFiles(sc, functools.partial(visitFiles, workflowfiles))
        adjustFiles(self.job_order, functools.partial(visitFiles, jobfiles))

        workflowmapper = ArvPathMapper(self.arvrunner, workflowfiles, "",
                                       "%s", "%s/%s", name=self.name, **kwargs)

        jobmapper = ArvPathMapper(self.arvrunner, jobfiles, "",
                                  "%s", "%s/%s",
                                  name=os.path.basename(self.job_order.get("id", "#")),
                                  **kwargs)

        adjustFiles(self.job_order, lambda p: jobmapper.mapper(p)[1])

        if "id" in self.job_order:
            del self.job_order["id"]

        self.job_order["cwl:tool"] = workflowmapper.mapper(self.tool.tool["id"])[1]
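        # After mapping, self.job_order references inputs by Keep locator and
        # names the workflow to run, roughly (hypothetical example):
        #
        #   {"input_file": {"class": "File", "path": "<pdh>/input.txt"},
        #    "cwl:tool": "<pdh>/workflow.cwl#main"}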
        response = self.arvrunner.api.jobs().create(body={
            "script": "cwl-runner",
            "script_version": "8654-arv-jobs-cwl-runner",
            "repository": "arvados",
            "script_parameters": self.job_order,
            "runtime_constraints": {
                "docker_image": "arvados/jobs"
            }
        }, find_or_create=self.enable_reuse).execute(num_retries=self.arvrunner.num_retries)

        self.arvrunner.jobs[response["uuid"]] = self

        logger.info("Submitted job %s", response["uuid"])

        if response["state"] in ("Complete", "Failed", "Cancelled"):
            self.done(response)
    def done(self, record):
        if record["state"] == "Complete":
            processStatus = "success"
        else:
            processStatus = "permanentFail"

        outputs = None
        try:
            outc = arvados.collection.Collection(record["output"])
            with outc.open("cwl.output.json") as f:
                outputs = json.load(f)
            self.arvrunner.output_callback(outputs, processStatus)
        finally:
            del self.arvrunner.jobs[record["uuid"]]
class ArvPathMapper(cwltool.pathmapper.PathMapper):
    """Map input file paths to Keep locators, uploading local files as needed."""

    def __init__(self, arvrunner, referenced_files, basedir,
                 collection_pattern, file_pattern, name=None, **kwargs):
        self._pathmap = arvrunner.get_uploaded()
        uploadfiles = []

        pdh_path = re.compile(r'^keep:[0-9a-f]{32}\+\d+/.+')

        for src in referenced_files:
            if isinstance(src, basestring) and pdh_path.match(src):
                self._pathmap[src] = (src, collection_pattern % src[5:])
            if src not in self._pathmap:
                ab = cwltool.pathmapper.abspath(src, basedir)
                st = arvados.commands.run.statfile("", ab, fnPattern=file_pattern)
                if kwargs.get("conformance_test"):
                    self._pathmap[src] = (src, ab)
                elif isinstance(st, arvados.commands.run.UploadFile):
                    uploadfiles.append((src, ab, st))
                elif isinstance(st, arvados.commands.run.ArvFile):
                    self._pathmap[src] = (ab, st.fn)
                else:
                    raise cwltool.workflow.WorkflowException("Input file path '%s' is invalid" % st)

        if uploadfiles:
            arvados.commands.run.uploadfiles([u[2] for u in uploadfiles],
                                             arvrunner.api,
                                             dry_run=kwargs.get("dry_run"),
                                             num_retries=arvrunner.num_retries,
                                             fnPattern=file_pattern,
                                             name=name,
                                             project=arvrunner.project_uuid)

        for src, ab, st in uploadfiles:
            arvrunner.add_uploaded(src, (ab, st.fn))
            self._pathmap[src] = (ab, st.fn)

        self.keepdir = None
    def reversemap(self, target):
        if target.startswith("keep:"):
            return (target, target)
        elif self.keepdir and target.startswith(self.keepdir):
            return (target, "keep:" + target[len(self.keepdir)+1:])
        else:
            return super(ArvPathMapper, self).reversemap(target)
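    # Hypothetical example: with self.keepdir == "/keep", a task-side path is
    # mapped back to a Keep locator:
    #   reversemap("/keep/<pdh>/out.txt") == ("/keep/<pdh>/out.txt", "keep:<pdh>/out.txt")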
class ArvadosCommandTool(cwltool.draft2tool.CommandLineTool):
    def __init__(self, arvrunner, toolpath_object, **kwargs):
        super(ArvadosCommandTool, self).__init__(toolpath_object, **kwargs)
        self.arvrunner = arvrunner

    def makeJobRunner(self):
        return ArvadosJob(self.arvrunner)

    def makePathMapper(self, reffiles, input_basedir, **kwargs):
        return ArvPathMapper(self.arvrunner, reffiles, input_basedir,
                             "$(task.keep)/%s",
                             "$(task.keep)/%s/%s",
                             **kwargs)
class ArvCwlRunner(object):
    """Execute a CWL workflow on Arvados, submitting each step (or, with
    --submit, the whole workflow) as a Crunch job and tracking job state
    through the websocket event stream."""

    def __init__(self, api_client):
        self.api = api_client
        self.jobs = {}
        self.lock = threading.Lock()
        self.cond = threading.Condition(self.lock)
        self.final_output = None
        self.uploaded = {}
        self.num_retries = 4
    def arvMakeTool(self, toolpath_object, **kwargs):
        if "class" in toolpath_object and toolpath_object["class"] == "CommandLineTool":
            return ArvadosCommandTool(self, toolpath_object, **kwargs)
        else:
            return cwltool.workflow.defaultMakeTool(toolpath_object, **kwargs)
    def output_callback(self, out, processStatus):
        if processStatus == "success":
            logger.info("Overall job status is %s", processStatus)
            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Complete"}).execute(num_retries=self.num_retries)
        else:
            logger.warn("Overall job status is %s", processStatus)
            if self.pipeline:
                self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                     body={"state": "Failed"}).execute(num_retries=self.num_retries)
        self.final_output = out
    def on_message(self, event):
        if "object_uuid" in event:
            if event["object_uuid"] in self.jobs and event["event_type"] == "update":
                if event["properties"]["new_attributes"]["state"] == "Running" and self.jobs[event["object_uuid"]].running is False:
                    uuid = event["object_uuid"]
                    with self.lock:
                        j = self.jobs[uuid]
                        logger.info("Job %s (%s) is Running", j.name, uuid)
                        j.running = True
                        j.update_pipeline_component(event["properties"]["new_attributes"])
                elif event["properties"]["new_attributes"]["state"] in ("Complete", "Failed", "Cancelled"):
                    uuid = event["object_uuid"]
                    try:
                        self.cond.acquire()
                        j = self.jobs[uuid]
                        logger.info("Job %s (%s) is %s", j.name, uuid, event["properties"]["new_attributes"]["state"])
                        j.done(event["properties"]["new_attributes"])
                        self.cond.notify()
                    finally:
                        self.cond.release()
    def get_uploaded(self):
        return self.uploaded.copy()

    def add_uploaded(self, src, pair):
        self.uploaded[src] = pair
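    # self.uploaded memoizes local-path -> Keep-path mappings so a file
    # referenced by several steps is uploaded only once, e.g. (hypothetical):
    #   {"/home/me/input.txt": ("/home/me/input.txt", "$(task.keep)/<pdh>/input.txt")}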
    def arvExecutor(self, tool, job_order, input_basedir, args, **kwargs):
        self.debug = args.debug

        if args.quiet:
            logger.setLevel(logging.WARN)
            logging.getLogger('arvados.arv-run').setLevel(logging.WARN)

        try:
            self.api.collections().get(uuid=crunchrunner_pdh).execute()
        except arvados.errors.ApiError as e:
            # The crunchrunner binary is not in Keep yet: download it and the
            # CA bundle, and save them as a new collection.
            h = httplib2.Http(ca_certs=arvados.util.ca_certs_path())
            resp, content = h.request(crunchrunner_download, "GET")
            resp2, content2 = h.request(certs_download, "GET")
            with arvados.collection.Collection() as col:
                with col.open("crunchrunner", "w") as f:
                    f.write(content)
                with col.open("ca-certificates.crt", "w") as f:
                    f.write(content2)

                col.save_new("crunchrunner binary", ensure_unique_name=True)
        useruuid = self.api.users().current().execute()["uuid"]
        self.project_uuid = args.project_uuid if args.project_uuid else useruuid
        self.pipeline = None

        if args.submit:
            runnerjob = RunnerJob(self, tool, job_order, args.enable_reuse)
            if not args.wait:
                runnerjob.run()
                return

        events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message)
        self.fs_access = CollectionFsAccess(input_basedir)

        kwargs["fs_access"] = self.fs_access
        kwargs["enable_reuse"] = args.enable_reuse

        kwargs["outdir"] = "$(task.outdir)"
        kwargs["tmpdir"] = "$(task.tmpdir)"

        if kwargs.get("conformance_test"):
            return cwltool.main.single_job_executor(tool, job_order, input_basedir, args, **kwargs)
        else:
            if args.submit:
                jobiter = iter((runnerjob,))
            else:
                components = {}
                if "cwl_runner_job" in kwargs:
                    components[os.path.basename(tool.tool["id"])] = {"job": kwargs["cwl_runner_job"]}

                self.pipeline = self.api.pipeline_instances().create(
                    body={
                        "owner_uuid": self.project_uuid,
                        "name": shortname(tool.tool["id"]),
                        "components": components,
                        "state": "RunningOnClient"}).execute(num_retries=self.num_retries)

                logger.info("Pipeline instance %s", self.pipeline["uuid"])

                jobiter = tool.job(job_order,
                                   input_basedir,
                                   self.output_callback,
                                   docker_outdir="$(task.outdir)",
                                   **kwargs)

            try:
                self.cond.acquire()
                # Will continue to hold the lock for the duration of this code
                # except when in cond.wait(), at which point on_message can update
                # job state and process output callbacks.

                for runnable in jobiter:
                    if runnable:
                        runnable.run(**kwargs)
                    else:
                        if self.jobs:
                            self.cond.wait(1)
                        else:
                            logger.error("Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs.")
                            break

                while self.jobs:
                    self.cond.wait(1)

                events.close()

                if self.final_output is None:
                    raise cwltool.workflow.WorkflowException("Workflow did not return a result.")

                # create final output collection
            except:
                if sys.exc_info()[0] is KeyboardInterrupt:
                    logger.error("Interrupted, marking pipeline as failed")
                else:
                    logger.error("Caught unhandled exception, marking pipeline as failed. Error was: %s", sys.exc_info()[0], exc_info=(sys.exc_info()[1] if self.debug else False))
                if self.pipeline:
                    self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                         body={"state": "Failed"}).execute(num_retries=self.num_retries)
            finally:
                self.cond.release()

            return self.final_output
def main(args, stdout, stderr, api_client=None):
    args.insert(0, "--leave-outputs")
    parser = cwltool.main.arg_parser()

    exgroup = parser.add_mutually_exclusive_group()
    exgroup.add_argument("--enable-reuse", action="store_true",
                         default=True, dest="enable_reuse",
                         help="Enable job reuse (default)")
    exgroup.add_argument("--disable-reuse", action="store_false",
                         default=True, dest="enable_reuse",
                         help="Disable job reuse")
    parser.add_argument("--project-uuid", type=str, help="Project that will own the workflow jobs")
    parser.add_argument("--submit", action="store_true", help="Submit job and print job uuid.",
                        default=False)
    parser.add_argument("--wait", action="store_true", help="Wait for completion after submitting cwl-runner job.",
                        default=False)
    try:
        if api_client is None:
            api_client = arvados.api('v1', model=OrderedJsonModel())
        runner = ArvCwlRunner(api_client)
    except Exception as e:
        logger.error(e)
        return 1

    return cwltool.main.main(args,
                             stdout=stdout,
                             stderr=stderr,
                             executor=runner.arvExecutor,
                             makeTool=runner.arvMakeTool,
                             parser=parser)
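# Hypothetical invocation sketch (the installed arvados-cwl-runner entry
# point normally calls main() like this):
#
#   if __name__ == "__main__":
#       sys.exit(main(sys.argv[1:], sys.stdout, sys.stderr))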