10111: Merge branch 'master' into 10111-collection-labels
[arvados.git] / sdk / cwl / arvados_cwl / crunch_script.py
1 # Crunch script integration for running arvados-cwl-runner (importing
2 # arvados_cwl module) inside a crunch job.
3 #
4 # This gets the job record, transforms the script parameters into a valid CWL
5 # input object, then executes the CWL runner to run the underlying workflow or
6 # tool.  When the workflow completes, record the output object in an output
7 # collection for this runner job.
8
9 import arvados
10 import arvados_cwl
11 import arvados.collection
12 import arvados.util
13 import cwltool.main
14 import logging
15 import os
16 import json
17 import argparse
18 import re
19 import functools
20
21 from arvados.api import OrderedJsonModel
22 from cwltool.process import shortname
23 from cwltool.pathmapper import adjustFileObjs, adjustDirObjs, normalizeFilesDirs
24 from cwltool.load_tool import load_tool
25 from cwltool.errors import WorkflowException
26
27 from .fsaccess import CollectionFetcher, CollectionFsAccess
28
29 logger = logging.getLogger('arvados.cwl-runner')
30
def run():
    """Run a CWL workflow or tool from inside the current crunch job.

    Takes the current job's ``script_parameters``, turns them into a CWL
    input object, loads the tool named by the ``cwl:tool`` parameter from
    under ``$TASK_KEEPMOUNT``, and passes both to
    ``ArvCwlRunner.arv_executor``.

    The following control keys are removed from the input object before it
    is handed to the executor:

    * ``cwl:tool`` -- path of the tool, relative to ``$TASK_KEEPMOUNT``.
    * ``arv:output_name`` / ``arv:output_tags`` -- passed to the
      ``ArvCwlRunner`` constructor (default ``None``).
    * ``arv:enable_reuse`` -- passed to the executor (default ``True``).
    * ``arv:on_error`` -- passed to the executor (default ``"continue"``).

    Any ``Exception`` raised along the way is logged and then reported by
    updating the current task with ``success: False``; it is not re-raised.
    """
    # Timestamps are added by crunch-job, so don't print redundant timestamps.
    arvados.log_handler.setFormatter(logging.Formatter('%(name)s %(levelname)s: %(message)s'))

    # Print package versions
    logger.info(arvados_cwl.versionstring())

    api = arvados.api("v1")

    arvados_cwl.add_arv_hints()

    # Stays None until the runner is constructed, so the exception handler
    # can tell whether there is a final output collection to report.
    runner = None
    try:
        job = arvados.current_job()
        job_order_object = job['script_parameters']
        toolpath = "file://%s/%s" % (os.environ['TASK_KEEPMOUNT'], job_order_object.pop("cwl:tool"))

        # 32 hex digits, "+", a decimal number, optionally followed by a
        # "/path" suffix.
        pdh_path = re.compile(r'^[0-9a-f]{32}\+\d+(/.+)?$')

        def keeppath(v):
            """Prefix *v* with "keep:" if it matches ``pdh_path``."""
            if pdh_path.match(v):
                return "keep:%s" % v
            return v

        def keeppathObj(v):
            """Rewrite the "location" of a File/Directory object in place."""
            if "location" in v:
                v["location"] = keeppath(v["location"])

        # Top-level string parameters that match the Keep locator pattern
        # become CWL File objects.  Only existing keys are reassigned, so the
        # mapping does not change size while it is being walked.
        for k, v in job_order_object.items():
            if isinstance(v, basestring) and arvados.util.keep_locator_pattern.match(v):
                job_order_object[k] = {
                    "class": "File",
                    "location": "keep:%s" % v
                }

        adjustFileObjs(job_order_object, keeppathObj)
        adjustDirObjs(job_order_object, keeppathObj)
        normalizeFilesDirs(job_order_object)

        # Pull the runner options out of the input object, falling back to
        # the defaults when a key is absent.
        output_name = job_order_object.pop("arv:output_name", None)
        output_tags = job_order_object.pop("arv:output_tags", None)
        enable_reuse = job_order_object.pop("arv:enable_reuse", True)
        on_error = job_order_object.pop("arv:on_error", "continue")

        runner = arvados_cwl.ArvCwlRunner(api_client=arvados.api('v1', model=OrderedJsonModel()),
                                          output_name=output_name, output_tags=output_tags)

        make_fs_access = functools.partial(CollectionFsAccess,
                                           collection_cache=runner.collection_cache)

        t = load_tool(toolpath, runner.arv_make_tool,
                      fetcher_constructor=functools.partial(CollectionFetcher,
                                                            api_client=runner.api,
                                                            fs_access=make_fs_access(""),
                                                            num_retries=runner.num_retries))

        # arv_executor receives its options as keyword arguments; they are
        # collected on a Namespace and expanded with vars() below.
        args = argparse.Namespace()
        args.project_uuid = job["owner_uuid"]
        args.enable_reuse = enable_reuse
        args.on_error = on_error
        args.submit = False
        args.debug = False
        args.quiet = False
        args.ignore_docker_for_reuse = False
        args.basedir = os.getcwd()
        args.name = None
        args.cwl_runner_job = {"uuid": job["uuid"], "state": job["state"]}
        args.make_fs_access = make_fs_access

        runner.arv_executor(t, job_order_object, **vars(args))
    except Exception as e:
        # Log through the module logger (not the root logger) so failure
        # messages use the same logger as the rest of this module.
        if isinstance(e, WorkflowException):
            logger.info("Workflow error %s", e)
        else:
            logger.exception("Unhandled exception")

        # Report whatever output the runner produced before failing, if any.
        if runner and runner.final_output_collection:
            output_collection = runner.final_output_collection.portable_data_hash()
        else:
            output_collection = None

        api.job_tasks().update(uuid=arvados.current_task()['uuid'],
                               body={
                                   'output': output_collection,
                                   'success': False,
                                   'progress': 1.0
                               }).execute()