Merge branch 'master' into 10078-dashboard-perf
[arvados.git] / sdk / cwl / arvados_cwl / runner.py
1 import os
2 import urlparse
3 from functools import partial
4 import logging
5 import json
6 import re
7 from cStringIO import StringIO
8
9 import cwltool.draft2tool
10 from cwltool.draft2tool import CommandLineTool
11 import cwltool.workflow
12 from cwltool.process import get_feature, scandeps, UnsupportedRequirement, normalizeFilesDirs
13 from cwltool.load_tool import fetch_document
14 from cwltool.pathmapper import adjustFileObjs, adjustDirObjs
15
16 import arvados.collection
17 import ruamel.yaml as yaml
18
19 from .arvdocker import arv_docker_get_image
20 from .pathmapper import ArvPathMapper
21
# Module-level logger, registered under the name 'arvados.cwl-runner'.
logger = logging.getLogger('arvados.cwl-runner')

# Overwrite cwltool.draft2tool's ACCEPTLIST_RE with a pattern that matches
# non-empty strings made only of ASCII letters, digits, '.', '_', '+' and '-'.
# NOTE(review): presumably this controls which names cwltool accepts as-is --
# confirm against cwltool.draft2tool before changing the pattern.
cwltool.draft2tool.ACCEPTLIST_RE = re.compile(r"^[a-zA-Z0-9._+-]+$")
25
def trim_listing(obj):
    """Strip fields from a Directory object that should not be passed along.

    When a Directory object is a Keep reference (its "location" starts with
    "keep:"), it is redundant and potentially very expensive to pass the
    fully enumerated "listing" between instances of cwl-runner (e.g. when
    submitting a job, or using the RunInSingleContainer feature), so the
    "listing" field is deleted.

    Independently, a "location" that starts with "_:" is deleted.

    The object is modified in place; nothing is returned.
    """

    location = obj.get("location", "")

    if "listing" in obj and location.startswith("keep:"):
        del obj["listing"]

    if location.startswith("_:"):
        del obj["location"]
40
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.

    Parameters:
      arvrunner, name -- passed through to ArvPathMapper.
      document_loader -- object providing fetch_text(url), used to read raw
                         documents.
      workflowobj     -- document (dict-like) to scan and update in place.
      uri             -- base URI handed to scandeps.
      loadref_run     -- when true, "run" fields are loaded and followed in
                         addition to "$import" fields.
    """

    loaded = set()
    def loadref(b, u):
        """Fetch and parse the raw document at u (resolved against b).

        Each defragmented URL is loaded at most once; later requests for
        the same URL return an empty dict.
        """
        joined = urlparse.urljoin(b, u)
        defrg, _ = urlparse.urldefrag(joined)
        if defrg in loaded:
            return {}
        loaded.add(defrg)
        # Use fetch_text to get raw file (before preprocessing).
        text = document_loader.fetch_text(defrg)
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        # Hand the text straight to the YAML parser.  Wrapping it in
        # cStringIO.StringIO is unsafe: cStringIO cannot hold unicode text
        # that is not plain ASCII, so documents with non-ASCII characters
        # would fail to load.
        return yaml.safe_load(text)

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc = scandeps(uri, scanobj,
                  loadref_fields,
                  set(("$include", "$schemas", "location")),
                  loadref)

    normalizeFilesDirs(sc)

    if "id" in workflowobj:
        # The document itself is uploaded along with its dependencies.
        sc.append({"class": "File", "location": workflowobj["id"]})

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name)

    def setloc(p):
        # Rewrite a location to its mapped target, leaving "_:" locations
        # and locations that already point into Keep untouched.
        if "location" in p and not p["location"].startswith(("_:", "keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved
    adjustFileObjs(workflowobj, setloc)
    adjustDirObjs(workflowobj, setloc)

    return mapper
104
105
def upload_docker(arvrunner, tool):
    """Fetch the Docker images required by a tool, walking into workflows.

    For a CommandLineTool, look up its "DockerRequirement" with get_feature
    and, when one is found, hand it to arv_docker_get_image together with
    arvrunner.api and arvrunner.project_uuid.  For a Workflow, apply the
    same procedure to the embedded tool of every step.  Any other kind of
    tool is ignored.
    """
    if isinstance(tool, CommandLineTool):
        # Only the requirement itself is used; the second element of the
        # pair returned by get_feature is not needed here.
        requirement, _ = get_feature(tool, "DockerRequirement")
        if requirement:
            arv_docker_get_image(arvrunner.api, requirement, True,
                                 arvrunner.project_uuid)
        return

    if isinstance(tool, cwltool.workflow.Workflow):
        for step in tool.steps:
            upload_docker(arvrunner, step.embedded_tool)
114
115
class Runner(object):
    """Bookkeeping for a tool plus job order handled through an arvrunner.

    Holds the runner, the tool, the job order and the reuse flag given at
    construction, uploads the dependencies needed to submit the work
    (arvados_job_spec), and reports the final result back to the runner
    (done).
    """

    def __init__(self, runner, tool, job_order, enable_reuse):
        self.arvrunner = runner
        self.tool = tool
        self.job_order = job_order
        self.running = False
        self.enable_reuse = enable_reuse
        self.uuid = None

    def update_pipeline_component(self, record):
        """Accept a record and do nothing."""
        pass

    def arvados_job_spec(self, *args, **kwargs):
        """Upload Docker images and file dependencies for the tool and job order.

        Sets self.name to the basename of the tool's "id".  References in
        self.tool.tool and self.job_order are rewritten in place by
        upload_dependencies; Directory objects in the job order are then
        passed through trim_listing and the job order's "id" field, if any,
        is removed.

        Returns the path mapper produced for the workflow document.
        """
        upload_docker(self.arvrunner, self.tool)

        self.name = os.path.basename(self.tool.tool["id"])

        workflowmapper = upload_dependencies(self.arvrunner,
                                             self.name,
                                             self.tool.doc_loader,
                                             self.tool.tool,
                                             self.tool.tool["id"],
                                             True)

        # Called only for its side effects (upload plus in-place rewrite of
        # self.job_order); the returned mapper is not needed.
        job_order_id = self.job_order.get("id", "#")
        upload_dependencies(self.arvrunner,
                            os.path.basename(job_order_id),
                            self.tool.doc_loader,
                            self.job_order,
                            job_order_id,
                            False)

        adjustDirObjs(self.job_order, trim_listing)

        if "id" in self.job_order:
            del self.job_order["id"]

        return workflowmapper

    @staticmethod
    def _process_status(record):
        """Translate a record's "state" and "exit_code" into a status string.

        Any state other than "Complete" gives "permanentFail".  For a
        "Complete" record, a missing/None exit code or 0 gives "success",
        33 gives "UnsupportedRequirement", and anything else gives
        "permanentFail".
        """
        if record["state"] != "Complete":
            return "permanentFail"

        exit_code = record.get("exit_code")
        if exit_code is None or exit_code == 0:
            return "success"
        if exit_code == 33:
            return "UnsupportedRequirement"
        return "permanentFail"

    def done(self, record):
        """Report the outcome described by record to the runner.

        Reads "cwl.output.json" from the collection named by
        record["output"], rewrites every File/Directory location that does
        not already start with "keep:" to "keep:<output>/<location>", and
        calls self.arvrunner.output_callback(outputs, status).  Finally the
        entry for record["uuid"] is removed from self.arvrunner.processes,
        whether or not the callback raised.
        """
        processStatus = self._process_status(record)

        outputs = None
        try:
            try:
                outc = arvados.collection.Collection(record["output"])
                with outc.open("cwl.output.json") as f:
                    outputs = json.load(f)

                def keepify(fileobj):
                    path = fileobj["location"]
                    if not path.startswith("keep:"):
                        fileobj["location"] = "keep:%s/%s" % (record["output"], path)
                adjustFileObjs(outputs, keepify)
                adjustDirObjs(outputs, keepify)
            except Exception as e:
                # Best effort: the callback below must still run.  Use
                # logger.exception so the traceback is kept in the log.
                # NOTE(review): outputs may be None (or only partly
                # rewritten) here while processStatus is still "success" --
                # confirm that output_callback copes with that.
                logger.exception("While getting final output object: %s", e)
            self.arvrunner.output_callback(outputs, processStatus)
        finally:
            del self.arvrunner.processes[record["uuid"]]