14198: Resolve to Docker images to PDH and set "http://arvados.org/cwl#dockerCollecti...
[arvados.git] / sdk / cwl / arvados_cwl / fsaccess.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 import fnmatch
6 import os
7 import errno
8 import urlparse
9 import re
10 import logging
11 import threading
12 from collections import OrderedDict
13
14 import ruamel.yaml as yaml
15
16 import cwltool.stdfsaccess
17 from cwltool.pathmapper import abspath
18 import cwltool.resolver
19
20 import arvados.util
21 import arvados.collection
22 import arvados.arvfile
23 import arvados.errors
24
25 from googleapiclient.errors import HttpError
26
27 from schema_salad.ref_resolver import DefaultFetcher
28
# Module-level logger used by the classes and functions in this file; it is
# obtained under the 'arvados.cwl-runner' logger name.
logger = logging.getLogger('arvados.cwl-runner')
30
class CollectionCache(object):
    """Size-bounded, least-recently-used cache of CollectionReader objects.

    Readers are keyed by the identifier passed to get().  Each entry is
    charged an estimated cost of 128 times the length of its manifest text;
    when the summed cost exceeds ``cap`` the least recently used entries are
    dropped by cap_cache().
    """

    def __init__(self, api_client, keep_client, num_retries,
                 cap=256*1024*1024,
                 min_entries=2):
        """Create an empty cache.

        api_client, keep_client and num_retries are passed through to every
        CollectionReader the cache creates.  ``cap`` is the cost budget that
        triggers eviction; ``min_entries`` is the entry count below which
        eviction stops (see cap_cache()).
        """
        self.api_client = api_client
        self.keep_client = keep_client
        self.num_retries = num_retries
        # Insertion order doubles as recency order: oldest first.
        self.collections = OrderedDict()
        self.lock = threading.Lock()
        # Sum of the estimated costs of all cached entries.
        self.total = 0
        self.cap = cap
        self.min_entries = min_entries

    def cap_cache(self):
        """Evict least recently used readers while the cache is over budget.

        Does nothing unless ``total`` exceeds ``cap``.  Otherwise entries are
        removed oldest-first until ``total`` is below ``cap`` or fewer than
        ``min_entries`` entries remain.

        This method takes no lock itself; get() calls it while holding
        ``self.lock``.
        """
        if self.total <= self.cap:
            return

        # Walk a snapshot of the entries.  Entries are deleted inside the
        # loop, and deleting from an OrderedDict while iterating a live
        # items() view raises RuntimeError on Python 3 (Python 2 happened to
        # return a list here).  The snapshot is ordered oldest to newest.
        for pdh, entry in list(self.collections.items()):
            if self.total < self.cap or len(self.collections) < self.min_entries:
                break
            # cut it loose
            logger.debug("Evicting collection reader %s from cache", pdh)
            del self.collections[pdh]
            self.total -= entry[1]

    def get(self, pdh):
        """Return the CollectionReader for *pdh*, creating it on a miss.

        A miss builds a new reader, records its estimated cost and then
        trims the cache.  A hit moves the entry to the most-recently-used
        end.  The whole operation runs under ``self.lock``.
        """
        with self.lock:
            if pdh not in self.collections:
                logger.debug("Creating collection reader for %s", pdh)
                cr = arvados.collection.CollectionReader(pdh, api_client=self.api_client,
                                                         keep_client=self.keep_client,
                                                         num_retries=self.num_retries)
                # Estimated cost of keeping this reader cached.
                sz = len(cr.manifest_text()) * 128
                self.collections[pdh] = (cr, sz)
                self.total += sz
                self.cap_cache()
            else:
                cr, sz = self.collections[pdh]
                # bump it to the back (delete + reinsert works on both
                # Python 2 and Python 3 OrderedDict)
                del self.collections[pdh]
                self.collections[pdh] = (cr, sz)
            return cr
72
73
class CollectionFsAccess(cwltool.stdfsaccess.StdFsAccess):
    """Implement the cwltool FsAccess interface for Arvados Collections.

    Paths whose first segment is ``keep:`` followed by a Keep locator are
    served from the corresponding collection (obtained through the
    collection cache).  Every other path is delegated to the StdFsAccess
    base class.
    """

    def __init__(self, basedir, collection_cache=None):
        """Store *collection_cache*; *basedir* is handed to StdFsAccess."""
        super(CollectionFsAccess, self).__init__(basedir)
        self.collection_cache = collection_cache

    def get_collection(self, path):
        """Split *path* into a collection reader and a path inside it.

        Returns ``(reader, rest)`` when the first "/"-separated segment is
        ``keep:`` followed by text matching
        ``arvados.util.keep_locator_pattern``.  ``rest`` is the URL-unquoted
        remainder after the first "/", or None when there is no "/".

        Returns ``(None, path)`` for every other path.
        """
        sp = path.split("/", 1)
        p = sp[0]
        if p.startswith("keep:") and arvados.util.keep_locator_pattern.match(p[5:]):
            pdh = p[5:]
            return (self.collection_cache.get(pdh), urlparse.unquote(sp[1]) if len(sp) == 2 else None)
        else:
            return (None, path)

    def _match(self, collection, patternsegments, parent):
        """Return the paths under *collection* matching *patternsegments*.

        *patternsegments* is a glob pattern already split on "/"; each
        segment is matched with fnmatch against the entry names at the
        corresponding depth.  *parent* is the path prefix joined in front of
        every match.  Returns an unsorted list of paths.
        """
        if not patternsegments:
            return []

        if not isinstance(collection, arvados.collection.RichCollectionBase):
            return []

        if patternsegments[0] == '.':
            # Pattern contains something like "./foo" so just shift past the
            # "./".  This is done once, before the loop: doing it per entry
            # repeated every match once for each entry in the collection.
            return self._match(collection, patternsegments[1:], parent)

        ret = []
        # iterate over the files and subcollections in 'collection'
        for filename in collection:
            if fnmatch.fnmatch(filename, patternsegments[0]):
                cur = os.path.join(parent, filename)
                if len(patternsegments) == 1:
                    ret.append(cur)
                else:
                    ret.extend(self._match(collection[filename], patternsegments[1:], cur))
        return ret

    def glob(self, pattern):
        """Return the sorted list of paths matching the glob *pattern*.

        A keep: pattern with no path part yields ``[pattern]``.  Patterns
        that do not refer to a collection are delegated to the base class,
        like the other methods of this class do.
        """
        collection, rest = self.get_collection(pattern)
        if collection is None:
            # Not a keep: path; collection.manifest_locator() below would
            # fail on None, so let StdFsAccess handle it.
            return super(CollectionFsAccess, self).glob(pattern)
        if not rest:
            return [pattern]
        patternsegments = rest.split("/")
        return sorted(self._match(collection, patternsegments, "keep:" + collection.manifest_locator()))

    def open(self, fn, mode):
        """Open *fn* from its collection, or through the base class."""
        collection, rest = self.get_collection(fn)
        if collection is not None:
            return collection.open(rest, mode)
        else:
            return super(CollectionFsAccess, self).open(self._abs(fn), mode)

    def exists(self, fn):
        """Return whether *fn* exists.

        An HttpError with status 404 raised while looking up the collection
        means "does not exist"; any other HttpError is re-raised.  A keep:
        path with no path part exists whenever its collection was found.
        """
        try:
            collection, rest = self.get_collection(fn)
        except HttpError as err:
            if err.resp.status == 404:
                return False
            else:
                raise
        if collection is not None:
            if rest:
                return collection.exists(rest)
            else:
                return True
        else:
            return super(CollectionFsAccess, self).exists(fn)

    def size(self, fn):  # type: (unicode) -> int
        """Return the size of the file *fn*.

        Raises IOError(EINVAL) for a keep: path that does not name a file
        inside the collection.
        """
        collection, rest = self.get_collection(fn)
        if collection is not None:
            if rest:
                arvfile = collection.find(rest)
                if isinstance(arvfile, arvados.arvfile.ArvadosFile):
                    return arvfile.size()
            raise IOError(errno.EINVAL, "Not a path to a file %s" % (fn))
        else:
            return super(CollectionFsAccess, self).size(fn)

    def isfile(self, fn):  # type: (unicode) -> bool
        """Return True when *fn* names a file; a bare keep: locator is not one."""
        collection, rest = self.get_collection(fn)
        if collection is not None:
            if rest:
                return isinstance(collection.find(rest), arvados.arvfile.ArvadosFile)
            else:
                return False
        else:
            return super(CollectionFsAccess, self).isfile(fn)

    def isdir(self, fn):  # type: (unicode) -> bool
        """Return True when *fn* names a directory; a bare keep: locator is one."""
        collection, rest = self.get_collection(fn)
        if collection is not None:
            if rest:
                return isinstance(collection.find(rest), arvados.collection.RichCollectionBase)
            else:
                return True
        else:
            return super(CollectionFsAccess, self).isdir(fn)

    def listdir(self, fn):  # type: (unicode) -> List[unicode]
        """Return the entries of directory *fn*, each made absolute against *fn*.

        Raises IOError(ENOENT) when the path is missing from the collection
        or is not a directory.
        """
        collection, rest = self.get_collection(fn)
        if collection is not None:
            # With no path part the collection itself is the directory.
            target = collection.find(rest) if rest else collection
            if target is None:
                raise IOError(errno.ENOENT, "Directory '%s' in '%s' not found" % (rest, collection.portable_data_hash()))
            if not isinstance(target, arvados.collection.RichCollectionBase):
                raise IOError(errno.ENOENT, "Path '%s' in '%s' is not a Directory" % (rest, collection.portable_data_hash()))
            return [abspath(l, fn) for l in target.keys()]
        else:
            return super(CollectionFsAccess, self).listdir(fn)

    def join(self, path, *paths): # type: (unicode, *unicode) -> unicode
        """Join path components.

        When the last component is itself a ``keep:`` locator it is returned
        unchanged; otherwise this is os.path.join.
        """
        if paths and paths[-1].startswith("keep:") and arvados.util.keep_locator_pattern.match(paths[-1][5:]):
            return paths[-1]
        return os.path.join(path, *paths)

    def realpath(self, path):
        """Return *path* resolved to a real path.

        Paths starting with ``$(task.tmpdir)`` or ``$(task.outdir)`` and
        paths that refer to a collection are returned unchanged; anything
        else goes through os.path.realpath.
        """
        if path.startswith("$(task.tmpdir)") or path.startswith("$(task.outdir)"):
            return path
        collection, rest = self.get_collection(path)
        if collection is not None:
            return path
        else:
            return os.path.realpath(path)
201
class CollectionFetcher(DefaultFetcher):
    """Fetcher that understands ``keep:`` and ``arvwf:`` references.

    Anything that is not one of those schemes is handled by DefaultFetcher.
    """

    def __init__(self, cache, session, api_client=None, fs_access=None, num_retries=4):
        """Store the API client, filesystem access object and retry count.

        *cache* and *session* are handed to DefaultFetcher.
        """
        super(CollectionFetcher, self).__init__(cache, session)
        self.api_client = api_client
        self.fsaccess = fs_access
        self.num_retries = num_retries

    def fetch_text(self, url):
        """Return the text content behind *url*.

        ``keep:`` URLs are read through ``self.fsaccess``.  ``arvwf:`` URLs
        fetch the workflow record whose uuid follows the prefix and return
        its definition with a ``label`` line built from the record name
        appended.  Other URLs go to DefaultFetcher.
        """
        if url.startswith("keep:"):
            with self.fsaccess.open(url, "r") as f:
                return f.read()
        if url.startswith("arvwf:"):
            record = self.api_client.workflows().get(uuid=url[6:]).execute(num_retries=self.num_retries)
            # Double quotes in the name are escaped so the label stays a
            # single quoted scalar.
            definition = record["definition"] + ('\nlabel: "%s"\n' % record["name"].replace('"', '\\"'))
            return definition
        return super(CollectionFetcher, self).fetch_text(url)

    def check_exists(self, url):
        """Return whether *url* refers to something that exists.

        URLs starting with ``http://arvados.org/cwl`` are always reported as
        existing.  ``keep:`` URLs are checked through ``self.fsaccess``;
        ``arvwf:`` URLs are checked by fetching them.  A NotFoundError means
        False; any other Exception is logged and also reported as False.
        Everything else is delegated to DefaultFetcher.
        """
        try:
            if url.startswith("http://arvados.org/cwl"):
                return True
            if url.startswith("keep:"):
                return self.fsaccess.exists(url)
            if url.startswith("arvwf:"):
                if self.fetch_text(url):
                    return True
        except arvados.errors.NotFoundError:
            return False
        except Exception:
            # Deliberately best-effort, but limited to Exception so that
            # KeyboardInterrupt and SystemExit still propagate.
            logger.exception("Got unexpected exception checking if file exists:")
            return False
        return super(CollectionFetcher, self).check_exists(url)

    def urljoin(self, base_url, url):
        """Resolve *url* relative to *base_url*.

        An empty *url* yields *base_url*; a *url* with its own scheme, or an
        empty *base_url*, yields *url*.  For ``keep:`` and ``arvwf:`` bases
        the first path segment of the base (the locator or uuid) is always
        kept and the rest of the path is resolved against it.  Other bases
        are delegated to DefaultFetcher.

        Raises IOError(EINVAL) when a keep/arvwf base has no path, or when a
        keep base does not start with a valid Keep locator.
        """
        if not url:
            return base_url

        urlsp = urlparse.urlsplit(url)
        if urlsp.scheme or not base_url:
            return url

        basesp = urlparse.urlsplit(base_url)
        if basesp.scheme in ("keep", "arvwf"):
            if not basesp.path:
                raise IOError(errno.EINVAL, "Invalid Keep locator", base_url)

            baseparts = basesp.path.split("/")
            urlparts = urlsp.path.split("/") if urlsp.path else []

            # First segment of the base path is the locator / uuid.
            pdh = baseparts.pop(0)

            if basesp.scheme == "keep" and not arvados.util.keep_locator_pattern.match(pdh):
                raise IOError(errno.EINVAL, "Invalid Keep locator", base_url)

            if urlsp.path.startswith("/"):
                # Absolute path: discard the base path and the empty
                # segment produced by the leading "/".
                baseparts = []
                urlparts.pop(0)

            if baseparts and urlsp.path:
                # Relative path: replace the last segment of the base.
                baseparts.pop()

            path = "/".join([pdh] + baseparts + urlparts)
            return urlparse.urlunsplit((basesp.scheme, "", path, "", urlsp.fragment))

        return super(CollectionFetcher, self).urljoin(base_url, url)
267
# Patterns recognising workflow and pipeline template uuids.  They are
# applied with match(), so they are anchored at the start of the string only.
workflow_uuid_pattern = re.compile(r'[a-z0-9]{5}-7fd4e-[a-z0-9]{15}')
pipeline_template_uuid_pattern = re.compile(r'[a-z0-9]{5}-p5p6p-[a-z0-9]{15}')

def collectionResolver(api_client, document_loader, uri, num_retries=4):
    """Turn a workflow reference into a URI the fetcher can load.

    Checks are applied in this order:

    * ``keep:`` and ``arvwf:`` URIs are returned unchanged;
    * a workflow uuid becomes ``arvwf:<uuid>#main``;
    * a pipeline template uuid is looked up and resolved to ``keep:`` plus
      the ``cwl:tool`` script parameter of its first component;
    * a path starting with a Keep locator gets a ``keep:`` prefix;
    * a path starting with a collection uuid has that uuid replaced by the
      collection's portable data hash, with a ``keep:`` prefix;
    * anything else is handed to cwltool's tool_resolver.

    *num_retries* is passed to every API request made here.
    """
    if uri.startswith("keep:") or uri.startswith("arvwf:"):
        return uri

    if workflow_uuid_pattern.match(uri):
        return "arvwf:%s#main" % (uri)

    if pipeline_template_uuid_pattern.match(uri):
        pt = api_client.pipeline_templates().get(uuid=uri).execute(num_retries=num_retries)
        # list() keeps this working where dict.values() returns a view that
        # cannot be indexed (Python 3).
        first_component = list(pt["components"].values())[0]
        return "keep:" + first_component["script_parameters"]["cwl:tool"]

    p = uri.split("/")
    if arvados.util.keep_locator_pattern.match(p[0]):
        return "keep:%s" % (uri)

    if arvados.util.collection_uuid_pattern.match(p[0]):
        # Honour num_retries here as well, like the pipeline template
        # lookup above.
        collection = api_client.collections().get(uuid=p[0]).execute(num_retries=num_retries)
        return "keep:%s%s" % (collection["portable_data_hash"],
                              uri[len(p[0]):])

    return cwltool.resolver.tool_resolver(document_loader, uri)
290
291     return cwltool.resolver.tool_resolver(document_loader, uri)