Merge branch '14495-crunch-docker-space' refs #14495
[arvados.git] / sdk / cwl / arvados_cwl / fsaccess.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 import fnmatch
6 import os
7 import errno
8 import urlparse
9 import re
10 import logging
11 import threading
12 from collections import OrderedDict
13
14 import ruamel.yaml as yaml
15
16 import cwltool.stdfsaccess
17 from cwltool.pathmapper import abspath
18 import cwltool.resolver
19
20 import arvados.util
21 import arvados.collection
22 import arvados.arvfile
23 import arvados.errors
24
25 from googleapiclient.errors import HttpError
26
27 from schema_salad.ref_resolver import DefaultFetcher
28
# Logger shared by the classes and functions in this module.
logger = logging.getLogger('arvados.cwl-runner')

# Matches "<32 hex digits>+<digits>" followed by any number of "+hint"
# suffixes.  Group 2 captures the digits after the first "+".
pdh_size = re.compile(r'([0-9a-f]{32})\+(\d+)(\+\S+)*')

class CollectionCache(object):
    """Size-capped cache of CollectionReader objects keyed by locator.

    Entries live in an OrderedDict ordered from least to most recently
    used.  Each entry is a ``(reader, size)`` tuple where ``size`` is the
    cost charged against ``cap`` for that reader.
    """

    def __init__(self, api_client, keep_client, num_retries,
                 cap=256*1024*1024,
                 min_entries=2):
        """Create an empty cache.

        :param api_client: passed through to every CollectionReader.
        :param keep_client: passed through to every CollectionReader.
        :param num_retries: passed through to every CollectionReader.
        :param cap: upper bound for the summed entry sizes.
        :param min_entries: eviction stops once fewer than this many
            entries remain, even if the cap is still exceeded.
        """
        self.api_client = api_client
        self.keep_client = keep_client
        self.num_retries = num_retries
        self.collections = OrderedDict()
        self.lock = threading.Lock()
        self.total = 0
        self.cap = cap
        self.min_entries = min_entries

    def set_cap(self, cap):
        """Replace the size cap.  Existing entries are not evicted here."""
        self.cap = cap

    def cap_cache(self, required):
        """Evict oldest entries until ``required`` units fit under the cap.

        Stops early when enough room is available or when fewer than
        ``min_entries`` entries remain.  Does not take ``self.lock``;
        ``get()`` calls this while already holding it.
        """
        # Iterate over a snapshot (oldest to newest): entries are deleted
        # inside the loop, and mutating an OrderedDict while iterating a
        # live view raises RuntimeError on Python 3.
        for pdh, entry in list(self.collections.items()):
            available = self.cap - self.total
            if available >= required or len(self.collections) < self.min_entries:
                return
            # cut it loose
            logger.debug("Evicting collection reader %s from cache (cap %s total %s required %s)", pdh, self.cap, self.total, required)
            del self.collections[pdh]
            self.total -= entry[1]

    def get(self, pdh):
        """Return the CollectionReader for ``pdh``, creating it on a miss.

        The whole lookup runs under ``self.lock``.  On a miss, room is
        made first (when ``pdh`` matches ``pdh_size``) and the new reader
        is charged ``len(manifest_text) * 128``.  On a hit the entry is
        moved to the most-recently-used end.
        """
        with self.lock:
            if pdh not in self.collections:
                m = pdh_size.match(pdh)
                if m:
                    # NOTE(review): the factor 128 is presumably a rough
                    # estimate of in-memory cost per manifest byte -- confirm.
                    self.cap_cache(int(m.group(2)) * 128)
                logger.debug("Creating collection reader for %s", pdh)
                cr = arvados.collection.CollectionReader(pdh, api_client=self.api_client,
                                                         keep_client=self.keep_client,
                                                         num_retries=self.num_retries)
                sz = len(cr.manifest_text()) * 128
                self.collections[pdh] = (cr, sz)
                self.total += sz
            else:
                cr, sz = self.collections[pdh]
                # bump it to the back (delete + reinsert works on both
                # Python 2 and 3 OrderedDict)
                del self.collections[pdh]
                self.collections[pdh] = (cr, sz)
            return cr
79
80
class CollectionFsAccess(cwltool.stdfsaccess.StdFsAccess):
    """Implement the cwltool FsAccess interface for Arvados Collections.

    Paths whose first segment is "keep:<locator>" are resolved against a
    collection reader obtained from ``collection_cache``; every other
    path is delegated to the StdFsAccess base class.
    """

    def __init__(self, basedir, collection_cache=None):
        super(CollectionFsAccess, self).__init__(basedir)
        self.collection_cache = collection_cache

    def get_collection(self, path):
        """Split ``path`` into ``(collection, path_inside_collection)``.

        For a "keep:<locator>[/<rest>]" path, returns the cached reader
        and the URL-unquoted ``<rest>`` (``None`` when there is no "/").
        For any other path, returns ``(None, path)``.
        """
        sp = path.split("/", 1)
        p = sp[0]
        if p.startswith("keep:") and arvados.util.keep_locator_pattern.match(p[5:]):
            pdh = p[5:]
            return (self.collection_cache.get(pdh), urlparse.unquote(sp[1]) if len(sp) == 2 else None)
        else:
            return (None, path)

    def _match(self, collection, patternsegments, parent):
        """Recursively match fnmatch pattern segments against a collection.

        :param collection: collection or subcollection to search; anything
            that is not a RichCollectionBase yields no matches.
        :param patternsegments: remaining "/"-separated pattern segments.
        :param parent: path prefix to prepend to each match.
        :returns: list of matching paths (unsorted).
        """
        if not patternsegments:
            return []

        if not isinstance(collection, arvados.collection.RichCollectionBase):
            return []

        if patternsegments[0] == '.':
            # Pattern contains something like "./foo" so just shift past
            # the "./".  This is done once, outside the entry loop;
            # doing it per entry would repeat every match once for each
            # item in the collection.
            return self._match(collection, patternsegments[1:], parent)

        ret = []
        # iterate over the files and subcollections in 'collection'
        for filename in collection:
            if fnmatch.fnmatch(filename, patternsegments[0]):
                cur = os.path.join(parent, filename)
                if len(patternsegments) == 1:
                    ret.append(cur)
                else:
                    ret.extend(self._match(collection[filename], patternsegments[1:], cur))
        return ret

    def glob(self, pattern):
        """Return the sorted list of paths matching ``pattern``.

        A bare "keep:<locator>" pattern matches itself.  Patterns that do
        not refer to a collection are handled by the base class.
        """
        collection, rest = self.get_collection(pattern)
        if collection is None:
            # Not a keep: reference; there is no collection to search, so
            # defer to the local implementation like the other methods do.
            return super(CollectionFsAccess, self).glob(pattern)
        if not rest:
            return [pattern]
        patternsegments = rest.split("/")
        return sorted(self._match(collection, patternsegments, "keep:" + collection.manifest_locator()))

    def open(self, fn, mode):
        """Open ``fn`` from its collection, or from the base class otherwise."""
        collection, rest = self.get_collection(fn)
        if collection is not None:
            return collection.open(rest, mode)
        else:
            return super(CollectionFsAccess, self).open(self._abs(fn), mode)

    def exists(self, fn):
        """Return whether ``fn`` exists.

        An HTTP 404 raised while looking up the collection is reported as
        ``False``; any other HttpError propagates.
        """
        try:
            collection, rest = self.get_collection(fn)
        except HttpError as err:
            if err.resp.status == 404:
                return False
            else:
                raise
        if collection is not None:
            if rest:
                return collection.exists(rest)
            else:
                # The collection itself was found.
                return True
        else:
            return super(CollectionFsAccess, self).exists(fn)

    def size(self, fn):  # type: (unicode) -> int
        """Return the size of the file at ``fn``.

        :raises IOError: (EINVAL) when ``fn`` is inside a collection but
            does not resolve to an ArvadosFile.
        """
        collection, rest = self.get_collection(fn)
        if collection is not None:
            if rest:
                arvfile = collection.find(rest)
                if isinstance(arvfile, arvados.arvfile.ArvadosFile):
                    return arvfile.size()
            raise IOError(errno.EINVAL, "Not a path to a file %s" % (fn))
        else:
            return super(CollectionFsAccess, self).size(fn)

    def isfile(self, fn):  # type: (unicode) -> bool
        """Return True when ``fn`` resolves to an ArvadosFile (or local file)."""
        collection, rest = self.get_collection(fn)
        if collection is not None:
            if rest:
                return isinstance(collection.find(rest), arvados.arvfile.ArvadosFile)
            else:
                # A bare collection reference is never a file.
                return False
        else:
            return super(CollectionFsAccess, self).isfile(fn)

    def isdir(self, fn):  # type: (unicode) -> bool
        """Return True when ``fn`` resolves to a collection or subcollection."""
        collection, rest = self.get_collection(fn)
        if collection is not None:
            if rest:
                return isinstance(collection.find(rest), arvados.collection.RichCollectionBase)
            else:
                # A bare collection reference is treated as a directory.
                return True
        else:
            return super(CollectionFsAccess, self).isdir(fn)

    def listdir(self, fn):  # type: (unicode) -> List[unicode]
        """List the entries of directory ``fn``, each made absolute via abspath.

        :raises IOError: (ENOENT) when the path is missing from the
            collection or is not a directory.
        """
        collection, rest = self.get_collection(fn)
        if collection is not None:
            if rest:
                target = collection.find(rest)
            else:
                target = collection
            if target is None:
                raise IOError(errno.ENOENT, "Directory '%s' in '%s' not found" % (rest, collection.portable_data_hash()))
            if not isinstance(target, arvados.collection.RichCollectionBase):
                raise IOError(errno.ENOENT, "Path '%s' in '%s' is not a Directory" % (rest, collection.portable_data_hash()))
            return [abspath(entry, fn) for entry in target.keys()]
        else:
            return super(CollectionFsAccess, self).listdir(fn)

    def join(self, path, *paths): # type: (unicode, *unicode) -> unicode
        """Join path components; a trailing keep: reference wins outright."""
        if paths and paths[-1].startswith("keep:") and arvados.util.keep_locator_pattern.match(paths[-1][5:]):
            return paths[-1]
        return os.path.join(path, *paths)

    def realpath(self, path):
        """Return ``path`` unchanged for task placeholders and keep: paths.

        Anything else is resolved with ``os.path.realpath``.
        """
        if path.startswith("$(task.tmpdir)") or path.startswith("$(task.outdir)"):
            return path
        collection, rest = self.get_collection(path)
        if collection is not None:
            return path
        else:
            return os.path.realpath(path)
208
class CollectionFetcher(DefaultFetcher):
    """Fetcher that adds the "keep:" and "arvwf:" schemes to DefaultFetcher."""

    def __init__(self, cache, session, api_client=None, fs_access=None, num_retries=4):
        super(CollectionFetcher, self).__init__(cache, session)
        self.api_client = api_client
        self.fsaccess = fs_access
        self.num_retries = num_retries

    def fetch_text(self, url):
        """Return the text behind ``url``.

        "keep:" URLs are read through ``self.fsaccess``.  "arvwf:" URLs
        fetch the workflow record whose uuid is everything after the
        prefix and return its definition with a ``label`` line built from
        the record name appended.  Other URLs go to the base class.
        """
        if url.startswith("keep:"):
            with self.fsaccess.open(url, "r") as f:
                return f.read()
        if url.startswith("arvwf:"):
            record = self.api_client.workflows().get(uuid=url[6:]).execute(num_retries=self.num_retries)
            # The name is embedded in a double-quoted YAML scalar, where
            # backslash is the escape character, so escape backslashes
            # before escaping the quotes themselves.
            label = record["name"].replace('\\', '\\\\').replace('"', '\\"')
            definition = record["definition"] + ('\nlabel: "%s"\n' % label)
            return definition
        return super(CollectionFetcher, self).fetch_text(url)

    def check_exists(self, url):
        """Return whether ``url`` can be resolved.

        NotFoundError means "does not exist"; any other exception raised
        by the Arvados-specific checks is logged and also reported as
        ``False``.
        """
        try:
            if url.startswith("http://arvados.org/cwl"):
                return True
            if url.startswith("keep:"):
                return self.fsaccess.exists(url)
            if url.startswith("arvwf:"):
                if self.fetch_text(url):
                    return True
        except arvados.errors.NotFoundError:
            return False
        except Exception:
            # Deliberately best-effort, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare except would.
            logger.exception("Got unexpected exception checking if file exists:")
            return False
        return super(CollectionFetcher, self).check_exists(url)

    def urljoin(self, base_url, url):
        """Resolve ``url`` relative to ``base_url``.

        An empty ``url`` yields ``base_url``; a ``url`` with its own
        scheme (or an empty ``base_url``) is returned as is.  For "keep"
        and "arvwf" bases the first path segment of the base is always
        preserved and the remainder is joined like a relative path.
        Other schemes are handled by the base class.

        :raises IOError: (EINVAL) when the base has no path or, for the
            "keep" scheme, its first segment is not a valid locator.
        """
        if not url:
            return base_url

        urlsp = urlparse.urlsplit(url)
        if urlsp.scheme or not base_url:
            return url

        basesp = urlparse.urlsplit(base_url)
        if basesp.scheme in ("keep", "arvwf"):
            if not basesp.path:
                raise IOError(errno.EINVAL, "Invalid Keep locator", base_url)

            baseparts = basesp.path.split("/")
            urlparts = urlsp.path.split("/") if urlsp.path else []

            # First segment of the base path; kept as the root of the result.
            pdh = baseparts.pop(0)

            if basesp.scheme == "keep" and not arvados.util.keep_locator_pattern.match(pdh):
                raise IOError(errno.EINVAL, "Invalid Keep locator", base_url)

            if urlsp.path.startswith("/"):
                # Absolute reference: discard the base directory and the
                # empty segment produced by the leading "/".
                baseparts = []
                urlparts.pop(0)

            if baseparts and urlsp.path:
                # Relative reference: replace the last segment of the base.
                baseparts.pop()

            path = "/".join([pdh] + baseparts + urlparts)
            return urlparse.urlunsplit((basesp.scheme, "", path, "", urlsp.fragment))

        return super(CollectionFetcher, self).urljoin(base_url, url)

    # URL schemes reported by supported_schemes().
    schemes = [u"file", u"http", u"https", u"mailto", u"keep", u"arvwf"]

    def supported_schemes(self):  # type: () -> List[Text]
        """Return the list of URL schemes this fetcher accepts."""
        return self.schemes
279
280
# UUID shapes recognised by collectionResolver: five characters, a fixed
# infix ("7fd4e" for workflows, "p5p6p" for pipeline templates), then
# fifteen characters.
workflow_uuid_pattern = re.compile(r'[a-z0-9]{5}-7fd4e-[a-z0-9]{15}')
pipeline_template_uuid_pattern = re.compile(r'[a-z0-9]{5}-p5p6p-[a-z0-9]{15}')

def collectionResolver(api_client, document_loader, uri, num_retries=4):
    """Translate an Arvados-style reference into a URI the fetcher accepts.

    Resolution order:

    1. "keep:" / "arvwf:" URIs are returned unchanged.
    2. A workflow UUID becomes "arvwf:<uri>#main".
    3. A pipeline template UUID is looked up and resolved to the
       "cwl:tool" script parameter of its first component.
    4. A path starting with a Keep locator gets a "keep:" prefix.
    5. A path starting with a collection UUID has the UUID replaced by
       the collection's portable data hash.
    6. Anything else is handed to cwltool's default tool resolver.

    :param api_client: Arvados API client; only used for steps 3 and 5.
    :param document_loader: passed through to cwltool's resolver.
    :param uri: the reference to resolve.
    :param num_retries: retry count for API requests.
    """
    if uri.startswith("keep:") or uri.startswith("arvwf:"):
        return uri

    if workflow_uuid_pattern.match(uri):
        return "arvwf:%s#main" % (uri)

    if pipeline_template_uuid_pattern.match(uri):
        pt = api_client.pipeline_templates().get(uuid=uri).execute(num_retries=num_retries)
        # dict.values() is a non-indexable view on Python 3, so take the
        # first component through an iterator instead of "[0]".
        first_component = next(iter(pt["components"].values()))
        return "keep:" + first_component["script_parameters"]["cwl:tool"]

    p = uri.split("/")
    if arvados.util.keep_locator_pattern.match(p[0]):
        return "keep:%s" % (uri)

    if arvados.util.collection_uuid_pattern.match(p[0]):
        # Honour num_retries here as well, matching the template lookup.
        collection = api_client.collections().get(uuid=p[0]).execute(num_retries=num_retries)
        return "keep:%s%s" % (collection["portable_data_hash"],
                              uri[len(p[0]):])

    return cwltool.resolver.tool_resolver(document_loader, uri)
303
304     return cwltool.resolver.tool_resolver(document_loader, uri)