13365: Reintroduce secondaryFile discovery.
[arvados.git] / sdk / cwl / arvados_cwl / fsaccess.py
1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 import fnmatch
6 import os
7 import errno
8 import urlparse
9 import re
10 import logging
11 import threading
12 from collections import OrderedDict
13
14 import ruamel.yaml as yaml
15
16 import cwltool.stdfsaccess
17 from cwltool.pathmapper import abspath
18 import cwltool.resolver
19
20 import arvados.util
21 import arvados.collection
22 import arvados.arvfile
23 import arvados.errors
24
25 from googleapiclient.errors import HttpError
26
27 from schema_salad.ref_resolver import DefaultFetcher
28
logger = logging.getLogger('arvados.cwl-runner')

class CollectionCache(object):
    """Size-capped cache of CollectionReader objects keyed by portable data hash.

    Entries live in an OrderedDict ordered from least to most recently used.
    Each entry is a ``(reader, size)`` tuple, where ``size`` is an estimate
    computed as ``len(manifest_text) * 128``.  ``total`` is the sum of the
    estimates of all cached entries.
    """

    def __init__(self, api_client, keep_client, num_retries,
                 cap=256*1024*1024,
                 min_entries=2):
        """Create an empty cache.

        :param api_client: passed through to every CollectionReader created.
        :param keep_client: passed through to every CollectionReader created.
        :param num_retries: passed through to every CollectionReader created.
        :param cap: eviction starts once the summed size estimates exceed this.
        :param min_entries: eviction never shrinks the cache below this many
            entries, even if the cap is still exceeded.
        """
        self.api_client = api_client
        self.keep_client = keep_client
        self.num_retries = num_retries
        self.collections = OrderedDict()
        self.lock = threading.Lock()
        self.total = 0
        self.cap = cap
        self.min_entries = min_entries

    def cap_cache(self):
        """Evict least recently used entries until the cache fits the cap.

        Stops as soon as ``total`` is within ``cap`` or only ``min_entries``
        entries remain.  Does not take ``self.lock`` itself; ``get()`` calls
        it while already holding the lock.
        """
        if self.total <= self.cap:
            return

        # Iterate over a snapshot (oldest to newest): entries are deleted
        # inside the loop, and mutating a dict while iterating a live view
        # raises RuntimeError on Python 3.
        for pdh, (_, size) in list(self.collections.items()):
            if self.total <= self.cap or len(self.collections) <= self.min_entries:
                break
            # cut it loose
            logger.debug("Evicting collection reader %s from cache", pdh)
            del self.collections[pdh]
            self.total -= size

    def get(self, pdh):
        """Return the CollectionReader for ``pdh``, creating it on a miss.

        A hit moves the entry to the most-recently-used end.  A miss creates
        a new reader, accounts for its estimated size and then trims the
        cache with ``cap_cache()``.
        """
        with self.lock:
            if pdh in self.collections:
                # bump it to the back (delete + reinsert works on both the
                # Python 2 and Python 3 OrderedDict)
                cr, sz = self.collections.pop(pdh)
                self.collections[pdh] = (cr, sz)
                return cr

            logger.debug("Creating collection reader for %s", pdh)
            cr = arvados.collection.CollectionReader(pdh, api_client=self.api_client,
                                                     keep_client=self.keep_client,
                                                     num_retries=self.num_retries)
            sz = len(cr.manifest_text()) * 128
            self.collections[pdh] = (cr, sz)
            self.total += sz
            self.cap_cache()
            return cr
72
73
class CollectionFsAccess(cwltool.stdfsaccess.StdFsAccess):
    """Implement the cwltool FsAccess interface for Arvados Collections.

    Paths of the form ``keep:<locator>[/<path>]`` are served from collection
    readers obtained through ``collection_cache``; every other path is
    delegated to the StdFsAccess base class.
    """

    def __init__(self, basedir, collection_cache=None):
        """Store the cache used to look up collection readers.

        :param basedir: forwarded to StdFsAccess.
        :param collection_cache: object whose ``get(pdh)`` returns a
            collection reader (see get_collection()).
        """
        super(CollectionFsAccess, self).__init__(basedir)
        self.collection_cache = collection_cache

    def get_collection(self, path):
        """Split ``path`` into a collection reader and a path inside it.

        :returns: ``(reader, rest)`` when the first path segment is
            ``keep:`` followed by a valid Keep locator; ``rest`` is the
            remainder after the first "/" or None if there is none.
            Otherwise ``(None, path)``.
        """
        sp = path.split("/", 1)
        p = sp[0]
        if p.startswith("keep:") and arvados.util.keep_locator_pattern.match(p[5:]):
            pdh = p[5:]
            return (self.collection_cache.get(pdh), sp[1] if len(sp) == 2 else None)
        else:
            return (None, path)

    def _match(self, collection, patternsegments, parent):
        """Recursively match glob pattern segments against a collection.

        :param collection: collection or subcollection to search; anything
            that is not a RichCollectionBase yields no matches.
        :param patternsegments: glob pattern already split on "/".
        :param parent: path prefix joined in front of every match.
        :returns: list of matching paths (unsorted).
        """
        if not patternsegments:
            return []

        if patternsegments[0] == '.':
            # Pattern contains something like "./foo" so just shift past
            # the "./".  This must happen once, outside the loop over the
            # collection entries; doing it per entry would report every
            # match once for each entry in the collection.
            return self._match(collection, patternsegments[1:], parent)

        if not isinstance(collection, arvados.collection.RichCollectionBase):
            return []

        ret = []
        # iterate over the files and subcollections in 'collection'
        for filename in collection:
            if fnmatch.fnmatch(filename, patternsegments[0]):
                cur = os.path.join(parent, filename)
                if len(patternsegments) == 1:
                    ret.append(cur)
                else:
                    ret.extend(self._match(collection[filename], patternsegments[1:], cur))
        return ret

    def glob(self, pattern):
        """Return the sorted list of paths matching ``pattern``.

        A pattern naming only a collection (no path inside it) matches
        itself.  Patterns that do not refer to a Keep collection are
        delegated to the base class, like the other methods of this class.
        """
        collection, rest = self.get_collection(pattern)
        if collection is None:
            # Not a keep: reference; there is no collection to search.
            return super(CollectionFsAccess, self).glob(pattern)
        if not rest:
            return [pattern]
        patternsegments = rest.split("/")
        return sorted(self._match(collection, patternsegments, "keep:" + collection.manifest_locator()))

    def open(self, fn, mode):
        """Open ``fn`` from its collection, or from the local filesystem."""
        collection, rest = self.get_collection(fn)
        if collection is not None:
            return collection.open(rest, mode)
        else:
            return super(CollectionFsAccess, self).open(self._abs(fn), mode)

    def exists(self, fn):
        """Return whether ``fn`` exists.

        An HTTP 404 raised while looking up the collection means "does not
        exist"; any other HttpError propagates to the caller.
        """
        try:
            collection, rest = self.get_collection(fn)
        except HttpError as err:
            if err.resp.status == 404:
                return False
            else:
                raise
        if collection is not None:
            if rest:
                return collection.exists(rest)
            else:
                # The collection itself was found.
                return True
        else:
            return super(CollectionFsAccess, self).exists(fn)

    def isfile(self, fn):  # type: (unicode) -> bool
        """Return whether ``fn`` is a file; a bare collection is not a file."""
        collection, rest = self.get_collection(fn)
        if collection is not None:
            if rest:
                return isinstance(collection.find(rest), arvados.arvfile.ArvadosFile)
            else:
                return False
        else:
            return super(CollectionFsAccess, self).isfile(fn)

    def isdir(self, fn):  # type: (unicode) -> bool
        """Return whether ``fn`` is a directory; a bare collection is one."""
        collection, rest = self.get_collection(fn)
        if collection is not None:
            if rest:
                return isinstance(collection.find(rest), arvados.collection.RichCollectionBase)
            else:
                return True
        else:
            return super(CollectionFsAccess, self).isdir(fn)

    def listdir(self, fn):  # type: (unicode) -> List[unicode]
        """List the entries of directory ``fn``.

        Each entry name is passed through cwltool's ``abspath`` together
        with ``fn``.

        :raises IOError: (ENOENT) when the path is missing from the
            collection or is not a directory.
        """
        collection, rest = self.get_collection(fn)
        if collection is not None:
            if rest:
                target = collection.find(rest)
            else:
                target = collection
            if target is None:
                raise IOError(errno.ENOENT, "Directory '%s' in '%s' not found" % (rest, collection.portable_data_hash()))
            if not isinstance(target, arvados.collection.RichCollectionBase):
                raise IOError(errno.ENOENT, "Path '%s' in '%s' is not a Directory" % (rest, collection.portable_data_hash()))
            return [abspath(l, fn) for l in target.keys()]
        else:
            return super(CollectionFsAccess, self).listdir(fn)

    def join(self, path, *paths): # type: (unicode, *unicode) -> unicode
        """Join path components.

        When the last component is itself a ``keep:`` reference with a
        valid locator it is returned as is; otherwise this is os.path.join.
        """
        if paths and paths[-1].startswith("keep:") and arvados.util.keep_locator_pattern.match(paths[-1][5:]):
            return paths[-1]
        return os.path.join(path, *paths)

    def realpath(self, path):
        """Resolve ``path`` to a real path.

        Paths starting with the ``$(task.tmpdir)`` / ``$(task.outdir)``
        placeholders and paths inside Keep collections are returned
        unchanged; everything else goes through os.path.realpath.
        """
        if path.startswith("$(task.tmpdir)") or path.startswith("$(task.outdir)"):
            return path
        collection, rest = self.get_collection(path)
        if collection is not None:
            return path
        else:
            return os.path.realpath(path)
190
class CollectionFetcher(DefaultFetcher):
    """schema_salad fetcher that understands ``keep:`` and ``arvwf:`` URLs.

    ``keep:`` URLs are read through ``fs_access``; ``arvwf:`` URLs are
    resolved to workflow records through ``api_client``.  All other URLs
    are handled by DefaultFetcher.
    """

    def __init__(self, cache, session, api_client=None, fs_access=None, num_retries=4):
        """Store the API client, filesystem accessor and retry count.

        ``cache`` and ``session`` are forwarded to DefaultFetcher.
        """
        super(CollectionFetcher, self).__init__(cache, session)
        self.api_client = api_client
        self.fsaccess = fs_access
        self.num_retries = num_retries

    def fetch_text(self, url):
        """Return the text content behind ``url``.

        For ``arvwf:`` URLs the result is the workflow record's
        ``definition`` with a ``label:`` line appended that carries the
        record's ``name`` (double quotes escaped).
        """
        if url.startswith("keep:"):
            with self.fsaccess.open(url, "r") as f:
                return f.read()
        if url.startswith("arvwf:"):
            record = self.api_client.workflows().get(uuid=url[6:]).execute(num_retries=self.num_retries)
            definition = record["definition"] + ('\nlabel: "%s"\n' % record["name"].replace('"', '\\"'))
            return definition
        return super(CollectionFetcher, self).fetch_text(url)

    def check_exists(self, url):
        """Return whether ``url`` can be fetched.

        Unexpected errors are logged and reported as "does not exist"
        rather than propagated.
        """
        try:
            if url.startswith("http://arvados.org/cwl"):
                return True
            if url.startswith("keep:"):
                return self.fsaccess.exists(url)
            if url.startswith("arvwf:"):
                if self.fetch_text(url):
                    return True
        except arvados.errors.NotFoundError:
            return False
        except Exception:
            # Deliberately broad best-effort handling, but not a bare
            # "except:": KeyboardInterrupt and SystemExit must still
            # propagate instead of being reported as a missing file.
            logger.exception("Got unexpected exception checking if file exists:")
            return False
        return super(CollectionFetcher, self).check_exists(url)

    def urljoin(self, base_url, url):
        """Resolve ``url`` relative to ``base_url``.

        An empty ``url`` yields ``base_url``; a ``url`` with its own scheme
        (or an empty ``base_url``) is returned unchanged.  For ``keep:`` and
        ``arvwf:`` bases the first path segment of the base is always kept;
        a relative ``url`` replaces the last segment of the base path, and
        a ``url`` starting with "/" replaces everything after that first
        segment.  Other schemes are handled by DefaultFetcher.

        :raises IOError: (EINVAL) when the base has no path, or when a
            ``keep:`` base does not start with a valid Keep locator.
        """
        if not url:
            return base_url

        urlsp = urlparse.urlsplit(url)
        if urlsp.scheme or not base_url:
            return url

        basesp = urlparse.urlsplit(base_url)
        if basesp.scheme in ("keep", "arvwf"):
            if not basesp.path:
                raise IOError(errno.EINVAL, "Invalid Keep locator", base_url)

            baseparts = basesp.path.split("/")
            urlparts = urlsp.path.split("/") if urlsp.path else []

            # First segment is the locator (keep:) or uuid (arvwf:).
            pdh = baseparts.pop(0)

            if basesp.scheme == "keep" and not arvados.util.keep_locator_pattern.match(pdh):
                raise IOError(errno.EINVAL, "Invalid Keep locator", base_url)

            if urlsp.path.startswith("/"):
                # Absolute reference: discard the base path and the empty
                # segment produced by the leading "/".
                baseparts = []
                urlparts.pop(0)

            if baseparts and urlsp.path:
                # Relative reference: drop the base's final segment.
                baseparts.pop()

            path = "/".join([pdh] + baseparts + urlparts)
            return urlparse.urlunsplit((basesp.scheme, "", path, "", urlsp.fragment))

        return super(CollectionFetcher, self).urljoin(base_url, url)
256
# Arvados object uuids have the form <5 chars>-<type infix>-<15 chars>;
# these two patterns recognise the "7fd4e" and "p5p6p" infixes, which
# collectionResolver() below treats as workflows and pipeline templates.
workflow_uuid_pattern = re.compile(r'[a-z0-9]{5}-7fd4e-[a-z0-9]{15}')
pipeline_template_uuid_pattern = re.compile(r'[a-z0-9]{5}-p5p6p-[a-z0-9]{15}')

def collectionResolver(api_client, document_loader, uri, num_retries=4):
    """Resolve an Arvados-style reference to a fetchable URI.

    Checked in this order:

    * ``keep:`` / ``arvwf:`` URIs are returned unchanged.
    * A workflow uuid becomes ``arvwf:<uuid>#main``.
    * A pipeline template uuid becomes ``keep:`` plus the ``cwl:tool``
      script parameter of the template's first component.
    * A path starting with a Keep locator gets a ``keep:`` prefix.
    * A path starting with a collection uuid has the uuid replaced by the
      collection's portable data hash, with a ``keep:`` prefix.
    * Anything else is handed to cwltool's default ``tool_resolver``.

    :param api_client: Arvados API client used for the uuid lookups.
    :param document_loader: forwarded to ``tool_resolver``.
    :param uri: the reference to resolve.
    :param num_retries: retry count for the API requests made here.
    """
    if uri.startswith("keep:") or uri.startswith("arvwf:"):
        return uri

    if workflow_uuid_pattern.match(uri):
        return "arvwf:%s#main" % (uri)

    if pipeline_template_uuid_pattern.match(uri):
        pt = api_client.pipeline_templates().get(uuid=uri).execute(num_retries=num_retries)
        # list(...) because dict.values() is not indexable on Python 3.
        first_component = list(pt["components"].values())[0]
        return "keep:" + first_component["script_parameters"]["cwl:tool"]

    p = uri.split("/")
    if arvados.util.keep_locator_pattern.match(p[0]):
        return "keep:%s" % (uri)

    if arvados.util.collection_uuid_pattern.match(p[0]):
        # Honour num_retries here too, like the pipeline template lookup.
        collection = api_client.collections().get(uuid=p[0]).execute(num_retries=num_retries)
        return "keep:%s%s" % (collection["portable_data_hash"],
                              uri[len(p[0]):])

    return cwltool.resolver.tool_resolver(document_loader, uri)