sdk/cwl/arvados_cwl/fsaccess.py

   1 import fnmatch
   2 import os
   3 import errno
   4 import urlparse
   5 import re
   6 import logging
   7
   8 import ruamel.yaml as yaml
   9
  10 import cwltool.stdfsaccess
  11 from cwltool.pathmapper import abspath
  12 import cwltool.resolver
  13
  14 import arvados.util
  15 import arvados.collection
  16 import arvados.arvfile
  17 import arvados.errors
  18
  19 from schema_salad.ref_resolver import DefaultFetcher
  20
  21 logger = logging.getLogger('arvados.cwl-runner')
  22
  23 class CollectionCache(object):
  24     def __init__(self, api_client, keep_client, num_retries):
  25         self.api_client = api_client
  26         self.keep_client = keep_client
  27         self.collections = {}
  28
  29     def get(self, pdh):
  30         if pdh not in self.collections:
  31             logger.debug("Creating collection reader for %s", pdh)
  32             self.collections[pdh] = arvados.collection.CollectionReader(pdh, api_client=self.api_client,
  33                                                                         keep_client=self.keep_client)
  34         return self.collections[pdh]
  35
  36
  37 class CollectionFsAccess(cwltool.stdfsaccess.StdFsAccess):
  38     """Implement the cwltool FsAccess interface for Arvados Collections."""
  39
  40     def __init__(self, basedir, collection_cache=None):
  41         super(CollectionFsAccess, self).__init__(basedir)
  42         self.collection_cache = collection_cache
  43
  44     def get_collection(self, path):
  45         sp = path.split("/", 1)
  46         p = sp[0]
  47         if p.startswith("keep:") and arvados.util.keep_locator_pattern.match(p[5:]):
  48             pdh = p[5:]
  49             return (self.collection_cache.get(pdh), sp[1] if len(sp) == 2 else None)
  50         else:
  51             return (None, path)
  52
  53     def _match(self, collection, patternsegments, parent):
  54         if not patternsegments:
  55             return []
  56
  57         if not isinstance(collection, arvados.collection.RichCollectionBase):
  58             return []
  59
  60         ret = []
  61         # iterate over the files and subcollections in 'collection'
  62         for filename in collection:
  63             if patternsegments[0] == '.':
  64                 # Pattern contains something like "./foo" so just shift
  65                 # past the "./"
  66                 ret.extend(self._match(collection, patternsegments[1:], parent))
  67             elif fnmatch.fnmatch(filename, patternsegments[0]):
  68                 cur = os.path.join(parent, filename)
  69                 if len(patternsegments) == 1:
  70                     ret.append(cur)
  71                 else:
  72                     ret.extend(self._match(collection[filename], patternsegments[1:], cur))
  73         return ret
  74
  75     def glob(self, pattern):
  76         collection, rest = self.get_collection(pattern)
  77         if collection and not rest:
  78             return [pattern]
  79         patternsegments = rest.split("/")
  80         return self._match(collection, patternsegments, "keep:" + collection.manifest_locator())
  81
  82     def open(self, fn, mode):
  83         collection, rest = self.get_collection(fn)
  84         if collection:
  85             return collection.open(rest, mode)
  86         else:
  87             return super(CollectionFsAccess, self).open(self._abs(fn), mode)
  88
  89     def exists(self, fn):
  90         collection, rest = self.get_collection(fn)
  91         if collection:
  92             if rest:
  93                 return collection.exists(rest)
  94             else:
  95                 return True
  96         else:
  97             return super(CollectionFsAccess, self).exists(fn)
  98
  99     def isfile(self, fn):  # type: (unicode) -> bool
 100         collection, rest = self.get_collection(fn)
 101         if collection:
 102             if rest:
 103                 return isinstance(collection.find(rest), arvados.arvfile.ArvadosFile)
 104             else:
 105                 return False
 106         else:
 107             return super(CollectionFsAccess, self).isfile(fn)
 108
 109     def isdir(self, fn):  # type: (unicode) -> bool
 110         collection, rest = self.get_collection(fn)
 111         if collection:
 112             if rest:
 113                 return isinstance(collection.find(rest), arvados.collection.RichCollectionBase)
 114             else:
 115                 return True
 116         else:
 117             return super(CollectionFsAccess, self).isdir(fn)
 118
 119     def listdir(self, fn):  # type: (unicode) -> List[unicode]
 120         collection, rest = self.get_collection(fn)
 121         if collection:
 122             if rest:
 123                 dir = collection.find(rest)
 124             else:
 125                 dir = collection
 126             if dir is None:
 127                 raise IOError(errno.ENOENT, "Directory '%s' in '%s' not found" % (rest, collection.portable_data_hash()))
 128             if not isinstance(dir, arvados.collection.RichCollectionBase):
 129                 raise IOError(errno.ENOENT, "Path '%s' in '%s' is not a Directory" % (rest, collection.portable_data_hash()))
 130             return [abspath(l, fn) for l in dir.keys()]
 131         else:
 132             return super(CollectionFsAccess, self).listdir(fn)
 133
 134     def join(self, path, *paths): # type: (unicode, *unicode) -> unicode
 135         if paths and paths[-1].startswith("keep:") and arvados.util.keep_locator_pattern.match(paths[-1][5:]):
 136             return paths[-1]
 137         return os.path.join(path, *paths)
 138
 139     def realpath(self, path):
 140         if path.startswith("$(task.tmpdir)") or path.startswith("$(task.outdir)"):
 141             return path
 142         collection, rest = self.get_collection(path)
 143         if collection:
 144             return path
 145         else:
 146             return os.path.realpath(path)
 147
 148 class CollectionFetcher(DefaultFetcher):
 149     def __init__(self, cache, session, api_client=None, fs_access=None, num_retries=4):
 150         super(CollectionFetcher, self).__init__(cache, session)
 151         self.api_client = api_client
 152         self.fsaccess = fs_access
 153         self.num_retries = num_retries
 154
 155     def fetch_text(self, url):
 156         if url.startswith("keep:"):
 157             with self.fsaccess.open(url, "r") as f:
 158                 return f.read()
 159         if url.startswith("arvwf:"):
 160             record = self.api_client.workflows().get(uuid=url[6:]).execute(num_retries=self.num_retries)
 161             definition = record["definition"] + ('\nlabel: "%s"\n' % record["name"].replace('"', '\\"'))
 162             return definition
 163         return super(CollectionFetcher, self).fetch_text(url)
 164
 165     def check_exists(self, url):
 166         try:
 167             if url.startswith("http://arvados.org/cwl"):
 168                 return True
 169             if url.startswith("keep:"):
 170                 return self.fsaccess.exists(url)
 171             if url.startswith("arvwf:"):
 172                 if self.fetch_text(url):
 173                     return True
 174         except arvados.errors.NotFoundError:
 175             return False
 176         except:
 177             logger.exception("Got unexpected exception checking if file exists:")
 178             return False
 179         return super(CollectionFetcher, self).check_exists(url)
 180
 181     def urljoin(self, base_url, url):
 182         if not url:
 183             return base_url
 184
 185         urlsp = urlparse.urlsplit(url)
 186         if urlsp.scheme or not base_url:
 187             return url
 188
 189         basesp = urlparse.urlsplit(base_url)
 190         if basesp.scheme in ("keep", "arvwf"):
 191             if not basesp.path:
 192                 raise IOError(errno.EINVAL, "Invalid Keep locator", base_url)
 193
 194             baseparts = basesp.path.split("/")
 195             urlparts = urlsp.path.split("/") if urlsp.path else []
 196
 197             pdh = baseparts.pop(0)
 198
 199             if basesp.scheme == "keep" and not arvados.util.keep_locator_pattern.match(pdh):
 200                 raise IOError(errno.EINVAL, "Invalid Keep locator", base_url)
 201
 202             if urlsp.path.startswith("/"):
 203                 baseparts = []
 204                 urlparts.pop(0)
 205
 206             if baseparts and urlsp.path:
 207                 baseparts.pop()
 208
 209             path = "/".join([pdh] + baseparts + urlparts)
 210             return urlparse.urlunsplit((basesp.scheme, "", path, "", urlsp.fragment))
 211
 212         return super(CollectionFetcher, self).urljoin(base_url, url)
 213
 214 workflow_uuid_pattern = re.compile(r'[a-z0-9]{5}-7fd4e-[a-z0-9]{15}')
 215 pipeline_template_uuid_pattern = re.compile(r'[a-z0-9]{5}-p5p6p-[a-z0-9]{15}')
 216
 217 def collectionResolver(api_client, document_loader, uri, num_retries=4):
 218     if workflow_uuid_pattern.match(uri):
 219         return "arvwf:%s#main" % (uri)
 220
 221     if pipeline_template_uuid_pattern.match(uri):
 222         pt = api_client.pipeline_templates().get(uuid=uri).execute(num_retries=num_retries)
 223         return "keep:" + pt["components"].values()[0]["script_parameters"]["cwl:tool"]
 224
 225     p = uri.split("/")
 226     if arvados.util.keep_locator_pattern.match(p[0]):
 227         return "keep:%s" % (uri)
 228
 229     if arvados.util.collection_uuid_pattern.match(p[0]):
 230         return "keep:%s%s" % (api_client.collections().
 231                               get(uuid=p[0]).execute()["portable_data_hash"],
 232                               uri[len(p[0]):])
 233
 234     return cwltool.resolver.tool_resolver(document_loader, uri)