Merge branch '18947-githttpd'
[arvados.git] / sdk / cwl / arvados_cwl / pathmapper.py
index 37ca2515e4e141ad090fddff2a27c8c180708cca..64fdfa0d04032e97235dc581144d9cb74494c597 100644 (file)
@@ -21,7 +21,9 @@ import arvados.collection
 from schema_salad.sourceline import SourceLine
 
 from arvados.errors import ApiError
-from cwltool.pathmapper import PathMapper, MapperEnt, abspath, adjustFileObjs, adjustDirObjs
+from cwltool.pathmapper import PathMapper, MapperEnt
+from cwltool.utils import adjustFileObjs, adjustDirObjs
+from cwltool.stdfsaccess import abspath
 from cwltool.workflow import WorkflowException
 
 from .http import http_to_keep
@@ -42,15 +44,16 @@ def trim_listing(obj):
     if obj.get("location", "").startswith("keep:") and "listing" in obj:
         del obj["listing"]
 
+collection_pdh_path = re.compile(r'^keep:[0-9a-f]{32}\+\d+/.+$')
+collection_pdh_pattern = re.compile(r'^keep:([0-9a-f]{32}\+\d+)(/.*)?')
+collection_uuid_pattern = re.compile(r'^keep:([a-z0-9]{5}-4zz18-[a-z0-9]{15})(/.*)?$')
 
 class ArvPathMapper(PathMapper):
     """Convert container-local paths to and from Keep collection ids."""
 
-    pdh_path = re.compile(r'^keep:[0-9a-f]{32}\+\d+/.+$')
-    pdh_dirpath = re.compile(r'^keep:[0-9a-f]{32}\+\d+(/.*)?$')
-
     def __init__(self, arvrunner, referenced_files, input_basedir,
-                 collection_pattern, file_pattern, name=None, single_collection=False):
+                 collection_pattern, file_pattern, name=None, single_collection=False,
+                 optional_deps=None):
         self.arvrunner = arvrunner
         self.input_basedir = input_basedir
         self.collection_pattern = collection_pattern
@@ -59,6 +62,7 @@ class ArvPathMapper(PathMapper):
         self.referenced_files = [r["location"] for r in referenced_files]
         self.single_collection = single_collection
         self.pdh_to_uuid = {}
+        self.optional_deps = optional_deps or []
         super(ArvPathMapper, self).__init__(referenced_files, input_basedir, None)
 
     def visit(self, srcobj, uploadfiles):
@@ -66,13 +70,18 @@ class ArvPathMapper(PathMapper):
         if "#" in src:
             src = src[:src.index("#")]
 
-        if isinstance(src, basestring) and ArvPathMapper.pdh_dirpath.match(src):
-            self._pathmap[src] = MapperEnt(src, self.collection_pattern % urllib.parse.unquote(src[5:]), srcobj["class"], True)
-            if arvados_cwl.util.collectionUUID in srcobj:
-                self.pdh_to_uuid[src.split("/", 1)[0][5:]] = srcobj[arvados_cwl.util.collectionUUID]
-
         debug = logger.isEnabledFor(logging.DEBUG)
 
+        if isinstance(src, basestring) and src.startswith("keep:"):
+            if collection_pdh_pattern.match(src):
+                self._pathmap[src] = MapperEnt(src, self.collection_pattern % urllib.parse.unquote(src[5:]), srcobj["class"], True)
+
+                if arvados_cwl.util.collectionUUID in srcobj:
+                    self.pdh_to_uuid[src.split("/", 1)[0][5:]] = srcobj[arvados_cwl.util.collectionUUID]
+            elif not collection_uuid_pattern.match(src):
+                with SourceLine(srcobj, "location", WorkflowException, debug):
+                    raise WorkflowException("Invalid keep reference '%s'" % src)
+
         if src not in self._pathmap:
             if src.startswith("file:"):
                 # Local FS ref, may need to be uploaded or may be on keep
@@ -95,9 +104,12 @@ class ArvPathMapper(PathMapper):
                 if srcobj["class"] == "Directory" and "listing" not in srcobj:
                     raise WorkflowException("Directory literal '%s' is missing `listing`" % src)
             elif src.startswith("http:") or src.startswith("https:"):
-                keepref = http_to_keep(self.arvrunner.api, self.arvrunner.project_uuid, src)
-                logger.info("%s is %s", src, keepref)
-                self._pathmap[src] = MapperEnt(keepref, keepref, srcobj["class"], True)
+                try:
+                    keepref = http_to_keep(self.arvrunner.api, self.arvrunner.project_uuid, src)
+                    logger.info("%s is %s", src, keepref)
+                    self._pathmap[src] = MapperEnt(keepref, keepref, srcobj["class"], True)
+                except Exception as e:
+                    logger.warning(str(e))
             else:
                 self._pathmap[src] = MapperEnt(src, src, srcobj["class"], True)
 
@@ -126,6 +138,9 @@ class ArvPathMapper(PathMapper):
                 f.write(obj["contents"])
             remap.append((obj["location"], path + "/" + obj["basename"]))
         else:
+            for opt in self.optional_deps:
+                if obj["location"] == opt["location"]:
+                    return
             raise SourceLine(obj, "location", WorkflowException).makeError("Don't know what to do with '%s'" % obj["location"])
 
     def needs_new_collection(self, srcobj, prefix=""):
@@ -140,22 +155,33 @@ class ArvPathMapper(PathMapper):
         loc = srcobj["location"]
         if loc.startswith("_:"):
             return True
-        if prefix:
-            if loc != prefix+srcobj["basename"]:
-                return True
+
+        i = loc.rfind("/")
+        if i > -1:
+            loc_prefix = loc[:i+1]
+            if not prefix:
+                prefix = loc_prefix
+            # quote/unquote to ensure consistent quoting
+            suffix = urllib.parse.quote(urllib.parse.unquote(loc[i+1:]), "/+@")
         else:
-            i = loc.rfind("/")
-            if i > -1:
-                prefix = loc[:i+1]
-            else:
-                prefix = loc+"/"
+            # no '/' found
+            loc_prefix = loc+"/"
+            prefix = loc+"/"
+            suffix = ""
+
+        if prefix != loc_prefix:
+            return True
+
+        if "basename" in srcobj and suffix != urllib.parse.quote(srcobj["basename"], "/+@"):
+            return True
+
         if srcobj["class"] == "File" and loc not in self._pathmap:
             return True
         for s in srcobj.get("secondaryFiles", []):
             if self.needs_new_collection(s, prefix):
                 return True
         if srcobj.get("listing"):
-            prefix = "%s%s/" % (prefix, srcobj["basename"])
+            prefix = "%s%s/" % (prefix, urllib.parse.quote(srcobj.get("basename", suffix), "/+@"))
             for l in srcobj["listing"]:
                 if self.needs_new_collection(l, prefix):
                     return True
@@ -185,7 +211,7 @@ class ArvPathMapper(PathMapper):
                                          packed=False)
 
         for src, ab, st in uploadfiles:
-            self._pathmap[src] = MapperEnt(urllib.parse.quote(st.fn, "/:+@"), self.collection_pattern % st.fn[5:],
+            self._pathmap[src] = MapperEnt(urllib.parse.quote(st.fn, "/:+@"), urllib.parse.quote(self.collection_pattern % st.fn[5:], "/:+@"),
                                            "Directory" if os.path.isdir(ab) else "File", True)
 
         for srcobj in referenced_files:
@@ -208,19 +234,10 @@ class ArvPathMapper(PathMapper):
 
                 ab = self.collection_pattern % c.portable_data_hash()
                 self._pathmap[srcobj["location"]] = MapperEnt("keep:"+c.portable_data_hash(), ab, "Directory", True)
-            elif srcobj["class"] == "File" and (srcobj.get("secondaryFiles") or
-                (srcobj["location"].startswith("_:") and "contents" in srcobj)):
-
-                # If all secondary files/directories are located in
-                # the same collection as the primary file and the
-                # paths and names that are consistent with staging,
-                # don't create a new collection.
-                if not self.needs_new_collection(srcobj):
-                    continue
-
+            elif srcobj["class"] == "File" and self.needs_new_collection(srcobj):
                 c = arvados.collection.Collection(api_client=self.arvrunner.api,
                                                   keep_client=self.arvrunner.keep_client,
-                                                  num_retries=self.arvrunner.num_retries                                                  )
+                                                  num_retries=self.arvrunner.num_retries)
                 self.addentry(srcobj, c, ".", remap)
 
                 container = arvados_cwl.util.get_current_container(self.arvrunner.api, self.arvrunner.num_retries, logger)
@@ -265,6 +282,13 @@ class ArvPathMapper(PathMapper):
 
 
 class StagingPathMapper(PathMapper):
+    # Note that StagingPathMapper internally maps files from target to source.
+    # Specifically, the 'self._pathmap' dict keys are the target location and the
+    # values are 'MapperEnt' named tuples from which we use the 'resolved' attribute
+    # as the file identifier. This makes it possible to map an input file to multiple
+    # target directories. The exception is for file literals, which store the contents of
+    # the file in 'MapperEnt.resolved' and are therefore still mapped from source to target.
+
     _follow_dirs = True
 
     def __init__(self, referenced_files, basedir, stagedir, separateDirs=True):
@@ -274,10 +298,17 @@ class StagingPathMapper(PathMapper):
     def visit(self, obj, stagedir, basedir, copy=False, staged=False):
         # type: (Dict[unicode, Any], unicode, unicode, bool) -> None
         loc = obj["location"]
+        stagedir = obj.get("dirname") or stagedir
         tgt = os.path.join(stagedir, obj["basename"])
         basetgt, baseext = os.path.splitext(tgt)
+
+        def targetExists():
+            return tgt in self.targets and ("contents" not in obj) and (self._pathmap[tgt].resolved != loc)
+        def literalTargetExists():
+            return tgt in self.targets and "contents" in obj
+
         n = 1
-        if tgt in self.targets and (self.reversemap(tgt)[0] != loc):
+        if targetExists() or literalTargetExists():
             while tgt in self.targets:
                 n += 1
                 tgt = "%s_%i%s" % (basetgt, n, baseext)
@@ -293,7 +324,7 @@ class StagingPathMapper(PathMapper):
             if tgt in self._pathmap:
                 return
             if "contents" in obj and loc.startswith("_:"):
-                self._pathmap[tgt] = MapperEnt(obj["contents"], tgt, "CreateFile", staged)
+                self._pathmap[loc] = MapperEnt(obj["contents"], tgt, "CreateFile", staged)
             else:
                 if copy or obj.get("writable"):
                     self._pathmap[tgt] = MapperEnt(loc, tgt, "WritableFile", staged)
@@ -301,16 +332,19 @@ class StagingPathMapper(PathMapper):
                     self._pathmap[tgt] = MapperEnt(loc, tgt, "File", staged)
                 self.visitlisting(obj.get("secondaryFiles", []), stagedir, basedir)
 
-    def mapper(self, src):  # type: (Text) -> MapperEnt
+    def mapper(self, src):  # type: (Text) -> MapperEnt
+        # Overridden to keep mapping by source (identifier) to target regardless of how the map is
+        # structured internally ("CreateFile" entries are matched by key). Returns None if unmapped.
+        def getMapperEnt(src):
+            for k,v in viewitems(self._pathmap):
+                if (v.type != "CreateFile" and v.resolved == src) or (v.type == "CreateFile" and k == src):
+                    return v
+
         if u"#" in src:
-            i = src.index(u"#")         
-            for k,v in self._pathmap.items():
-                if v.resolved == src[:i]:
-                    return MapperEnt(v.resolved, v.target + src[i:], v.type, v.staged)
-    
-        for k,v in self._pathmap.items():
-            if v.resolved == src:
-                return self._pathmap[k]
+            i = src.index(u"#")
+            v = getMapperEnt(src[:i])  # look up the part before '#'; the fragment is re-appended below
+            return MapperEnt(v.resolved, v.target + src[i:], v.type, v.staged) if v is not None else None
+        return getMapperEnt(src)
 
 
 class VwdPathMapper(StagingPathMapper):