11549: Fix container requests so it doesn't mount each file individually,
authorPeter Amstutz <peter.amstutz@curoverse.com>
Thu, 27 Apr 2017 18:24:43 +0000 (14:24 -0400)
committerPeter Amstutz <peter.amstutz@curoverse.com>
Thu, 27 Apr 2017 18:24:43 +0000 (14:24 -0400)
instead mount common collections/subdirectories.

sdk/cwl/arvados_cwl/arvcontainer.py
sdk/cwl/arvados_cwl/arvjob.py
sdk/cwl/arvados_cwl/arvworkflow.py
sdk/cwl/arvados_cwl/pathmapper.py
sdk/cwl/arvados_cwl/runner.py
sdk/cwl/tests/test_container.py

index 9e76cf711ecaaf81c6e5e42131ea3d717da00c01..657d5927d0328025eb948f6e8ee62ba215486769 100644 (file)
@@ -7,16 +7,16 @@ import ruamel.yaml as yaml
 
 from cwltool.errors import WorkflowException
 from cwltool.process import get_feature, UnsupportedRequirement, shortname
-from cwltool.pathmapper import adjustFiles, adjustDirObjs
+from cwltool.pathmapper import adjustFileObjs, adjustDirObjs
 from cwltool.utils import aslist
 
 import arvados.collection
 
 from .arvdocker import arv_docker_get_image
 from . import done
-from .runner import Runner, arvados_jobs_image, packed_workflow, trim_listing
+from .runner import Runner, arvados_jobs_image, packed_workflow, trim_anonymous_location
 from .fsaccess import CollectionFetcher
-from .pathmapper import NoFollowPathMapper
+from .pathmapper import NoFollowPathMapper, trim_listing
 from .perf import Perf
 
 logger = logging.getLogger('arvados.cwl-runner')
@@ -48,37 +48,39 @@ class ArvadosContainer(object):
         mounts = {
             self.outdir: {
                 "kind": "tmp"
+            },
+            self.tmpdir: {
+                "kind": "tmp"
             }
         }
         scheduling_parameters = {}
 
-        dirs = set()
-        for f in self.pathmapper.files():
-            pdh, p, tp, stg = self.pathmapper.mapper(f)
-            if tp == "Directory" and '/' not in pdh:
-                mounts[p] = {
-                    "kind": "collection",
-                    "portable_data_hash": pdh[5:]
-                }
-                dirs.add(pdh)
-
-        for f in self.pathmapper.files():
-            res, p, tp, stg = self.pathmapper.mapper(f)
-            if res.startswith("keep:"):
-                res = res[5:]
-            elif res.startswith("/keep/"):
-                res = res[6:]
-            else:
+        rf = [self.pathmapper.mapper(f) for f in self.pathmapper.referenced_files]
+        rf.sort(key=lambda k: k.resolved)
+        prevdir = None
+        for resolved, target, tp, stg in rf:
+            if not stg:
                 continue
-            sp = res.split("/", 1)
-            pdh = sp[0]
-            if pdh not in dirs:
-                mounts[p] = {
-                    "kind": "collection",
-                    "portable_data_hash": pdh
-                }
-                if len(sp) == 2:
-                    mounts[p]["path"] = urllib.unquote(sp[1])
+            if prevdir and target.startswith(prevdir):
+                continue
+            if tp == "Directory":
+                targetdir = target
+            else:
+                targetdir = os.path.dirname(target)
+            sp = resolved.split("/", 1)
+            pdh = sp[0][5:]   # remove "keep:"
+            mounts[targetdir] = {
+                "kind": "collection",
+                "portable_data_hash": pdh
+            }
+            if len(sp) == 2:
+                if tp == "Directory":
+                    path = sp[1]
+                else:
+                    path = os.path.dirname(sp[1])
+                if path and path != "/":
+                    mounts[targetdir]["path"] = path
+            prevdir = targetdir + "/"
 
         with Perf(metrics, "generatefiles %s" % self.name):
             if self.generatefiles["listing"]:
@@ -238,6 +240,8 @@ class RunnerContainer(Runner):
         """
 
         adjustDirObjs(self.job_order, trim_listing)
+        adjustFileObjs(self.job_order, trim_anonymous_location)
+        adjustDirObjs(self.job_order, trim_anonymous_location)
 
         container_req = {
             "owner_uuid": self.arvrunner.project_uuid,
index 04a62953ba24be6f793e9061a29e3cdb2ddccec7..e86f5055c54c76a46f74ca47438a846a56790664 100644 (file)
@@ -9,7 +9,7 @@ from cwltool.errors import WorkflowException
 from cwltool.draft2tool import revmap_file, CommandLineTool
 from cwltool.load_tool import fetch_document
 from cwltool.builder import Builder
-from cwltool.pathmapper import adjustDirObjs
+from cwltool.pathmapper import adjustFileObjs, adjustDirObjs
 
 from schema_salad.sourceline import SourceLine
 
@@ -18,8 +18,8 @@ import ruamel.yaml as yaml
 import arvados.collection
 
 from .arvdocker import arv_docker_get_image
-from .runner import Runner, arvados_jobs_image, packed_workflow, trim_listing, upload_workflow_collection
-from .pathmapper import VwdPathMapper
+from .runner import Runner, arvados_jobs_image, packed_workflow, upload_workflow_collection, trim_anonymous_location
+from .pathmapper import VwdPathMapper, trim_listing
 from .perf import Perf
 from . import done
 from ._version import __version__
@@ -254,6 +254,8 @@ class RunnerJob(Runner):
             self.job_order["cwl:tool"] = "%s/workflow.cwl#main" % wf_pdh
 
         adjustDirObjs(self.job_order, trim_listing)
+        adjustFileObjs(self.job_order, trim_anonymous_location)
+        adjustDirObjs(self.job_order, trim_anonymous_location)
 
         if self.output_name:
             self.job_order["arv:output_name"] = self.output_name
index 4db1f4f2f4d8f48739bd7ef525e9526bdd2c3b4c..22f662e00a1a02579543d1ead9542bf3e74997f7 100644 (file)
@@ -13,7 +13,8 @@ from cwltool.pathmapper import adjustFileObjs, adjustDirObjs
 
 import ruamel.yaml as yaml
 
-from .runner import upload_dependencies, trim_listing, packed_workflow, upload_workflow_collection
+from .runner import upload_dependencies, packed_workflow, upload_workflow_collection, trim_anonymous_location
+from .pathmapper import trim_listing
 from .arvtool import ArvadosCommandTool
 from .perf import Perf
 
@@ -26,6 +27,8 @@ def upload_workflow(arvRunner, tool, job_order, project_uuid, uuid=None,
     packed = packed_workflow(arvRunner, tool)
 
     adjustDirObjs(job_order, trim_listing)
+    adjustFileObjs(job_order, trim_anonymous_location)
+    adjustDirObjs(job_order, trim_anonymous_location)
 
     main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
     for inp in main["inputs"]:
index cddb4088b7bfcbbdb211d6785ba650e8ea36901e..5e2ee46a87bfd2494debfc63ce04273584a8dc64 100644 (file)
@@ -14,6 +14,21 @@ from cwltool.workflow import WorkflowException
 
 logger = logging.getLogger('arvados.cwl-runner')
 
+def trim_listing(obj):
+    """Remove 'listing' field from Directory objects that are keep references.
+
+    When Directory objects represent Keep references, it is redundant and
+    potentially very expensive to pass fully enumerated Directory objects
+    between instances of cwl-runner (e.g. a submitting a job, or using the
+    RunInSingleContainer feature), so delete the 'listing' field when it is
+    safe to do so.
+
+    """
+
+    if obj.get("location", "").startswith("keep:") and "listing" in obj:
+        del obj["listing"]
+
+
 class ArvPathMapper(PathMapper):
     """Convert container-local paths to and from Keep collection ids."""
 
@@ -27,6 +42,7 @@ class ArvPathMapper(PathMapper):
         self.collection_pattern = collection_pattern
         self.file_pattern = file_pattern
         self.name = name
+        self.referenced_files = [r["location"] for r in referenced_files]
         super(ArvPathMapper, self).__init__(referenced_files, input_basedir, None)
 
     def visit(self, srcobj, uploadfiles):
index 57a672389c740e00b33a8e47d8f3eb419fed33f8..69e4f5bd7b628e89349822e45ae01a587320c605 100644 (file)
@@ -23,24 +23,22 @@ import arvados.collection
 import ruamel.yaml as yaml
 
 from .arvdocker import arv_docker_get_image
-from .pathmapper import ArvPathMapper
+from .pathmapper import ArvPathMapper, trim_listing
 from ._version import __version__
 from . import done
 
 logger = logging.getLogger('arvados.cwl-runner')
 
-def trim_listing(obj):
-    """Remove 'listing' field from Directory objects that are keep references.
+def trim_anonymous_location(obj):
+    """Remove 'location' field from File and Directory literals.
+
+    To make internal handling easier, literals are assigned a random id for
+    'location'.  However, when writing the record back out, this can break
+    reproducibility.  Since it is valid for literals not have a 'location'
+    field, remove it.
 
-    When Directory objects represent Keep references, it redundant and
-    potentially very expensive to pass fully enumerated Directory objects
-    between instances of cwl-runner (e.g. a submitting a job, or using the
-    RunInSingleContainer feature), so delete the 'listing' field when it is
-    safe to do so.
     """
 
-    if obj.get("location", "").startswith("keep:") and "listing" in obj:
-        del obj["listing"]
     if obj.get("location", "").startswith("_:"):
         del obj["location"]
 
index 7786e7f62548c567c62de13b018e88c29233dec0..b06eae8105aad06d35ece8611f3b7a5103ab838c 100644 (file)
@@ -63,6 +63,7 @@ class TestContainer(unittest.TestCase):
                         'use_existing': enable_reuse,
                         'priority': 1,
                         'mounts': {
+                            '/tmp': {'kind': 'tmp'},
                             '/var/spool/cwl': {'kind': 'tmp'}
                         },
                         'state': 'Committed',
@@ -135,6 +136,7 @@ class TestContainer(unittest.TestCase):
                 'use_existing': True,
                 'priority': 1,
                 'mounts': {
+                    '/tmp': {'kind': 'tmp'},
                     '/var/spool/cwl': {'kind': 'tmp'}
                 },
                 'state': 'Committed',
@@ -240,6 +242,7 @@ class TestContainer(unittest.TestCase):
                 'use_existing': True,
                 'priority': 1,
                 'mounts': {
+                    '/tmp': {'kind': 'tmp'},
                     '/var/spool/cwl': {'kind': 'tmp'},
                     '/var/spool/cwl/foo': {
                         'kind': 'collection',
@@ -325,6 +328,7 @@ class TestContainer(unittest.TestCase):
                     'use_existing': True,
                     'priority': 1,
                     'mounts': {
+                        '/tmp': {'kind': 'tmp'},
                         '/var/spool/cwl': {'kind': 'tmp'},
                         "stderr": {
                             "kind": "file",