From f5f5de0ae41e12738a380c422417d5a5e5af7f09 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Thu, 27 Apr 2017 14:24:43 -0400 Subject: [PATCH] 11549: Fix container requests so it doesn't mount each file individually, instead mount common collections/subdirectories. --- sdk/cwl/arvados_cwl/arvcontainer.py | 62 +++++++++++++++-------------- sdk/cwl/arvados_cwl/arvjob.py | 8 ++-- sdk/cwl/arvados_cwl/arvworkflow.py | 5 ++- sdk/cwl/arvados_cwl/pathmapper.py | 16 ++++++++ sdk/cwl/arvados_cwl/runner.py | 18 ++++----- sdk/cwl/tests/test_container.py | 4 ++ 6 files changed, 70 insertions(+), 43 deletions(-) diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py index 9e76cf711e..657d5927d0 100644 --- a/sdk/cwl/arvados_cwl/arvcontainer.py +++ b/sdk/cwl/arvados_cwl/arvcontainer.py @@ -7,16 +7,16 @@ import ruamel.yaml as yaml from cwltool.errors import WorkflowException from cwltool.process import get_feature, UnsupportedRequirement, shortname -from cwltool.pathmapper import adjustFiles, adjustDirObjs +from cwltool.pathmapper import adjustFileObjs, adjustDirObjs from cwltool.utils import aslist import arvados.collection from .arvdocker import arv_docker_get_image from . import done -from .runner import Runner, arvados_jobs_image, packed_workflow, trim_listing +from .runner import Runner, arvados_jobs_image, packed_workflow, trim_anonymous_location from .fsaccess import CollectionFetcher -from .pathmapper import NoFollowPathMapper +from .pathmapper import NoFollowPathMapper, trim_listing from .perf import Perf logger = logging.getLogger('arvados.cwl-runner') @@ -48,37 +48,39 @@ class ArvadosContainer(object): mounts = { self.outdir: { "kind": "tmp" + }, + self.tmpdir: { + "kind": "tmp" } } scheduling_parameters = {} - dirs = set() - for f in self.pathmapper.files(): - pdh, p, tp, stg = self.pathmapper.mapper(f) - if tp == "Directory" and '/' not in pdh: - mounts[p] = { - "kind": "collection", - "portable_data_hash": pdh[5:] - } - dirs.add(pdh) - - for f in self.pathmapper.files(): - res, p, tp, stg = self.pathmapper.mapper(f) - if res.startswith("keep:"): - res = res[5:] - elif res.startswith("/keep/"): - res = res[6:] - else: + rf = [self.pathmapper.mapper(f) for f in self.pathmapper.referenced_files] + rf.sort(key=lambda k: k.resolved) + prevdir = None + for resolved, target, tp, stg in rf: + if not stg: continue - sp = res.split("/", 1) - pdh = sp[0] - if pdh not in dirs: - mounts[p] = { - "kind": "collection", - "portable_data_hash": pdh - } - if len(sp) == 2: - mounts[p]["path"] = urllib.unquote(sp[1]) + if prevdir and target.startswith(prevdir): + continue + if tp == "Directory": + targetdir = target + else: + targetdir = os.path.dirname(target) + sp = resolved.split("/", 1) + pdh = sp[0][5:] # remove "keep:" + mounts[targetdir] = { + "kind": "collection", + "portable_data_hash": pdh + } + if len(sp) == 2: + if tp == "Directory": + path = sp[1] + else: + path = os.path.dirname(sp[1]) + if path and path != "/": + mounts[targetdir]["path"] = path + prevdir = targetdir + "/" with Perf(metrics, "generatefiles %s" % self.name): if self.generatefiles["listing"]: @@ -238,6 +240,8 @@ class RunnerContainer(Runner): """ adjustDirObjs(self.job_order, trim_listing) + adjustFileObjs(self.job_order, trim_anonymous_location) + adjustDirObjs(self.job_order, trim_anonymous_location) container_req = { "owner_uuid": self.arvrunner.project_uuid, diff --git a/sdk/cwl/arvados_cwl/arvjob.py b/sdk/cwl/arvados_cwl/arvjob.py index 04a62953ba..e86f5055c5 100644 --- a/sdk/cwl/arvados_cwl/arvjob.py +++ b/sdk/cwl/arvados_cwl/arvjob.py @@ -9,7 +9,7 @@ from cwltool.errors import WorkflowException from cwltool.draft2tool import revmap_file, CommandLineTool from cwltool.load_tool import fetch_document from cwltool.builder import Builder -from cwltool.pathmapper import adjustDirObjs +from cwltool.pathmapper import adjustFileObjs, adjustDirObjs from schema_salad.sourceline import SourceLine @@ -18,8 +18,8 @@ import ruamel.yaml as yaml import arvados.collection from .arvdocker import arv_docker_get_image -from .runner import Runner, arvados_jobs_image, packed_workflow, trim_listing, upload_workflow_collection -from .pathmapper import VwdPathMapper +from .runner import Runner, arvados_jobs_image, packed_workflow, upload_workflow_collection, trim_anonymous_location +from .pathmapper import VwdPathMapper, trim_listing from .perf import Perf from . import done from ._version import __version__ @@ -254,6 +254,8 @@ class RunnerJob(Runner): self.job_order["cwl:tool"] = "%s/workflow.cwl#main" % wf_pdh adjustDirObjs(self.job_order, trim_listing) + adjustFileObjs(self.job_order, trim_anonymous_location) + adjustDirObjs(self.job_order, trim_anonymous_location) if self.output_name: self.job_order["arv:output_name"] = self.output_name diff --git a/sdk/cwl/arvados_cwl/arvworkflow.py b/sdk/cwl/arvados_cwl/arvworkflow.py index 4db1f4f2f4..22f662e00a 100644 --- a/sdk/cwl/arvados_cwl/arvworkflow.py +++ b/sdk/cwl/arvados_cwl/arvworkflow.py @@ -13,7 +13,8 @@ from cwltool.pathmapper import adjustFileObjs, adjustDirObjs import ruamel.yaml as yaml -from .runner import upload_dependencies, trim_listing, packed_workflow, upload_workflow_collection +from .runner import upload_dependencies, packed_workflow, upload_workflow_collection, trim_anonymous_location +from .pathmapper import trim_listing from .arvtool import ArvadosCommandTool from .perf import Perf @@ -26,6 +27,8 @@ def upload_workflow(arvRunner, tool, job_order, project_uuid, uuid=None, packed = packed_workflow(arvRunner, tool) adjustDirObjs(job_order, trim_listing) + adjustFileObjs(job_order, trim_anonymous_location) + adjustDirObjs(job_order, trim_anonymous_location) main = [p for p in packed["$graph"] if p["id"] == "#main"][0] for inp in main["inputs"]: diff --git a/sdk/cwl/arvados_cwl/pathmapper.py b/sdk/cwl/arvados_cwl/pathmapper.py index cddb4088b7..5e2ee46a87 100644 --- a/sdk/cwl/arvados_cwl/pathmapper.py +++ b/sdk/cwl/arvados_cwl/pathmapper.py @@ -14,6 +14,21 @@ from cwltool.workflow import WorkflowException logger = logging.getLogger('arvados.cwl-runner') +def trim_listing(obj): + """Remove 'listing' field from Directory objects that are keep references. + + When Directory objects represent Keep references, it is redundant and + potentially very expensive to pass fully enumerated Directory objects + between instances of cwl-runner (e.g. a submitting a job, or using the + RunInSingleContainer feature), so delete the 'listing' field when it is + safe to do so. + + """ + + if obj.get("location", "").startswith("keep:") and "listing" in obj: + del obj["listing"] + + class ArvPathMapper(PathMapper): """Convert container-local paths to and from Keep collection ids.""" @@ -27,6 +42,7 @@ class ArvPathMapper(PathMapper): self.collection_pattern = collection_pattern self.file_pattern = file_pattern self.name = name + self.referenced_files = [r["location"] for r in referenced_files] super(ArvPathMapper, self).__init__(referenced_files, input_basedir, None) def visit(self, srcobj, uploadfiles): diff --git a/sdk/cwl/arvados_cwl/runner.py b/sdk/cwl/arvados_cwl/runner.py index 57a672389c..69e4f5bd7b 100644 --- a/sdk/cwl/arvados_cwl/runner.py +++ b/sdk/cwl/arvados_cwl/runner.py @@ -23,24 +23,22 @@ import arvados.collection import ruamel.yaml as yaml from .arvdocker import arv_docker_get_image -from .pathmapper import ArvPathMapper +from .pathmapper import ArvPathMapper, trim_listing from ._version import __version__ from . import done logger = logging.getLogger('arvados.cwl-runner') -def trim_listing(obj): - """Remove 'listing' field from Directory objects that are keep references. +def trim_anonymous_location(obj): + """Remove 'location' field from File and Directory literals. + + To make internal handling easier, literals are assigned a random id for + 'location'. However, when writing the record back out, this can break + reproducibility. Since it is valid for literals not have a 'location' + field, remove it. - When Directory objects represent Keep references, it redundant and - potentially very expensive to pass fully enumerated Directory objects - between instances of cwl-runner (e.g. a submitting a job, or using the - RunInSingleContainer feature), so delete the 'listing' field when it is - safe to do so. """ - if obj.get("location", "").startswith("keep:") and "listing" in obj: - del obj["listing"] if obj.get("location", "").startswith("_:"): del obj["location"] diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py index 7786e7f625..b06eae8105 100644 --- a/sdk/cwl/tests/test_container.py +++ b/sdk/cwl/tests/test_container.py @@ -63,6 +63,7 @@ class TestContainer(unittest.TestCase): 'use_existing': enable_reuse, 'priority': 1, 'mounts': { + '/tmp': {'kind': 'tmp'}, '/var/spool/cwl': {'kind': 'tmp'} }, 'state': 'Committed', @@ -135,6 +136,7 @@ class TestContainer(unittest.TestCase): 'use_existing': True, 'priority': 1, 'mounts': { + '/tmp': {'kind': 'tmp'}, '/var/spool/cwl': {'kind': 'tmp'} }, 'state': 'Committed', @@ -240,6 +242,7 @@ class TestContainer(unittest.TestCase): 'use_existing': True, 'priority': 1, 'mounts': { + '/tmp': {'kind': 'tmp'}, '/var/spool/cwl': {'kind': 'tmp'}, '/var/spool/cwl/foo': { 'kind': 'collection', @@ -325,6 +328,7 @@ class TestContainer(unittest.TestCase): 'use_existing': True, 'priority': 1, 'mounts': { + '/tmp': {'kind': 'tmp'}, '/var/spool/cwl': {'kind': 'tmp'}, "stderr": { "kind": "file", -- 2.30.2