18947: Merge branch 'main'
[arvados.git] / sdk / cwl / arvados_cwl / arvdocker.py
index 7f6ab587d323a7dc65e39c00e8e1b38d019f009d..cf0b3b9daff3deb02a84fe6e41428e04cdefca4e 100644 (file)
@@ -1,6 +1,15 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 import sys
 import threading
+import copy
+import re
+import subprocess
+
+from schema_salad.sourceline import SourceLine
 
 import cwltool.docker
 from cwltool.errors import WorkflowException
@@ -11,11 +20,54 @@ logger = logging.getLogger('arvados.cwl-runner')
 cached_lookups = {}
 cached_lookups_lock = threading.Lock()
 
-def arv_docker_get_image(api_client, dockerRequirement, pull_image, project_uuid):
+def determine_image_id(dockerImageId):
+    for line in (
+        subprocess.check_output(  # nosec
+            ["docker", "images", "--no-trunc", "--all"]
+        )
+        .decode("utf-8")
+        .splitlines()
+    ):
+        try:
+            match = re.match(r"^([^ ]+)\s+([^ ]+)\s+([^ ]+)", line)
+            split = dockerImageId.split(":")
+            if len(split) == 1:
+                split.append("latest")
+            elif len(split) == 2:
+                #  if split[1] doesn't  match valid tag names, it is a part of repository
+                if not re.match(r"[\w][\w.-]{0,127}", split[1]):
+                    split[0] = split[0] + ":" + split[1]
+                    split[1] = "latest"
+            elif len(split) == 3:
+                if re.match(r"[\w][\w.-]{0,127}", split[2]):
+                    split[0] = split[0] + ":" + split[1]
+                    split[1] = split[2]
+                    del split[2]
+
+            # check for repository:tag match or image id match
+            if match and (
+                (split[0] == match.group(1) and split[1] == match.group(2))
+                or dockerImageId == match.group(3)
+            ):
+                return match.group(3)
+        except ValueError:
+            pass
+
+    return None
+
+
+def arv_docker_get_image(api_client, dockerRequirement, pull_image, project_uuid,
+                         force_pull, tmp_outdir_prefix, match_local_docker, copy_deps):
     """Check if a Docker image is available in Keep, if not, upload it using arv-keepdocker."""
 
+    if "http://arvados.org/cwl#dockerCollectionPDH" in dockerRequirement:
+        return dockerRequirement["http://arvados.org/cwl#dockerCollectionPDH"]
+
     if "dockerImageId" not in dockerRequirement and "dockerPull" in dockerRequirement:
+        dockerRequirement = copy.deepcopy(dockerRequirement)
         dockerRequirement["dockerImageId"] = dockerRequirement["dockerPull"]
+        if hasattr(dockerRequirement, 'lc'):
+            dockerRequirement.lc.data["dockerImageId"] = dockerRequirement.lc.data["dockerPull"]
 
     global cached_lookups
     global cached_lookups_lock
@@ -23,43 +75,86 @@ def arv_docker_get_image(api_client, dockerRequirement, pull_image, project_uuid
         if dockerRequirement["dockerImageId"] in cached_lookups:
             return cached_lookups[dockerRequirement["dockerImageId"]]
 
-    sp = dockerRequirement["dockerImageId"].split(":")
-    image_name = sp[0]
-    image_tag = sp[1] if len(sp) > 1 else None
-
-    images = arvados.commands.keepdocker.list_images_in_arv(api_client, 3,
-                                                            image_name=image_name,
-                                                            image_tag=image_tag)
-
-    if not images:
-        # Fetch Docker image if necessary.
-        cwltool.docker.get_image(dockerRequirement, pull_image)
-
-        # Upload image to Arvados
-        args = []
-        if project_uuid:
-            args.append("--project-uuid="+project_uuid)
-        args.append(image_name)
-        if image_tag:
-            args.append(image_tag)
-        logger.info("Uploading Docker image %s", ":".join(args[1:]))
-        try:
-            arvados.commands.keepdocker.main(args, stdout=sys.stderr)
-        except SystemExit as e:
-            if e.code:
-                raise WorkflowException("keepdocker exited with code %s" % e.code)
+    with SourceLine(dockerRequirement, "dockerImageId", WorkflowException, logger.isEnabledFor(logging.DEBUG)):
+        sp = dockerRequirement["dockerImageId"].split(":")
+        image_name = sp[0]
+        image_tag = sp[1] if len(sp) > 1 else "latest"
 
-        images = arvados.commands.keepdocker.list_images_in_arv(api_client, 3,
+        out_of_project_images = arvados.commands.keepdocker.list_images_in_arv(api_client, 3,
                                                                 image_name=image_name,
-                                                                image_tag=image_tag)
-
-    if not images:
-        raise WorkflowException("Could not find Docker image %s:%s" % (image_name, image_tag))
-
-    pdh = api_client.collections().get(uuid=images[0][0]).execute()["portable_data_hash"]
-
-    with cached_lookups_lock:
-        cached_lookups[dockerRequirement["dockerImageId"]] = pdh
+                                                                image_tag=image_tag,
+                                                                project_uuid=None)
+
+        if copy_deps:
+            # Only images that are available in the destination project
+            images = arvados.commands.keepdocker.list_images_in_arv(api_client, 3,
+                                                                    image_name=image_name,
+                                                                    image_tag=image_tag,
+                                                                    project_uuid=project_uuid)
+        else:
+            images = out_of_project_images
+
+        if match_local_docker:
+            local_image_id = determine_image_id(dockerRequirement["dockerImageId"])
+            if local_image_id:
+                # find it in the list
+                found = False
+                for i in images:
+                    if i[1]["dockerhash"] == local_image_id:
+                        found = True
+                        images = [i]
+                        break
+                if not found:
+                    # force re-upload.
+                    images = []
+
+                for i in out_of_project_images:
+                    if i[1]["dockerhash"] == local_image_id:
+                        found = True
+                        out_of_project_images = [i]
+                        break
+                if not found:
+                    # force re-upload.
+                    out_of_project_images = []
+
+        if not images:
+            if not out_of_project_images:
+                # Fetch Docker image if necessary.
+                try:
+                    result = cwltool.docker.DockerCommandLineJob.get_image(dockerRequirement, pull_image,
+                                                                  force_pull, tmp_outdir_prefix)
+                    if not result:
+                        raise WorkflowException("Docker image '%s' not available" % dockerRequirement["dockerImageId"])
+                except OSError as e:
+                    raise WorkflowException("While trying to get Docker image '%s', failed to execute 'docker': %s" % (dockerRequirement["dockerImageId"], e))
+
+            # Upload image to Arvados
+            args = []
+            if project_uuid:
+                args.append("--project-uuid="+project_uuid)
+            args.append(image_name)
+            args.append(image_tag)
+            logger.info("Uploading Docker image %s:%s", image_name, image_tag)
+            try:
+                arvados.commands.put.api_client = api_client
+                arvados.commands.keepdocker.main(args, stdout=sys.stderr, install_sig_handlers=False, api=api_client)
+            except SystemExit as e:
+                # If e.code is None or zero, then keepdocker exited normally and we can continue
+                if e.code:
+                    raise WorkflowException("keepdocker exited with code %s" % e.code)
+
+            images = arvados.commands.keepdocker.list_images_in_arv(api_client, 3,
+                                                                    image_name=image_name,
+                                                                    image_tag=image_tag,
+                                                                    project_uuid=project_uuid)
+
+        if not images:
+            raise WorkflowException("Could not find Docker image %s:%s" % (image_name, image_tag))
+
+        pdh = api_client.collections().get(uuid=images[0][0]).execute()["portable_data_hash"]
+
+        with cached_lookups_lock:
+            cached_lookups[dockerRequirement["dockerImageId"]] = pdh
 
     return pdh