Merge branch '21666-provision-test-improvement'
[arvados.git] / sdk / python / arvados / commands / keepdocker.py
index f3df4e3f393bc15090dab3de80def1b50c595cc3..6823ee1beada080526c9a9aa901d752e7b5aefd9 100644 (file)
@@ -2,37 +2,29 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from builtins import next
 import argparse
 import collections
 import datetime
 import errno
+import fcntl
 import json
+import logging
 import os
 import re
+import subprocess
 import sys
 import tarfile
 import tempfile
-import shutil
-import _strptime
-import fcntl
+
+import ciso8601
 from operator import itemgetter
 from stat import *
 
-if os.name == "posix" and sys.version_info[0] < 3:
-    import subprocess32 as subprocess
-else:
-    import subprocess
-
 import arvados
+import arvados.config
 import arvados.util
 import arvados.commands._util as arv_cmd
 import arvados.commands.put as arv_put
-from arvados.collection import CollectionReader
-import ciso8601
-import logging
-import arvados.config
-
 from arvados._version import __version__
 
 logger = logging.getLogger('arvados.keepdocker')
@@ -85,11 +77,12 @@ class DockerError(Exception):
 def popen_docker(cmd, *args, **kwargs):
     manage_stdin = ('stdin' not in kwargs)
     kwargs.setdefault('stdin', subprocess.PIPE)
-    kwargs.setdefault('stdout', sys.stderr)
+    kwargs.setdefault('stdout', subprocess.PIPE)
+    kwargs.setdefault('stderr', subprocess.PIPE)
     try:
-        docker_proc = subprocess.Popen(['docker.io'] + cmd, *args, **kwargs)
-    except OSError:  # No docker.io in $PATH
         docker_proc = subprocess.Popen(['docker'] + cmd, *args, **kwargs)
+    except OSError:  # No docker in $PATH, try docker.io
+        docker_proc = subprocess.Popen(['docker.io'] + cmd, *args, **kwargs)
     if manage_stdin:
         docker_proc.stdin.close()
     return docker_proc
@@ -146,20 +139,18 @@ def docker_images():
     check_docker(list_proc, "images")
 
 def find_image_hashes(image_search, image_tag=None):
-    # Given one argument, search for Docker images with matching hashes,
-    # and return their full hashes in a set.
-    # Given two arguments, also search for a Docker image with the
-    # same repository and tag.  If one is found, return its hash in a
-    # set; otherwise, fall back to the one-argument hash search.
-    # Returns None if no match is found, or a hash search is ambiguous.
-    hash_search = image_search.lower()
-    hash_matches = set()
-    for image in docker_images():
-        if (image.repo == image_search) and (image.tag == image_tag):
-            return set([image.hash])
-        elif image.hash.startswith(hash_search):
-            hash_matches.add(image.hash)
-    return hash_matches
+    # Query for Docker images with the repository and tag and return
+    # the image ids in a list.  Returns an empty list if no match is
+    # found.
+
+    list_proc = popen_docker(['inspect', "%s%s" % (image_search, ":"+image_tag if image_tag else "")], stdout=subprocess.PIPE)
+
+    inspect = list_proc.stdout.read()
+    list_proc.stdout.close()
+
+    imageinfo = json.loads(inspect)
+
+    return [i["Id"] for i in imageinfo]
 
 def find_one_image_hash(image_search, image_tag=None):
     hashes = find_image_hashes(image_search, image_tag)
@@ -241,8 +232,9 @@ def docker_link_sort_key(link):
     return (image_timestamp, created_timestamp)
 
 def _get_docker_links(api_client, num_retries, **kwargs):
-    links = arvados.util.list_all(api_client.links().list,
-                                  num_retries, **kwargs)
+    links = list(arvados.util.keyset_list_all(
+        api_client.links().list, num_retries=num_retries, **kwargs,
+    ))
     for link in links:
         link['_sort_key'] = docker_link_sort_key(link)
     links.sort(key=itemgetter('_sort_key'), reverse=True)
@@ -259,7 +251,7 @@ def _new_image_listing(link, dockerhash, repo='<none>', tag='<none>'):
         'tag': tag,
         }
 
-def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None):
+def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None, project_uuid=None):
     """List all Docker images known to the api_client with image_name and
     image_tag.  If no image_name is given, defaults to listing all
     Docker images.
@@ -274,13 +266,18 @@ def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None)
     search_filters = []
     repo_links = None
     hash_links = None
+
+    project_filter = []
+    if project_uuid is not None:
+        project_filter = [["owner_uuid", "=", project_uuid]]
+
     if image_name:
         # Find images with the name the user specified.
         search_links = _get_docker_links(
             api_client, num_retries,
             filters=[['link_class', '=', 'docker_image_repo+tag'],
                      ['name', '=',
-                      '{}:{}'.format(image_name, image_tag or 'latest')]])
+                      '{}:{}'.format(image_name, image_tag or 'latest')]]+project_filter)
         if search_links:
             repo_links = search_links
         else:
@@ -288,7 +285,7 @@ def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None)
             search_links = _get_docker_links(
                 api_client, num_retries,
                 filters=[['link_class', '=', 'docker_image_hash'],
-                         ['name', 'ilike', image_name + '%']])
+                         ['name', 'ilike', image_name + '%']]+project_filter)
             hash_links = search_links
         # Only list information about images that were found in the search.
         search_filters.append(['head_uuid', 'in',
@@ -300,7 +297,7 @@ def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None)
     if hash_links is None:
         hash_links = _get_docker_links(
             api_client, num_retries,
-            filters=search_filters + [['link_class', '=', 'docker_image_hash']])
+            filters=search_filters + [['link_class', '=', 'docker_image_hash']]+project_filter)
     hash_link_map = {link['head_uuid']: link for link in reversed(hash_links)}
 
     # Each collection may have more than one name (though again, one name
@@ -310,7 +307,7 @@ def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None)
         repo_links = _get_docker_links(
             api_client, num_retries,
             filters=search_filters + [['link_class', '=',
-                                       'docker_image_repo+tag']])
+                                       'docker_image_repo+tag']]+project_filter)
     seen_image_names = collections.defaultdict(set)
     images = []
     for link in repo_links:
@@ -322,7 +319,7 @@ def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None)
             dockerhash = hash_link_map[collection_uuid]['name']
         except KeyError:
             dockerhash = '<unknown>'
-        name_parts = link['name'].split(':', 1)
+        name_parts = link['name'].rsplit(':', 1)
         images.append(_new_image_listing(link, dockerhash, *name_parts))
 
     # Find any image hash links that did not have a corresponding name link,
@@ -336,10 +333,12 @@ def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None)
         images.sort(key=itemgetter('_sort_key'), reverse=True)
 
     # Remove any image listings that refer to unknown collections.
-    existing_coll_uuids = {coll['uuid'] for coll in arvados.util.list_all(
-            api_client.collections().list, num_retries,
-            filters=[['uuid', 'in', [im['collection'] for im in images]]],
-            select=['uuid'])}
+    existing_coll_uuids = {coll['uuid'] for coll in arvados.util.keyset_list_all(
+        api_client.collections().list,
+        num_retries=num_retries,
+        filters=[['uuid', 'in', [im['collection'] for im in images]]]+project_filter,
+        select=['uuid'],
+    )}
     return [(image['collection'], image) for image in images
             if image['collection'] in existing_coll_uuids]
 
@@ -352,10 +351,29 @@ def _uuid2pdh(api, uuid):
         select=['portable_data_hash'],
     ).execute()['items'][0]['portable_data_hash']
 
+def load_image_metadata(image_file):
+    """Load an image manifest and config from an archive
+
+    Given an image archive as an open binary file object, this function loads
+    the image manifest and configuration, deserializing each from JSON and
+    returning them in a 2-tuple of dicts.
+    """
+    image_file.seek(0)
+    with tarfile.open(fileobj=image_file) as image_tar:
+        with image_tar.extractfile('manifest.json') as manifest_file:
+            image_manifest_list = json.load(manifest_file)
+        # Because arv-keepdocker only saves one image, there should only be
+        # one manifest.  This extracts that from the list and raises
+        # ValueError if there's not exactly one.
+        image_manifest, = image_manifest_list
+        with image_tar.extractfile(image_manifest['Config']) as config_file:
+            image_config = json.load(config_file)
+    return image_manifest, image_config
+
 def main(arguments=None, stdout=sys.stdout, install_sig_handlers=True, api=None):
     args = arg_parser.parse_args(arguments)
     if api is None:
-        api = arvados.api('v1')
+        api = arvados.api('v1', num_retries=args.retries)
 
     if args.image is None or args.image == 'images':
         fmt = "{:30}  {:10}  {:12}  {:29}  {:20}\n"
@@ -382,23 +400,40 @@ def main(arguments=None, stdout=sys.stdout, install_sig_handlers=True, api=None)
     elif args.tag is None:
         args.tag = 'latest'
 
+    if '/' in args.image:
+        hostport, path = args.image.split('/', 1)
+        if hostport.endswith(':443'):
+            # "docker pull host:443/asdf" transparently removes the
+            # :443 (which is redundant because https is implied) and
+            # after it succeeds "docker images" will list "host/asdf",
+            # not "host:443/asdf".  If we strip the :443 then the name
+            # doesn't change underneath us.
+            args.image = '/'.join([hostport[:-4], path])
+
     # Pull the image if requested, unless the image is specified as a hash
     # that we already have.
     if args.pull and not find_image_hashes(args.image):
         pull_image(args.image, args.tag)
 
+    images_in_arv = list_images_in_arv(api, args.retries, args.image, args.tag)
+
+    image_hash = None
     try:
         image_hash = find_one_image_hash(args.image, args.tag)
+        if not docker_image_compatible(api, image_hash):
+            if args.force_image_format:
+                logger.warning("forcing incompatible image")
+            else:
+                logger.error("refusing to store " \
+                    "incompatible format (use --force-image-format to override)")
+                sys.exit(1)
     except DockerError as error:
-        logger.error(str(error))
-        sys.exit(1)
-
-    if not docker_image_compatible(api, image_hash):
-        if args.force_image_format:
-            logger.warning("forcing incompatible image")
+        if images_in_arv:
+            # We don't have Docker / we don't have the image locally,
+            # use image that's already uploaded to Arvados
+            image_hash = images_in_arv[0][1]['dockerhash']
         else:
-            logger.error("refusing to store " \
-                "incompatible format (use --force-image-format to override)")
+            logger.error(str(error))
             sys.exit(1)
 
     image_repo_tag = '{}:{}'.format(args.image, args.tag) if not image_hash.startswith(args.image.lower()) else None
@@ -511,21 +546,9 @@ def main(arguments=None, stdout=sys.stdout, install_sig_handlers=True, api=None)
         # Managed properties could be already set
         coll_properties = api.collections().get(uuid=coll_uuid).execute(num_retries=args.retries).get('properties', {})
         coll_properties.update({"docker-image-repo-tag": image_repo_tag})
-
         api.collections().update(uuid=coll_uuid, body={"properties": coll_properties}).execute(num_retries=args.retries)
 
-        # Read the image metadata and make Arvados links from it.
-        image_file.seek(0)
-        image_tar = tarfile.open(fileobj=image_file)
-        image_hash_type, _, raw_image_hash = image_hash.rpartition(':')
-        if image_hash_type:
-            json_filename = raw_image_hash + '.json'
-        else:
-            json_filename = raw_image_hash + '/json'
-        json_file = image_tar.extractfile(image_tar.getmember(json_filename))
-        image_metadata = json.loads(json_file.read().decode('utf-8'))
-        json_file.close()
-        image_tar.close()
+        _, image_metadata = load_image_metadata(image_file)
         link_base = {'head_uuid': coll_uuid, 'properties': {}}
         if 'created' in image_metadata:
             link_base['properties']['image_timestamp'] = image_metadata['created']