15 from operator import itemgetter
20 import arvados.commands._util as arv_cmd
21 import arvados.commands.put as arv_put
24 from arvados._version import __version__
# Sentinel timestamp: sorts before any real image_timestamp, used as the
# fallback sort key in docker_link_sort_key below.
26 EARLIEST_DATETIME = datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)
# Exceptions that can occur while reading/writing the .stat cache file;
# all of them are treated as "no usable cache".
27 STAT_CACHE_ERRORS = (IOError, OSError, ValueError)

# One parsed row of `docker images --no-trunc` output (see docker_images()).
29 DockerImage = collections.namedtuple(
30 'DockerImage', ['repo', 'tag', 'hash', 'created', 'vsize'])

# keepdocker-specific options. add_help=False because this parser is used
# as a parent of arg_parser below, which supplies -h/--help itself.
32 keepdocker_parser = argparse.ArgumentParser(add_help=False)
33 keepdocker_parser.add_argument(
34 '--version', action='version', version="%s %s" % (sys.argv[0], __version__),
35 help='Print version and exit.')
36 keepdocker_parser.add_argument(
37 '-f', '--force', action='store_true', default=False,
38 help="Re-upload the image even if it already exists on the server")

# --pull and --no-pull are mutually exclusive; both store into args.pull.
# NOTE(review): the _group.add_argument( call lines are not visible in this
# excerpt; the option bodies below belong to _group.
40 _group = keepdocker_parser.add_mutually_exclusive_group()
42 '--pull', action='store_true', default=False,
43 help="Try to pull the latest image from Docker registry")
45 '--no-pull', action='store_false', dest='pull',
46 help="Use locally installed image only, don't pull image from Docker registry (default)")

# Positional arguments: image (name or hash; its add_argument line for the
# name itself is not visible here) and an optional tag.
48 keepdocker_parser.add_argument(
50 help="Docker image to upload, as a repository name or hash")
51 keepdocker_parser.add_argument(
52 'tag', nargs='?', default='latest',
53 help="Tag of the Docker image to upload (default 'latest')")

55 # Combine keepdocker options listed above with run_opts options of arv-put.
56 # The options inherited from arv-put include --name, --project-uuid,
57 # --progress/--no-progress/--batch-progress and --resume/--no-resume.
58 arg_parser = argparse.ArgumentParser(
59 description="Upload or list Docker images in Arvados",
60 parents=[keepdocker_parser, arv_put.run_opts, arv_cmd.retry_opt])
# Raised for docker CLI failures and image-lookup problems in this module
# (see check_docker / find_one_image_hash). Class body not visible here.
62 class DockerError(Exception):
# Launch the docker CLI with the given subcommand list; returns the Popen.
66 def popen_docker(cmd, *args, **kwargs):
# Remember whether the caller supplied stdin; if not, we own the pipe and
# close it immediately (line 75) so docker never blocks waiting for input.
67 manage_stdin = ('stdin' not in kwargs)
68 kwargs.setdefault('stdin', subprocess.PIPE)
# Route docker's progress chatter to stderr so it can't pollute our stdout.
69 kwargs.setdefault('stdout', sys.stderr)
# Prefer the Debian-style 'docker.io' binary, fall back to plain 'docker'.
# (The try: line is elided from this excerpt.)
71 docker_proc = subprocess.Popen(['docker.io'] + cmd, *args, **kwargs)
72 except OSError: # No docker.io in $PATH
73 docker_proc = subprocess.Popen(['docker'] + cmd, *args, **kwargs)
75 docker_proc.stdin.close()
# Wait for a docker subprocess (the proc.wait() line is elided from this
# excerpt) and raise DockerError if it exited with a nonzero status.
# `description` names the docker subcommand for the error message.
78 def check_docker(proc, description):
80 if proc.returncode != 0:
81 raise DockerError("docker {} returned status code {}".
82 format(description, proc.returncode))
# NOTE(review): this is the body of the docker_images() generator; its def
# line and the `words = line.split()` line are not visible in this excerpt.
85 # Yield a DockerImage tuple for each installed image.
86 list_proc = popen_docker(['images', '--no-trunc'], stdout=subprocess.PIPE)
87 list_output = iter(list_proc.stdout)
88 next(list_output) # Ignore the header line
89 for line in list_output:
# The CREATED column contains spaces ("2 weeks ago"), so rejoin the middle
# words; the final two words form the SIZE column.
91 size_index = len(words) - 2
92 repo, tag, imageid = words[:3]
93 ctime = ' '.join(words[3:size_index])
94 vsize = ' '.join(words[size_index:])
95 yield DockerImage(repo, tag, imageid, ctime, vsize)
96 list_proc.stdout.close()
# Raises DockerError if `docker images` itself failed.
97 check_docker(list_proc, "images")
99 def find_image_hashes(image_search, image_tag=None):
100 # Given one argument, search for Docker images with matching hashes,
101 # and return their full hashes in a set.
102 # Given two arguments, also search for a Docker image with the
103 # same repository and tag. If one is found, return its hash in a
104 # set; otherwise, fall back to the one-argument hash search.
105 # Returns None if no match is found, or a hash search is ambiguous.
# Hash prefixes are matched case-insensitively against the lowercased input.
106 hash_search = image_search.lower()
# (The hash_matches set initialization and final return are elided here.)
108 for image in docker_images():
# An exact repo:tag match wins immediately; otherwise accumulate every
# locally installed hash that starts with the search prefix.
109 if (image.repo == image_search) and (image.tag == image_tag):
110 return set([image.hash])
111 elif image.hash.startswith(hash_search):
112 hash_matches.add(image.hash)
# Resolve image_search (repo name or hash prefix) to exactly one image
# hash. Raises DockerError when nothing matches or the match is ambiguous.
# (The hash_count == 1 success branch is elided from this excerpt.)
115 def find_one_image_hash(image_search, image_tag=None):
116 hashes = find_image_hashes(image_search, image_tag)
117 hash_count = len(hashes)
120 elif hash_count == 0:
121 raise DockerError("no matching image found")
123 raise DockerError("{} images match {}".format(hash_count, image_search))
def stat_cache_name(image_file):
    # Return the path of the stat cache file associated with image_file.
    # Accepts either an open file object (its .name attribute is used)
    # or a plain path string.
    base_name = getattr(image_file, 'name', image_file)
    return base_name + '.stat'
# Run `docker pull image_name:image_tag`; check_docker raises DockerError
# if the pull fails. (The trailing description argument line is elided.)
128 def pull_image(image_name, image_tag):
129 check_docker(popen_docker(['pull', '{}:{}'.format(image_name, image_tag)]),
132 def save_image(image_hash, image_file):
133 # Save the specified Docker image to image_file, then try to save its
134 # stats so we can try to resume after interruption.
135 check_docker(popen_docker(['save', image_hash], stdout=image_file),
# Cache the saved file's os.fstat tuple next to it (see stat_cache_name);
# prep_image_file compares it later to decide whether a re-save is needed.
# (The flush and try: lines are elided from this excerpt.)
139 with open(stat_cache_name(image_file), 'w') as statfile:
140 json.dump(tuple(os.fstat(image_file.fileno())), statfile)
# Best-effort only: any stat-cache failure merely disables resuming.
141 except STAT_CACHE_ERRORS:
142 pass # We won't resume from this cache. No big deal.
144 def prep_image_file(filename):
145 # Return a file object ready to save a Docker image,
146 # and a boolean indicating whether or not we need to actually save the
147 # image (False if a cached save is available).
148 cache_dir = arv_cmd.make_home_conf_dir(
149 os.path.join('.cache', 'arvados', 'docker'), 0o700)
# No usable cache directory: fall back to a throwaway temp file, which
# always requires a fresh save. (The need_save assignment for this branch
# is elided from this excerpt.)
150 if cache_dir is None:
151 image_file = tempfile.NamedTemporaryFile(suffix='.tar')
154 file_path = os.path.join(cache_dir, filename)
# Compare the stat tuple cached by save_image against the file's current
# mtime/size; any mismatch or read error forces a re-save. (try: elided.)
156 with open(stat_cache_name(file_path)) as statfile:
157 prev_stat = json.load(statfile)
158 now_stat = os.stat(file_path)
159 need_save = any(prev_stat[field] != now_stat[field]
160 for field in [ST_MTIME, ST_SIZE])
161 except STAT_CACHE_ERRORS + (AttributeError, IndexError):
162 need_save = True # We couldn't compare against old stats
# w+b when we must rewrite the image; rb when the cached copy is reusable.
163 image_file = open(file_path, 'w+b' if need_save else 'rb')
164 return image_file, need_save
def make_link(api_client, num_retries, link_class, link_name, **link_attrs):
    # Create an Arvados link record of the given class and name and return
    # the created record. Extra keyword arguments become additional link
    # attributes (e.g. head_uuid, owner_uuid, properties).
    link_attrs['link_class'] = link_class
    link_attrs['name'] = link_name
    create_request = api_client.links().create(body=link_attrs)
    return create_request.execute(num_retries=num_retries)
171 def docker_link_sort_key(link):
172 """Build a sort key to find the latest available Docker image.
174 To find one source collection for a Docker image referenced by
175 name or image id, the API server looks for a link with the most
176 recent `image_timestamp` property; then the most recent
177 `created_at` timestamp. This method generates a sort key for
178 Docker metadata links to sort them from least to most preferred.
# A missing or unparseable image_timestamp property gets the
# EARLIEST_DATETIME sentinel, so it sorts before every real timestamp.
# (The try: line is elided from this excerpt.)
181 image_timestamp = ciso8601.parse_datetime_unaware(
182 link['properties']['image_timestamp'])
183 except (KeyError, ValueError):
184 image_timestamp = EARLIEST_DATETIME
# Secondary key: the link record's own creation time.
185 return (image_timestamp,
186 ciso8601.parse_datetime_unaware(link['created_at']))
# Fetch all link records matching **kwargs filters, decorate each with a
# '_sort_key' (see docker_link_sort_key), and sort most-preferred first.
# (The loop header and return statement are elided from this excerpt.)
188 def _get_docker_links(api_client, num_retries, **kwargs):
189 links = arvados.util.list_all(api_client.links().list,
190 num_retries, **kwargs)
192 link['_sort_key'] = docker_link_sort_key(link)
193 links.sort(key=itemgetter('_sort_key'), reverse=True)
# Build one image-listing dict from a link record previously decorated by
# _get_docker_links. (The `return {` line and the dict's repo/tag entries
# are elided from this excerpt.)
196 def _new_image_listing(link, dockerhash, repo='<none>', tag='<none>'):
# If the link had no real image_timestamp (its sort key starts with the
# EARLIEST_DATETIME sentinel), report the link's created_at time instead.
197 timestamp_index = 1 if (link['_sort_key'][0] is EARLIEST_DATETIME) else 0
199 '_sort_key': link['_sort_key'],
200 'timestamp': link['_sort_key'][timestamp_index],
201 'collection': link['head_uuid'],
202 'dockerhash': dockerhash,
207 def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None):
208 """List all Docker images known to the api_client with image_name and
209 image_tag. If no image_name is given, defaults to listing all
212 Returns a list of tuples representing matching Docker images,
213 sorted in preference order (i.e. the first collection in the list
214 is the one that the API server would use). Each tuple is a
215 (collection_uuid, collection_info) pair, where collection_info is
216 a dict with fields "dockerhash", "repo", "tag", and "timestamp".
# NOTE(review): the initializations of search_filters, hash_links,
# repo_links and images, plus several control-flow lines, are elided from
# this excerpt; comments below describe the visible logic only.
223 # Find images with the name the user specified.
224 search_links = _get_docker_links(
225 api_client, num_retries,
226 filters=[['link_class', '=', 'docker_image_repo+tag'],
228 '{}:{}'.format(image_name, image_tag or 'latest')]])
230 repo_links = search_links
232 # Fall back to finding images with the specified image hash.
233 search_links = _get_docker_links(
234 api_client, num_retries,
235 filters=[['link_class', '=', 'docker_image_hash'],
236 ['name', 'ilike', image_name + '%']])
237 hash_links = search_links
238 # Only list information about images that were found in the search.
239 search_filters.append(['head_uuid', 'in',
240 [link['head_uuid'] for link in search_links]])
242 # It should be reasonable to expect that each collection only has one
243 # image hash (though there may be many links specifying this). Find
244 # the API server's most preferred image hash link for each collection.
245 if hash_links is None:
246 hash_links = _get_docker_links(
247 api_client, num_retries,
248 filters=search_filters + [['link_class', '=', 'docker_image_hash']])
# reversed() so that, for duplicate head_uuids, the most-preferred link
# (sorted first by _get_docker_links) wins the dict insertion.
249 hash_link_map = {link['head_uuid']: link for link in reversed(hash_links)}
251 # Each collection may have more than one name (though again, one name
252 # may be specified more than once). Build an image listing from name
253 # tags, sorted by API server preference.
254 if repo_links is None:
255 repo_links = _get_docker_links(
256 api_client, num_retries,
257 filters=search_filters + [['link_class', '=',
258 'docker_image_repo+tag']])
259 seen_image_names = collections.defaultdict(set)
261 for link in repo_links:
262 collection_uuid = link['head_uuid']
# Skip duplicate repo:tag names pointing at the same collection.
263 if link['name'] in seen_image_names[collection_uuid]:
265 seen_image_names[collection_uuid].add(link['name'])
267 dockerhash = hash_link_map[collection_uuid]['name']
269 dockerhash = '<unknown>'
# Split "repo:tag" into at most two parts for _new_image_listing.
270 name_parts = link['name'].split(':', 1)
271 images.append(_new_image_listing(link, dockerhash, *name_parts))
273 # Find any image hash links that did not have a corresponding name link,
274 # and add image listings for them, retaining the API server preference
276 images_start_size = len(images)
# NOTE(review): .iteritems() is Python 2-only syntax.
277 for collection_uuid, link in hash_link_map.iteritems():
278 if not seen_image_names[collection_uuid]:
279 images.append(_new_image_listing(link, link['name']))
# Re-sort only if the hash-only listings actually added anything.
280 if len(images) > images_start_size:
281 images.sort(key=itemgetter('_sort_key'), reverse=True)
283 # Remove any image listings that refer to unknown collections.
284 existing_coll_uuids = {coll['uuid'] for coll in arvados.util.list_all(
285 api_client.collections().list, num_retries,
286 filters=[['uuid', 'in', [im['collection'] for im in images]]],
288 return [(image['collection'], image) for image in images
289 if image['collection'] in existing_coll_uuids]
def items_owned_by(owner_uuid, arv_items):
    # Lazily yield only the Arvados records in arv_items whose
    # 'owner_uuid' field equals owner_uuid.
    for item in arv_items:
        if item['owner_uuid'] == owner_uuid:
            yield item
# CLI entry point: list images in Arvados, or upload one Docker image and
# create its docker_image_hash / docker_image_repo+tag metadata links.
# NOTE(review): many lines of this function (try:/else:/return statements,
# blank-line separators) are elided from this excerpt.
294 def main(arguments=None, stdout=sys.stdout):
295 args = arg_parser.parse_args(arguments)
296 api = arvados.api('v1')
# List mode: no image argument, or the literal word "images".
298 if args.image is None or args.image == 'images':
299 fmt = "{:30} {:10} {:12} {:29} {:20}\n"
300 stdout.write(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
301 for i, j in list_images_in_arv(api, args.retries):
302 stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
305 # Pull the image if requested, unless the image is specified as a hash
306 # that we already have.
307 if args.pull and not find_image_hashes(args.image):
308 pull_image(args.image, args.tag)
311 image_hash = find_one_image_hash(args.image, args.tag)
# NOTE(review): Python 2-only syntax below (print >> and error.message).
312 except DockerError as error:
313 print >>sys.stderr, "arv-keepdocker:", error.message
# Only record a repo:tag when the user gave a name, not a bare hash prefix.
316 image_repo_tag = '{}:{}'.format(args.image, args.tag) if not image_hash.startswith(args.image.lower()) else None
# Default collection name from the repo:tag and abbreviated hash.
318 if args.name is None:
320 collection_name = 'Docker image {} {}'.format(image_repo_tag, image_hash[0:12])
322 collection_name = 'Docker image {}'.format(image_hash[0:12])
324 collection_name = args.name
327 # Check if this image is already in Arvados.
329 # Project where everything should be owned
330 if args.project_uuid:
331 parent_project_uuid = args.project_uuid
333 parent_project_uuid = api.users().current().execute(
334 num_retries=args.retries)['uuid']
336 # Find image hash tags
337 existing_links = _get_docker_links(
339 filters=[['link_class', '=', 'docker_image_hash'],
340 ['name', '=', image_hash]])
342 # get readable collections
343 collections = api.collections().list(
344 filters=[['uuid', 'in', [link['head_uuid'] for link in existing_links]]],
345 select=["uuid", "owner_uuid", "name", "manifest_text"]
346 ).execute(num_retries=args.retries)['items']
349 # check for repo+tag links on these collections
351 existing_repo_tag = _get_docker_links(
353 filters=[['link_class', '=', 'docker_image_repo+tag'],
354 ['name', '=', image_repo_tag],
# NOTE(review): `collections` here is a list of record dicts; presumably
# this filter should use their 'uuid' values — confirm against API usage.
355 ['head_uuid', 'in', collections]])
357 existing_repo_tag = []
# Reuse a matching collection already owned by the project, if any.
360 coll_uuid = next(items_owned_by(parent_project_uuid, collections))['uuid']
361 except StopIteration:
362 # create new collection owned by the project
363 coll_uuid = api.collections().create(
364 body={"manifest_text": collections[0]['manifest_text'],
365 "name": collection_name,
366 "owner_uuid": parent_project_uuid},
367 ensure_unique_name=True
368 ).execute(num_retries=args.retries)['uuid']
370 link_base = {'owner_uuid': parent_project_uuid,
371 'head_uuid': coll_uuid,
372 'properties': existing_links[0]['properties']}
374 if not any(items_owned_by(parent_project_uuid, existing_links)):
375 # create image link owned by the project
376 make_link(api, args.retries,
377 'docker_image_hash', image_hash, **link_base)
379 if image_repo_tag and not any(items_owned_by(parent_project_uuid, existing_repo_tag)):
380 # create repo+tag link owned by the project
381 make_link(api, args.retries, 'docker_image_repo+tag',
382 image_repo_tag, **link_base)
384 stdout.write(coll_uuid + "\n")
388 # Open a file for the saved image, and write it if needed.
389 outfile_name = '{}.tar'.format(image_hash)
390 image_file, need_save = prep_image_file(outfile_name)
392 save_image(image_hash, image_file)
394 # Call arv-put with switches we inherited from it
395 # (a.k.a., switches that aren't our own).
396 put_args = keepdocker_parser.parse_known_args(arguments)[1]
397 # Propagate our computed collection name unless the user set one.
398 if args.name is None:
399 put_args += ['--name', collection_name]
401 coll_uuid = arv_put.main(
402 put_args + ['--filename', outfile_name, image_file.name], stdout=stdout).strip()
404 # Read the image metadata and make Arvados links from it.
406 image_tar = tarfile.open(fileobj=image_file)
# image_hash may be "type:hex" (e.g. sha256:...) or bare hex; the tar
# member holding the metadata differs between the two layouts.
407 image_hash_type, _, raw_image_hash = image_hash.rpartition(':')
409 json_filename = raw_image_hash + '.json'
411 json_filename = raw_image_hash + '/json'
412 json_file = image_tar.extractfile(image_tar.getmember(json_filename))
413 image_metadata = json.load(json_file)
416 link_base = {'head_uuid': coll_uuid, 'properties': {}}
# The image's creation time becomes the image_timestamp link property,
# which docker_link_sort_key uses for preference ordering.
417 if 'created' in image_metadata:
418 link_base['properties']['image_timestamp'] = image_metadata['created']
419 if args.project_uuid is not None:
420 link_base['owner_uuid'] = args.project_uuid
422 make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
424 make_link(api, args.retries,
425 'docker_image_repo+tag', image_repo_tag, **link_base)
# Clean up the cached tar and its stat file; ENOENT is ignored since the
# files may legitimately not exist.
429 for filename in [stat_cache_name(image_file), image_file.name]:
432 except OSError as error:
433 if error.errno != errno.ENOENT:
436 if __name__ == '__main__':