17 from operator import itemgetter
22 import arvados.commands._util as arv_cmd
23 import arvados.commands.put as arv_put
24 from arvados.collection import CollectionReader
29 from arvados._version import __version__
31 logger = logging.getLogger('arvados.keepdocker')
32 logger.setLevel(logging.DEBUG if arvados.config.get('ARVADOS_DEBUG')
# Lower bound used as a sort key when a Docker link has no usable
# `image_timestamp` property (see docker_link_sort_key below).
EARLIEST_DATETIME = datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)
# Exceptions that can occur while reading or writing the image stat cache;
# any of these simply disables resuming from the cached save.
STAT_CACHE_ERRORS = (IOError, OSError, ValueError)

# One row of `docker images` output: repository name, tag, full image hash,
# creation-time string, and virtual-size string.
DockerImage = collections.namedtuple(
    'DockerImage', ['repo', 'tag', 'hash', 'created', 'vsize'])
# Parser holding only the keepdocker-specific options.  add_help=False so
# it can be used as a parent of the combined arg_parser below.
keepdocker_parser = argparse.ArgumentParser(add_help=False)
keepdocker_parser.add_argument(
    '--version', action='version', version="%s %s" % (sys.argv[0], __version__),
    help='Print version and exit.')
keepdocker_parser.add_argument(
    '-f', '--force', action='store_true', default=False,
    help="Re-upload the image even if it already exists on the server")
keepdocker_parser.add_argument(
    '--force-image-format', action='store_true', default=False,
    help="Proceed even if the image format is not supported by the server")

# --pull and --no-pull are mutually exclusive and both store to args.pull.
_group = keepdocker_parser.add_mutually_exclusive_group()
    '--pull', action='store_true', default=False,
    help="Try to pull the latest image from Docker registry")
    '--no-pull', action='store_false', dest='pull',
    help="Use locally installed image only, don't pull image from Docker registry (default)")

keepdocker_parser.add_argument(
    help="Docker image to upload, as a repository name or hash")
keepdocker_parser.add_argument(
    'tag', nargs='?', default='latest',
    help="Tag of the Docker image to upload (default 'latest')")

# Combine keepdocker options listed above with run_opts options of arv-put.
# The options inherited from arv-put include --name, --project-uuid,
# --progress/--no-progress/--batch-progress and --resume/--no-resume.
arg_parser = argparse.ArgumentParser(
    description="Upload or list Docker images in Arvados",
    parents=[keepdocker_parser, arv_put.run_opts, arv_cmd.retry_opt])
# Raised when an underlying `docker` CLI invocation exits nonzero (see
# check_docker) or its output cannot be used.
class DockerError(Exception):
def popen_docker(cmd, *args, **kwargs):
    # Spawn a `docker` CLI subcommand via subprocess.Popen and return the
    # process object.  Tries the Debian-style `docker.io` executable first,
    # falling back to plain `docker` when that is not on $PATH.
    #
    # True when the caller did not supply stdin; in that case we create a
    # pipe below and close it ourselves after spawning.
    manage_stdin = ('stdin' not in kwargs)
    kwargs.setdefault('stdin', subprocess.PIPE)
    # Route docker's stdout to our stderr by default so it cannot pollute
    # this command's own stdout (e.g. the collection UUID printed by main).
    kwargs.setdefault('stdout', sys.stderr)
        docker_proc = subprocess.Popen(['docker.io'] + cmd, *args, **kwargs)
    except OSError:  # No docker.io in $PATH
        docker_proc = subprocess.Popen(['docker'] + cmd, *args, **kwargs)
        # Close the pipe we opened so the subprocess sees EOF immediately.
        docker_proc.stdin.close()
def check_docker(proc, description):
    # Raise DockerError if the docker subprocess exited nonzero;
    # `description` names the subcommand (e.g. "pull", "save") for the
    # error message.  Assumes proc has finished (returncode is set) —
    # presumably a wait happens just before this check; TODO confirm.
    if proc.returncode != 0:
        raise DockerError("docker {} returned status code {}".
                          format(description, proc.returncode))
def docker_image_format(image_hash):
    """Return the registry format ('v1' or 'v2') of the given image."""
    # `docker inspect --format={{.Id}}` prints the image's identifier;
    # v2-format images have IDs of the form "sha256:<hex>".
    cmd = popen_docker(['inspect', '--format={{.Id}}', image_hash],
                       stdout=subprocess.PIPE)
        image_id = next(cmd.stdout).strip()
        if image_id.startswith('sha256:'):
        elif ':' not in image_id:
        # Ensure the inspect subprocess actually succeeded.
        check_docker(cmd, "inspect")
def docker_image_compatible(api, image_hash):
    # Check whether the image's registry format is one the API server
    # advertises support for (discovery document key 'dockerImageFormats').
    supported = api._rootDesc.get('dockerImageFormats', [])
        # Server config is silent on formats: warn but do not block upload.
        logger.warn("server does not specify supported image formats (see docker_image_formats in server config). Continuing.")
    fmt = docker_image_format(image_hash)
        logger.error("image format is {!r} " \
                     "but server supports only {!r}".format(fmt, supported))
    # Yield a DockerImage tuple for each installed image.
    list_proc = popen_docker(['images', '--no-trunc'], stdout=subprocess.PIPE)
    list_output = iter(list_proc.stdout)
    next(list_output)  # Ignore the header line
    for line in list_output:
        # Columns after repo/tag/id are the created-time words followed by
        # the two-word virtual size (e.g. "271.8 MB"); split accordingly.
        size_index = len(words) - 2
        repo, tag, imageid = words[:3]
        ctime = ' '.join(words[3:size_index])
        vsize = ' '.join(words[size_index:])
        yield DockerImage(repo, tag, imageid, ctime, vsize)
    list_proc.stdout.close()
    # Surface a DockerError if `docker images` itself failed.
    check_docker(list_proc, "images")
def find_image_hashes(image_search, image_tag=None):
    # Given one argument, search for Docker images with matching hashes,
    # and return their full hashes in a set.
    # Given two arguments, also search for a Docker image with the
    # same repository and tag.  If one is found, return its hash in a
    # set; otherwise, fall back to the one-argument hash search.
    # Returns None if no match is found, or a hash search is ambiguous.
    #
    # Hash prefixes are matched case-insensitively.
    hash_search = image_search.lower()
    for image in docker_images():
        # Exact repo:tag match wins immediately.
        if (image.repo == image_search) and (image.tag == image_tag):
            return set([image.hash])
        elif image.hash.startswith(hash_search):
            hash_matches.add(image.hash)
def find_one_image_hash(image_search, image_tag=None):
    # Resolve image_search (and optional tag) to exactly one local image
    # hash, raising DockerError when zero or multiple images match.
    hashes = find_image_hashes(image_search, image_tag)
    hash_count = len(hashes)
    elif hash_count == 0:
        raise DockerError("no matching image found")
        raise DockerError("{} images match {}".format(hash_count, image_search))
def stat_cache_name(image_file):
    """Return the path of the stat-cache file paired with image_file.

    image_file may be an open file object (its .name attribute is used)
    or a plain path string; either way the cache path is the original
    path with '.stat' appended.
    """
    base_path = getattr(image_file, 'name', image_file)
    return base_path + '.stat'
def pull_image(image_name, image_tag):
    # Pull `image_name:image_tag` via the docker CLI, raising DockerError
    # (through check_docker) if the pull fails.
    check_docker(popen_docker(['pull', '{}:{}'.format(image_name, image_tag)]),
def save_image(image_hash, image_file):
    # Save the specified Docker image to image_file, then try to save its
    # stats so we can try to resume after interruption.
    check_docker(popen_docker(['save', image_hash], stdout=image_file),
        # Record the saved file's fstat alongside it so prep_image_file can
        # detect whether the cached tar is still valid.
        with open(stat_cache_name(image_file), 'w') as statfile:
            json.dump(tuple(os.fstat(image_file.fileno())), statfile)
    except STAT_CACHE_ERRORS:
        pass  # We won't resume from this cache.  No big deal.
def prep_image_file(filename):
    # Return a file object ready to save a Docker image,
    # and a boolean indicating whether or not we need to actually save the
    # image (False if a cached save is available).
    cache_dir = arv_cmd.make_home_conf_dir(
        os.path.join('.cache', 'arvados', 'docker'), 0o700)
    if cache_dir is None:
        # No usable cache directory: save into an anonymous temp file and
        # always re-save.
        image_file = tempfile.NamedTemporaryFile(suffix='.tar')
        file_path = os.path.join(cache_dir, filename)
            # Compare the stats recorded by save_image against the file on
            # disk; a mismatch in mtime or size invalidates the cache.
            with open(stat_cache_name(file_path)) as statfile:
                prev_stat = json.load(statfile)
            now_stat = os.stat(file_path)
            need_save = any(prev_stat[field] != now_stat[field]
                            for field in [ST_MTIME, ST_SIZE])
        except STAT_CACHE_ERRORS + (AttributeError, IndexError):
            need_save = True  # We couldn't compare against old stats
        # Truncate and rewrite when saving; otherwise reuse the cached tar.
        image_file = open(file_path, 'w+b' if need_save else 'rb')
    return image_file, need_save
def make_link(api_client, num_retries, link_class, link_name, **link_attrs):
    """Create an Arvados link and return the API server's response.

    `link_class` and `link_name` are written into `link_attrs` (overriding
    any values already present for those keys) before the create request
    is issued with up to `num_retries` retries.
    """
    link_attrs['link_class'] = link_class
    link_attrs['name'] = link_name
    create_request = api_client.links().create(body=link_attrs)
    return create_request.execute(num_retries=num_retries)
def docker_link_sort_key(link):
    """Build a sort key to find the latest available Docker image.

    To find one source collection for a Docker image referenced by
    name or image id, the API server looks for a link with the most
    recent `image_timestamp` property; then the most recent
    `created_at` timestamp.  This method generates a sort key for
    Docker metadata links to sort them from least to most preferred.
    """
        image_timestamp = ciso8601.parse_datetime_unaware(
            link['properties']['image_timestamp'])
    except (KeyError, ValueError):
        # Missing or unparseable image_timestamp: sort before every link
        # that carries a valid one.
        image_timestamp = EARLIEST_DATETIME
    return (image_timestamp,
            ciso8601.parse_datetime_unaware(link['created_at']))
def _get_docker_links(api_client, num_retries, **kwargs):
    # Fetch all matching links (kwargs are passed through as list filters)
    # and return them sorted most-preferred first, with the computed key
    # stashed on each link under '_sort_key'.
    links = arvados.util.list_all(api_client.links().list,
                                  num_retries, **kwargs)
        link['_sort_key'] = docker_link_sort_key(link)
    links.sort(key=itemgetter('_sort_key'), reverse=True)
def _new_image_listing(link, dockerhash, repo='<none>', tag='<none>'):
    # Build one listing dict for list_images_in_arv.  When the link had no
    # image_timestamp (sort key element 0 is the EARLIEST_DATETIME
    # sentinel), report the created_at element of the key instead.
    timestamp_index = 1 if (link['_sort_key'][0] is EARLIEST_DATETIME) else 0
        '_sort_key': link['_sort_key'],
        'timestamp': link['_sort_key'][timestamp_index],
        'collection': link['head_uuid'],
        'dockerhash': dockerhash,
def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None):
    """List all Docker images known to the api_client with image_name and
    image_tag.  If no image_name is given, defaults to listing all
    Docker images.

    Returns a list of tuples representing matching Docker images,
    sorted in preference order (i.e. the first collection in the list
    is the one that the API server would use).  Each tuple is a
    (collection_uuid, collection_info) pair, where collection_info is
    a dict with fields "dockerhash", "repo", "tag", and "timestamp".
    """
    # Find images with the name the user specified.
    search_links = _get_docker_links(
        api_client, num_retries,
        filters=[['link_class', '=', 'docker_image_repo+tag'],
                 '{}:{}'.format(image_name, image_tag or 'latest')]])
        repo_links = search_links
        # Fall back to finding images with the specified image hash.
        search_links = _get_docker_links(
            api_client, num_retries,
            filters=[['link_class', '=', 'docker_image_hash'],
                     ['name', 'ilike', image_name + '%']])
        hash_links = search_links
    # Only list information about images that were found in the search.
    search_filters.append(['head_uuid', 'in',
                           [link['head_uuid'] for link in search_links]])

    # It should be reasonable to expect that each collection only has one
    # image hash (though there may be many links specifying this).  Find
    # the API server's most preferred image hash link for each collection.
    if hash_links is None:
        hash_links = _get_docker_links(
            api_client, num_retries,
            filters=search_filters + [['link_class', '=', 'docker_image_hash']])
    # reversed() so that, on duplicate head_uuids, the most-preferred
    # (earliest in the sorted list) link wins the dict insertion.
    hash_link_map = {link['head_uuid']: link for link in reversed(hash_links)}

    # Each collection may have more than one name (though again, one name
    # may be specified more than once).  Build an image listing from name
    # tags, sorted by API server preference.
    if repo_links is None:
        repo_links = _get_docker_links(
            api_client, num_retries,
            filters=search_filters + [['link_class', '=',
                                       'docker_image_repo+tag']])
    seen_image_names = collections.defaultdict(set)
    for link in repo_links:
        collection_uuid = link['head_uuid']
        # Skip repo:tag names already recorded for this collection.
        if link['name'] in seen_image_names[collection_uuid]:
        seen_image_names[collection_uuid].add(link['name'])
            dockerhash = hash_link_map[collection_uuid]['name']
            dockerhash = '<unknown>'
        # Link names look like "repo:tag"; split into at most two parts.
        name_parts = link['name'].split(':', 1)
        images.append(_new_image_listing(link, dockerhash, *name_parts))

    # Find any image hash links that did not have a corresponding name link,
    # and add image listings for them, retaining the API server preference
    images_start_size = len(images)
    # NOTE(review): dict.iteritems() is Python 2 only.
    for collection_uuid, link in hash_link_map.iteritems():
        if not seen_image_names[collection_uuid]:
            images.append(_new_image_listing(link, link['name']))
    if len(images) > images_start_size:
        images.sort(key=itemgetter('_sort_key'), reverse=True)

    # Remove any image listings that refer to unknown collections.
    existing_coll_uuids = {coll['uuid'] for coll in arvados.util.list_all(
        api_client.collections().list, num_retries,
        filters=[['uuid', 'in', [im['collection'] for im in images]]],
    return [(image['collection'], image) for image in images
            if image['collection'] in existing_coll_uuids]
def items_owned_by(owner_uuid, arv_items):
    """Return an iterator over the members of arv_items whose
    'owner_uuid' field equals owner_uuid."""
    def _is_owned(arv_item):
        # One record matches when its owner field equals the target uuid.
        return arv_item['owner_uuid'] == owner_uuid
    return (arv_item for arv_item in arv_items if _is_owned(arv_item))
def _uuid2pdh(api, uuid):
    """Look up and return the portable data hash of the collection
    identified by uuid (first match from the collections list API)."""
    listing = api.collections().list(
        filters=[['uuid', '=', uuid]],
        select=['portable_data_hash'],
    ).execute()
    first_item = listing['items'][0]
    return first_item['portable_data_hash']
# Link class/name used to record that a v1-format image collection has a
# corresponding v2-format migration (tail_uuid = old PDH, head_uuid = new
# PDH; see the migration code below).
_migration_link_class = 'docker_image_migration'
_migration_link_name = 'migrate_1.9_1.10'
345 """Docker image format migration tool for Arvados.
347 This converts Docker images stored in Arvados from image format v1
348 (Docker <= 1.9) to image format v2 (Docker >= 1.10).
350 Requires Docker running on the local host.
354 1) Run arvados/docker/migrate-docker19/build.sh to create
355 arvados/migrate-docker19 Docker image.
357 2) Set ARVADOS_API_HOST and ARVADOS_API_TOKEN to the cluster you want to migrate.
359 3) Run arv-migrate-docker19 from the Arvados Python SDK on the host (not in a container).
361 This will query Arvados for v1 format Docker images. For each image that
362 does not already have a corresponding v2 format image (as indicated by a
363 docker_image_migration tag) it will perform the following process:
365 i) download the image from Arvados
366 ii) load it into Docker
367 iii) update the Docker version, which updates the image
368 iv) save the v2 format image and upload to Arvados
369 v) create a migration link
373 api_client = arvados.api()
375 user = api_client.users().current().execute()
376 if not user['is_admin']:
377 raise Exception("This command requires an admin token")
378 sys_uuid = user['uuid'][:12] + '000000000000000'
380 images = arvados.commands.keepdocker.list_images_in_arv(api_client, 3)
382 is_new = lambda img: img['dockerhash'].startswith('sha256:')
386 for uuid, img in images:
387 if img["dockerhash"].startswith("sha256:"):
389 key = (img["repo"], img["tag"], img["timestamp"])
390 old_images.append(img)
392 migration_links = arvados.util.list_all(api_client.links().list, filters=[
393 ['link_class', '=', _migration_link_class],
394 ['name', '=', _migration_link_name],
397 already_migrated = set()
398 for m in migration_links:
399 already_migrated.add(m["tail_uuid"])
401 items = arvados.util.list_all(api_client.collections().list,
402 filters=[["uuid", "in", [img["collection"] for img in old_images]]],
403 select=["uuid", "portable_data_hash"])
404 uuid_to_pdh = {i["uuid"]: i["portable_data_hash"] for i in items}
405 need_migrate = [img for img in old_images
406 if uuid_to_pdh[img["collection"]] not in already_migrated]
408 logger.info("Already migrated %i images", len(already_migrated))
409 logger.info("Need to migrate %i images", len(need_migrate))
413 for old_image in need_migrate:
414 if uuid_to_pdh[old_image["collection"]] in already_migrated:
417 logger.info("Migrating %s:%s (%s)", old_image["repo"], old_image["tag"], old_image["collection"])
419 oldcol = CollectionReader(old_image["collection"])
420 tarfile = oldcol.keys()[0]
423 varlibdocker = tempfile.mkdtemp()
424 with tempfile.NamedTemporaryFile() as envfile:
425 envfile.write("ARVADOS_API_HOST=%s\n" % (os.environ["ARVADOS_API_HOST"]))
426 envfile.write("ARVADOS_API_TOKEN=%s\n" % (os.environ["ARVADOS_API_TOKEN"]))
427 if "ARVADOS_API_HOST_INSECURE" in os.environ:
428 envfile.write("ARVADOS_API_HOST_INSECURE=%s\n" % (os.environ["ARVADOS_API_HOST_INSECURE"]))
431 dockercmd = ["docker", "run",
434 "--env-file", envfile.name,
435 "--volume", "%s:/var/lib/docker" % varlibdocker,
436 "arvados/migrate-docker19",
438 "%s/%s" % (old_image["collection"], tarfile),
442 oldcol.api_response()["owner_uuid"]]
444 out = subprocess.check_output(dockercmd)
446 migrated = re.search(r"Migrated uuid is ([a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15})", out)
448 newcol = CollectionReader(migrated.group(1))
450 api_client.links().create(body={"link": {
451 'owner_uuid': sys_uuid,
452 'link_class': arvados.commands.keepdocker._migration_link_class,
453 'name': arvados.commands.keepdocker._migration_link_name,
454 'tail_uuid': oldcol.portable_data_hash(),
455 'head_uuid': newcol.portable_data_hash()
456 }}).execute(num_retries=3)
458 logger.info("Migrated '%s' to '%s'", oldcol.portable_data_hash(), newcol.portable_data_hash())
459 already_migrated.add(oldcol.portable_data_hash())
460 success.append(old_image["collection"])
462 logger.error("Error migrating '%s'", old_image["collection"])
463 failures.append(old_image["collection"])
464 except Exception as e:
465 logger.exception("Migration failed")
466 failures.append(old_image["collection"])
468 shutil.rmtree(varlibdocker)
470 logger.info("Successfully migrated %i images", len(success))
472 logger.error("Failure migrating images: %s", failures)
def main(arguments=None, stdout=sys.stdout):
    """Command-line entry point: list Docker images stored in Arvados, or
    save a local Docker image and upload it to Keep."""
    args = arg_parser.parse_args(arguments)
    api = arvados.api('v1')

    # With no image argument (or the literal word 'images'), just print a
    # table of the Docker images already stored in Arvados.
    if args.image is None or args.image == 'images':
        fmt = "{:30} {:10} {:12} {:29} {:20}\n"
        stdout.write(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
        for i, j in list_images_in_arv(api, args.retries):
            stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))

    # Pull the image if requested, unless the image is specified as a hash
    # that we already have.
    if args.pull and not find_image_hashes(args.image):
        pull_image(args.image, args.tag)

        image_hash = find_one_image_hash(args.image, args.tag)
    except DockerError as error:
        # NOTE(review): BaseException.message is deprecated (removed in
        # Python 3); str(error) would be safer.
        logger.error(error.message)

    if not docker_image_compatible(api, image_hash):
        if args.force_image_format:
            # NOTE(review): logger.warn is a deprecated alias of
            # logger.warning.
            logger.warn("forcing incompatible image")
            logger.error("refusing to store " \
                         "incompatible format (use --force-image-format to override)")

    # Only build a repo:tag string when the user gave a repository name
    # rather than a hash prefix.
    image_repo_tag = '{}:{}'.format(args.image, args.tag) if not image_hash.startswith(args.image.lower()) else None

    # Default collection name describes the image; --name overrides it.
    if args.name is None:
            collection_name = 'Docker image {} {}'.format(image_repo_tag, image_hash[0:12])
            collection_name = 'Docker image {}'.format(image_hash[0:12])
        collection_name = args.name

        # Check if this image is already in Arvados.

        # Project where everything should be owned
        if args.project_uuid:
            parent_project_uuid = args.project_uuid
            parent_project_uuid = api.users().current().execute(
                num_retries=args.retries)['uuid']

        # Find image hash tags
        existing_links = _get_docker_links(
            filters=[['link_class', '=', 'docker_image_hash'],
                     ['name', '=', image_hash]])

        # get readable collections
        collections = api.collections().list(
            filters=[['uuid', 'in', [link['head_uuid'] for link in existing_links]]],
            select=["uuid", "owner_uuid", "name", "manifest_text"]
            ).execute(num_retries=args.retries)['items']

            # check for repo+tag links on these collections
                existing_repo_tag = _get_docker_links(
                    filters=[['link_class', '=', 'docker_image_repo+tag'],
                             ['name', '=', image_repo_tag],
                             ['head_uuid', 'in', collections]])
                existing_repo_tag = []

                # Reuse a collection already owned by the target project.
                coll_uuid = next(items_owned_by(parent_project_uuid, collections))['uuid']
            except StopIteration:
                # create new collection owned by the project
                coll_uuid = api.collections().create(
                    body={"manifest_text": collections[0]['manifest_text'],
                          "name": collection_name,
                          "owner_uuid": parent_project_uuid},
                    ensure_unique_name=True
                    ).execute(num_retries=args.retries)['uuid']

            link_base = {'owner_uuid': parent_project_uuid,
                         'head_uuid': coll_uuid,
                         'properties': existing_links[0]['properties']}

            if not any(items_owned_by(parent_project_uuid, existing_links)):
                # create image link owned by the project
                make_link(api, args.retries,
                          'docker_image_hash', image_hash, **link_base)

            if image_repo_tag and not any(items_owned_by(parent_project_uuid, existing_repo_tag)):
                # create repo+tag link owned by the project
                make_link(api, args.retries, 'docker_image_repo+tag',
                          image_repo_tag, **link_base)

            # Report the collection holding the (already-stored) image.
            stdout.write(coll_uuid + "\n")

    # Open a file for the saved image, and write it if needed.
    outfile_name = '{}.tar'.format(image_hash)
    image_file, need_save = prep_image_file(outfile_name)
        save_image(image_hash, image_file)

    # Call arv-put with switches we inherited from it
    # (a.k.a., switches that aren't our own).
    put_args = keepdocker_parser.parse_known_args(arguments)[1]

    if args.name is None:
        put_args += ['--name', collection_name]

    coll_uuid = arv_put.main(
        put_args + ['--filename', outfile_name, image_file.name], stdout=stdout).strip()

    # Read the image metadata and make Arvados links from it.
        image_tar = tarfile.open(fileobj=image_file)
    # v2 images ("sha256:<hex>") store metadata at <hex>.json; v1 images
    # store it at <hash>/json.
    image_hash_type, _, raw_image_hash = image_hash.rpartition(':')
        json_filename = raw_image_hash + '.json'
        json_filename = raw_image_hash + '/json'
    json_file = image_tar.extractfile(image_tar.getmember(json_filename))
    image_metadata = json.load(json_file)

    link_base = {'head_uuid': coll_uuid, 'properties': {}}
    if 'created' in image_metadata:
        # Expose the image's creation time so docker_link_sort_key can
        # prefer it later.
        link_base['properties']['image_timestamp'] = image_metadata['created']
    if args.project_uuid is not None:
        link_base['owner_uuid'] = args.project_uuid

    make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
        make_link(api, args.retries,
                  'docker_image_repo+tag', image_repo_tag, **link_base)

        # Best-effort cleanup of the cached tar and its stat file; a
        # missing file (ENOENT) is not an error.
        for filename in [stat_cache_name(image_file), image_file.name]:
            except OSError as error:
                if error.errno != errno.ENOENT:

if __name__ == '__main__':