17 from operator import itemgetter
22 import arvados.commands._util as arv_cmd
23 import arvados.commands.put as arv_put
24 from arvados.collection import CollectionReader
29 from arvados._version import __version__
31 logger = logging.getLogger('arvados.keepdocker')
32 logger.setLevel(logging.DEBUG if arvados.config.get('ARVADOS_DEBUG')
# Sentinel "oldest possible" timestamp, used as the sort fallback when a
# Docker image link has no parseable image_timestamp property.
35 EARLIEST_DATETIME = datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)
# Exceptions that can occur while reading or writing the local image stat
# cache; callers treat all of them as "no usable cache", never as fatal.
36 STAT_CACHE_ERRORS = (IOError, OSError, ValueError)
# One locally installed Docker image, one per row of `docker images` output:
# repository name, tag, full image hash, creation time string, virtual size.
38 DockerImage = collections.namedtuple(
39 'DockerImage', ['repo', 'tag', 'hash', 'created', 'vsize'])
41 keepdocker_parser = argparse.ArgumentParser(add_help=False)
42 keepdocker_parser.add_argument(
43 '--version', action='version', version="%s %s" % (sys.argv[0], __version__),
44 help='Print version and exit.')
45 keepdocker_parser.add_argument(
46 '-f', '--force', action='store_true', default=False,
47 help="Re-upload the image even if it already exists on the server")
48 keepdocker_parser.add_argument(
49 '--force-image-format', action='store_true', default=False,
50 help="Proceed even if the image format is not supported by the server")
52 _group = keepdocker_parser.add_mutually_exclusive_group()
54 '--pull', action='store_true', default=False,
55 help="Try to pull the latest image from Docker registry")
57 '--no-pull', action='store_false', dest='pull',
58 help="Use locally installed image only, don't pull image from Docker registry (default)")
60 keepdocker_parser.add_argument(
62 help="Docker image to upload, as a repository name or hash")
63 keepdocker_parser.add_argument(
64 'tag', nargs='?', default='latest',
65 help="Tag of the Docker image to upload (default 'latest')")
67 # Combine keepdocker options listed above with run_opts options of arv-put.
68 # The options inherited from arv-put include --name, --project-uuid,
69 # --progress/--no-progress/--batch-progress and --resume/--no-resume.
70 arg_parser = argparse.ArgumentParser(
71 description="Upload or list Docker images in Arvados",
72 parents=[keepdocker_parser, arv_put.run_opts, arv_cmd.retry_opt])
74 class DockerError(Exception):
# Start `docker <cmd>` as a subprocess and return the Popen object.
# NOTE(review): this excerpt is garbled — the `try:` line (orig. 82), the
# `if manage_stdin:` guard, and the `return docker_proc` line appear to be
# missing from this view; confirm against the full source.
78 def popen_docker(cmd, *args, **kwargs):
# Remember whether the caller supplied its own stdin; when it did not, we
# open a pipe and close it below so docker never blocks waiting for input.
79 manage_stdin = ('stdin' not in kwargs)
80 kwargs.setdefault('stdin', subprocess.PIPE)
# By default route the subprocess's stdout to our stderr stream.
81 kwargs.setdefault('stdout', sys.stderr)
# Prefer the Debian-style 'docker.io' binary name, falling back to 'docker'.
83 docker_proc = subprocess.Popen(['docker.io'] + cmd, *args, **kwargs)
84 except OSError:  # No docker.io in $PATH
85 docker_proc = subprocess.Popen(['docker'] + cmd, *args, **kwargs)
87 docker_proc.stdin.close()
def check_docker(proc, description):
    """Wait for a docker subprocess to finish and check its exit status.

    Arguments:
    * proc: a Popen object for an already-started docker invocation
      (as returned by popen_docker).
    * description: short name of the docker subcommand (e.g. "pull",
      "images"), used only in the error message.

    Raises DockerError if the process exits with a nonzero status code.
    """
    # Reap the child before inspecting it: Popen.returncode is None until
    # the process has actually been waited on, so checking it without a
    # wait() can silently miss failures.
    proc.wait()
    if proc.returncode != 0:
        raise DockerError("docker {} returned status code {}".
                          format(description, proc.returncode))
96 def docker_image_format(image_hash):
97 """Return the registry format ('v1' or 'v2') of the given image."""
98 cmd = popen_docker(['inspect', '--format={{.Id}}', image_hash],
99 stdout=subprocess.PIPE)
101 image_id = next(cmd.stdout).strip()
102 if image_id.startswith('sha256:'):
104 elif ':' not in image_id:
109 check_docker(cmd, "inspect")
111 def docker_image_compatible(api, image_hash):
112 supported = api._rootDesc.get('dockerImageFormats', [])
114 logger.warn("server does not specify supported image formats (see docker_image_formats in server config). Continuing.")
117 fmt = docker_image_format(image_hash)
121 logger.error("image format is {!r} " \
122 "but server supports only {!r}".format(fmt, supported))
126 # Yield a DockerImage tuple for each installed image.
127 list_proc = popen_docker(['images', '--no-trunc'], stdout=subprocess.PIPE)
128 list_output = iter(list_proc.stdout)
129 next(list_output) # Ignore the header line
130 for line in list_output:
132 size_index = len(words) - 2
133 repo, tag, imageid = words[:3]
134 ctime = ' '.join(words[3:size_index])
135 vsize = ' '.join(words[size_index:])
136 yield DockerImage(repo, tag, imageid, ctime, vsize)
137 list_proc.stdout.close()
138 check_docker(list_proc, "images")
140 def find_image_hashes(image_search, image_tag=None):
141 # Given one argument, search for Docker images with matching hashes,
142 # and return their full hashes in a set.
143 # Given two arguments, also search for a Docker image with the
144 # same repository and tag. If one is found, return its hash in a
145 # set; otherwise, fall back to the one-argument hash search.
146 # Returns None if no match is found, or a hash search is ambiguous.
# Hash prefixes are compared case-insensitively.
147 hash_search = image_search.lower()
# NOTE(review): garbled excerpt — the accumulator initializer (presumably
# `hash_matches = set()`, orig. 148) and the final return of the collected
# matches (orig. ~154) are missing from this view; confirm against the
# full source.
149 for image in docker_images():
# An exact repo + tag match wins immediately and unambiguously.
150 if (image.repo == image_search) and (image.tag == image_tag):
151 return set([image.hash])
152 elif image.hash.startswith(hash_search):
153 hash_matches.add(image.hash)
# Resolve an image search string (and optional tag) to exactly one full
# image hash, raising DockerError when zero or multiple images match.
156 def find_one_image_hash(image_search, image_tag=None):
157 hashes = find_image_hashes(image_search, image_tag)
158 hash_count = len(hashes)
# NOTE(review): garbled excerpt — the single-match success branch
# (presumably `if hash_count == 1: return hashes.pop()`, orig. 159-160)
# and the `else:` before the ambiguity error (orig. 163) are missing from
# this view; confirm against the full source.
161 elif hash_count == 0:
162 raise DockerError("no matching image found")
# More than one hash matched: the search was ambiguous.
164 raise DockerError("{} images match {}".format(hash_count, image_search))
def stat_cache_name(image_file):
    """Return the path of the stat-cache file paired with an image file.

    image_file may be an open file object (its .name attribute is used)
    or a plain path string; either way the cache file lives alongside it
    with a '.stat' suffix appended.
    """
    image_path = getattr(image_file, 'name', image_file)
    return image_path + '.stat'
def pull_image(image_name, image_tag):
    """Pull the given image:tag from the Docker registry.

    Raises DockerError (via check_docker) if `docker pull` exits nonzero.
    """
    # The visible excerpt truncated this call mid-expression; restore the
    # check_docker description argument and closing parenthesis.
    check_docker(popen_docker(['pull', '{}:{}'.format(image_name, image_tag)]),
                 "pull")
173 def save_image(image_hash, image_file):
174 # Save the specified Docker image to image_file, then try to save its
175 # stats so we can try to resume after interruption.
176 check_docker(popen_docker(['save', image_hash], stdout=image_file),
180 with open(stat_cache_name(image_file), 'w') as statfile:
181 json.dump(tuple(os.fstat(image_file.fileno())), statfile)
182 except STAT_CACHE_ERRORS:
183 pass # We won't resume from this cache. No big deal.
185 def prep_image_file(filename):
186 # Return a file object ready to save a Docker image,
187 # and a boolean indicating whether or not we need to actually save the
188 # image (False if a cached save is available).
189 cache_dir = arv_cmd.make_home_conf_dir(
190 os.path.join('.cache', 'arvados', 'docker'), 0o700)
191 if cache_dir is None:
192 image_file = tempfile.NamedTemporaryFile(suffix='.tar')
195 file_path = os.path.join(cache_dir, filename)
197 with open(stat_cache_name(file_path)) as statfile:
198 prev_stat = json.load(statfile)
199 now_stat = os.stat(file_path)
200 need_save = any(prev_stat[field] != now_stat[field]
201 for field in [ST_MTIME, ST_SIZE])
202 except STAT_CACHE_ERRORS + (AttributeError, IndexError):
203 need_save = True # We couldn't compare against old stats
204 image_file = open(file_path, 'w+b' if need_save else 'rb')
205 return image_file, need_save
def make_link(api_client, num_retries, link_class, link_name, **link_attrs):
    """Create a new link object on the API server and return its record.

    Any extra keyword arguments are passed through as additional link
    attributes (e.g. head_uuid, owner_uuid, properties).
    """
    link_attrs['link_class'] = link_class
    link_attrs['name'] = link_name
    create_request = api_client.links().create(body=link_attrs)
    return create_request.execute(num_retries=num_retries)
212 def docker_link_sort_key(link):
213 """Build a sort key to find the latest available Docker image.
215 To find one source collection for a Docker image referenced by
216 name or image id, the API server looks for a link with the most
217 recent `image_timestamp` property; then the most recent
218 `created_at` timestamp. This method generates a sort key for
219 Docker metadata links to sort them from least to most preferred.
# NOTE(review): garbled excerpt — the closing docstring quotes (orig. 220)
# and the `try:` line (orig. 221) are missing from this view; confirm
# against the full source.
# Fall back to EARLIEST_DATETIME when the link has no image_timestamp
# property or its value cannot be parsed as a datetime.
222 image_timestamp = ciso8601.parse_datetime_unaware(
223 link['properties']['image_timestamp'])
224 except (KeyError, ValueError):
225 image_timestamp = EARLIEST_DATETIME
# Secondary key: the link's own creation time.
226 return (image_timestamp,
227 ciso8601.parse_datetime_unaware(link['created_at']))
229 def _get_docker_links(api_client, num_retries, **kwargs):
230 links = arvados.util.list_all(api_client.links().list,
231 num_retries, **kwargs)
233 link['_sort_key'] = docker_link_sort_key(link)
234 links.sort(key=itemgetter('_sort_key'), reverse=True)
237 def _new_image_listing(link, dockerhash, repo='<none>', tag='<none>'):
238 timestamp_index = 1 if (link['_sort_key'][0] is EARLIEST_DATETIME) else 0
240 '_sort_key': link['_sort_key'],
241 'timestamp': link['_sort_key'][timestamp_index],
242 'collection': link['head_uuid'],
243 'dockerhash': dockerhash,
248 def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None):
249 """List all Docker images known to the api_client with image_name and
250 image_tag. If no image_name is given, defaults to listing all
253 Returns a list of tuples representing matching Docker images,
254 sorted in preference order (i.e. the first collection in the list
255 is the one that the API server would use). Each tuple is a
256 (collection_uuid, collection_info) pair, where collection_info is
257 a dict with fields "dockerhash", "repo", "tag", and "timestamp".
264 # Find images with the name the user specified.
265 search_links = _get_docker_links(
266 api_client, num_retries,
267 filters=[['link_class', '=', 'docker_image_repo+tag'],
269 '{}:{}'.format(image_name, image_tag or 'latest')]])
271 repo_links = search_links
273 # Fall back to finding images with the specified image hash.
274 search_links = _get_docker_links(
275 api_client, num_retries,
276 filters=[['link_class', '=', 'docker_image_hash'],
277 ['name', 'ilike', image_name + '%']])
278 hash_links = search_links
279 # Only list information about images that were found in the search.
280 search_filters.append(['head_uuid', 'in',
281 [link['head_uuid'] for link in search_links]])
283 # It should be reasonable to expect that each collection only has one
284 # image hash (though there may be many links specifying this). Find
285 # the API server's most preferred image hash link for each collection.
286 if hash_links is None:
287 hash_links = _get_docker_links(
288 api_client, num_retries,
289 filters=search_filters + [['link_class', '=', 'docker_image_hash']])
290 hash_link_map = {link['head_uuid']: link for link in reversed(hash_links)}
292 # Each collection may have more than one name (though again, one name
293 # may be specified more than once). Build an image listing from name
294 # tags, sorted by API server preference.
295 if repo_links is None:
296 repo_links = _get_docker_links(
297 api_client, num_retries,
298 filters=search_filters + [['link_class', '=',
299 'docker_image_repo+tag']])
300 seen_image_names = collections.defaultdict(set)
302 for link in repo_links:
303 collection_uuid = link['head_uuid']
304 if link['name'] in seen_image_names[collection_uuid]:
306 seen_image_names[collection_uuid].add(link['name'])
308 dockerhash = hash_link_map[collection_uuid]['name']
310 dockerhash = '<unknown>'
311 name_parts = link['name'].split(':', 1)
312 images.append(_new_image_listing(link, dockerhash, *name_parts))
314 # Find any image hash links that did not have a corresponding name link,
315 # and add image listings for them, retaining the API server preference
317 images_start_size = len(images)
318 for collection_uuid, link in hash_link_map.iteritems():
319 if not seen_image_names[collection_uuid]:
320 images.append(_new_image_listing(link, link['name']))
321 if len(images) > images_start_size:
322 images.sort(key=itemgetter('_sort_key'), reverse=True)
324 # Remove any image listings that refer to unknown collections.
325 existing_coll_uuids = {coll['uuid'] for coll in arvados.util.list_all(
326 api_client.collections().list, num_retries,
327 filters=[['uuid', 'in', [im['collection'] for im in images]]],
329 return [(image['collection'], image) for image in images
330 if image['collection'] in existing_coll_uuids]
def items_owned_by(owner_uuid, arv_items):
    """Lazily filter arv_items down to those owned by owner_uuid.

    Returns an iterator over the items (dicts) whose 'owner_uuid' field
    equals the given owner_uuid.
    """
    for item in arv_items:
        if item['owner_uuid'] == owner_uuid:
            yield item
def _uuid2pdh(api, uuid):
    """Look up the portable data hash of the collection with this UUID."""
    response = api.collections().list(
        filters=[['uuid', '=', uuid]],
        select=['portable_data_hash'],
    ).execute()
    return response['items'][0]['portable_data_hash']
# Link class and name used to tag collections whose Docker images have
# already been migrated from image format v1 (Docker <= 1.9) to v2
# (Docker >= 1.10); the migration tool in this module both queries and
# creates links with these values.
341 _migration_link_class = 'docker_image_migration'
342 _migration_link_name = 'migrate_1.9_1.10'
345 """Docker image format migration tool for Arvados.
347 This converts Docker images stored in Arvados from image format v1
348 (Docker <= 1.9) to image format v2 (Docker >= 1.10).
350 Requires Docker running on the local host.
354 1) Run arvados/docker/migrate-docker19/build.sh to create
355 arvados/migrate-docker19 Docker image.
357 2) Set ARVADOS_API_HOST and ARVADOS_API_TOKEN to the cluster you want to migrate.
359 3) Run arv-migrate-docker19 from the Arvados Python SDK on the host (not in a container).
361 This will query Arvados for v1 format Docker images. For each image that
362 does not already have a corresponding v2 format image (as indicated by a
363 docker_image_migration tag) it will perform the following process:
365 i) download the image from Arvados
366 ii) load it into Docker
367 iii) update the Docker version, which updates the image
368 iv) save the v2 format image and upload to Arvados
369 v) create a migration link
373 api_client = arvados.api()
375 images = arvados.commands.keepdocker.list_images_in_arv(api_client, 3)
377 is_new = lambda img: img['dockerhash'].startswith('sha256:')
381 for uuid, img in images:
382 if img["dockerhash"].startswith("sha256:"):
384 key = (img["repo"], img["tag"], img["timestamp"])
385 old_images.append(img)
387 migration_links = arvados.util.list_all(api_client.links().list, filters=[
388 ['link_class', '=', _migration_link_class],
389 ['name', '=', _migration_link_name],
392 already_migrated = set()
393 for m in migration_links:
394 already_migrated.add(m["tail_uuid"])
396 items = arvados.util.list_all(api_client.collections().list,
397 filters=[["uuid", "in", [img["collection"] for img in old_images]]],
398 select=["uuid", "portable_data_hash"])
399 uuid_to_pdh = {i["uuid"]: i["portable_data_hash"] for i in items}
400 need_migrate = [img for img in old_images
401 if uuid_to_pdh[img["collection"]] not in already_migrated]
403 logger.info("Already migrated %i images", len(already_migrated))
404 logger.info("Need to migrate %i images", len(need_migrate))
408 for old_image in need_migrate:
409 if uuid_to_pdh[old_image["collection"]] in already_migrated:
412 logger.info("Migrating %s:%s (%s)", old_image["repo"], old_image["tag"], old_image["collection"])
414 oldcol = CollectionReader(old_image["collection"])
415 tarfile = oldcol.keys()[0]
418 varlibdocker = tempfile.mkdtemp()
419 with tempfile.NamedTemporaryFile() as envfile:
420 envfile.write("ARVADOS_API_HOST=%s\n" % (os.environ["ARVADOS_API_HOST"]))
421 envfile.write("ARVADOS_API_TOKEN=%s\n" % (os.environ["ARVADOS_API_TOKEN"]))
422 if "ARVADOS_API_HOST_INSECURE" in os.environ:
423 envfile.write("ARVADOS_API_HOST_INSECURE=%s\n" % (os.environ["ARVADOS_API_HOST_INSECURE"]))
426 dockercmd = ["docker", "run",
429 "--env-file", envfile.name,
430 "--volume", "%s:/var/lib/docker" % varlibdocker,
431 "arvados/migrate-docker19",
433 "%s/%s" % (old_image["collection"], tarfile),
437 oldcol.api_response()["owner_uuid"]]
439 out = subprocess.check_output(dockercmd)
441 migrated = re.search(r"Migrated uuid is ([a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15})", out)
443 newcol = CollectionReader(migrated.group(1))
445 api_client.links().create(body={"link": {
446 'owner_uuid': oldcol.api_response()["owner_uuid"],
447 'link_class': arvados.commands.keepdocker._migration_link_class,
448 'name': arvados.commands.keepdocker._migration_link_name,
449 'tail_uuid': oldcol.portable_data_hash(),
450 'head_uuid': newcol.portable_data_hash()
451 }}).execute(num_retries=3)
453 logger.info("Migrated '%s' to '%s'", oldcol.portable_data_hash(), newcol.portable_data_hash())
454 already_migrated.add(oldcol.portable_data_hash())
455 success.append(old_image["collection"])
457 logger.error("Error migrating '%s'", old_image["collection"])
458 failures.append(old_image["collection"])
459 except Exception as e:
460 logger.exception("Migration failed")
461 failures.append(old_image["collection"])
463 shutil.rmtree(varlibdocker)
465 logger.info("Successfully migrated %i images", len(success))
467 logger.error("Failure migrating images: %s", failures)
470 def main(arguments=None, stdout=sys.stdout):
471 args = arg_parser.parse_args(arguments)
472 api = arvados.api('v1')
474 if args.image is None or args.image == 'images':
475 fmt = "{:30} {:10} {:12} {:29} {:20}\n"
476 stdout.write(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
477 for i, j in list_images_in_arv(api, args.retries):
478 stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
481 # Pull the image if requested, unless the image is specified as a hash
482 # that we already have.
483 if args.pull and not find_image_hashes(args.image):
484 pull_image(args.image, args.tag)
487 image_hash = find_one_image_hash(args.image, args.tag)
488 except DockerError as error:
489 logger.error(error.message)
492 if not docker_image_compatible(api, image_hash):
493 if args.force_image_format:
494 logger.warn("forcing incompatible image")
496 logger.error("refusing to store " \
497 "incompatible format (use --force-image-format to override)")
500 image_repo_tag = '{}:{}'.format(args.image, args.tag) if not image_hash.startswith(args.image.lower()) else None
502 if args.name is None:
504 collection_name = 'Docker image {} {}'.format(image_repo_tag, image_hash[0:12])
506 collection_name = 'Docker image {}'.format(image_hash[0:12])
508 collection_name = args.name
511 # Check if this image is already in Arvados.
513 # Project where everything should be owned
514 if args.project_uuid:
515 parent_project_uuid = args.project_uuid
517 parent_project_uuid = api.users().current().execute(
518 num_retries=args.retries)['uuid']
520 # Find image hash tags
521 existing_links = _get_docker_links(
523 filters=[['link_class', '=', 'docker_image_hash'],
524 ['name', '=', image_hash]])
526 # get readable collections
527 collections = api.collections().list(
528 filters=[['uuid', 'in', [link['head_uuid'] for link in existing_links]]],
529 select=["uuid", "owner_uuid", "name", "manifest_text"]
530 ).execute(num_retries=args.retries)['items']
533 # check for repo+tag links on these collections
535 existing_repo_tag = _get_docker_links(
537 filters=[['link_class', '=', 'docker_image_repo+tag'],
538 ['name', '=', image_repo_tag],
539 ['head_uuid', 'in', collections]])
541 existing_repo_tag = []
544 coll_uuid = next(items_owned_by(parent_project_uuid, collections))['uuid']
545 except StopIteration:
546 # create new collection owned by the project
547 coll_uuid = api.collections().create(
548 body={"manifest_text": collections[0]['manifest_text'],
549 "name": collection_name,
550 "owner_uuid": parent_project_uuid},
551 ensure_unique_name=True
552 ).execute(num_retries=args.retries)['uuid']
554 link_base = {'owner_uuid': parent_project_uuid,
555 'head_uuid': coll_uuid,
556 'properties': existing_links[0]['properties']}
558 if not any(items_owned_by(parent_project_uuid, existing_links)):
559 # create image link owned by the project
560 make_link(api, args.retries,
561 'docker_image_hash', image_hash, **link_base)
563 if image_repo_tag and not any(items_owned_by(parent_project_uuid, existing_repo_tag)):
564 # create repo+tag link owned by the project
565 make_link(api, args.retries, 'docker_image_repo+tag',
566 image_repo_tag, **link_base)
568 stdout.write(coll_uuid + "\n")
572 # Open a file for the saved image, and write it if needed.
573 outfile_name = '{}.tar'.format(image_hash)
574 image_file, need_save = prep_image_file(outfile_name)
576 save_image(image_hash, image_file)
578 # Call arv-put with switches we inherited from it
579 # (a.k.a., switches that aren't our own).
580 put_args = keepdocker_parser.parse_known_args(arguments)[1]
582 if args.name is None:
583 put_args += ['--name', collection_name]
585 coll_uuid = arv_put.main(
586 put_args + ['--filename', outfile_name, image_file.name], stdout=stdout).strip()
588 # Read the image metadata and make Arvados links from it.
590 image_tar = tarfile.open(fileobj=image_file)
591 image_hash_type, _, raw_image_hash = image_hash.rpartition(':')
593 json_filename = raw_image_hash + '.json'
595 json_filename = raw_image_hash + '/json'
596 json_file = image_tar.extractfile(image_tar.getmember(json_filename))
597 image_metadata = json.load(json_file)
600 link_base = {'head_uuid': coll_uuid, 'properties': {}}
601 if 'created' in image_metadata:
602 link_base['properties']['image_timestamp'] = image_metadata['created']
603 if args.project_uuid is not None:
604 link_base['owner_uuid'] = args.project_uuid
606 make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
608 make_link(api, args.retries,
609 'docker_image_repo+tag', image_repo_tag, **link_base)
613 for filename in [stat_cache_name(image_file), image_file.name]:
616 except OSError as error:
617 if error.errno != errno.ENOENT:
620 if __name__ == '__main__':