import argparse
import collections
import datetime
import errno
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import tarfile
import tempfile
from operator import itemgetter
from stat import ST_MTIME, ST_SIZE

import ciso8601

import arvados
import arvados.util
import arvados.commands._util as arv_cmd
import arvados.commands.put as arv_put
from arvados.collection import CollectionReader

from arvados._version import __version__
# Module-level logger.  Verbosity is driven by the ARVADOS_DEBUG config
# setting: DEBUG when it is set, INFO otherwise.
logger = logging.getLogger('arvados.keepdocker')
logger.setLevel(logging.DEBUG if arvados.config.get('ARVADOS_DEBUG')
                else logging.INFO)
# Sentinel "oldest possible" timestamp, used when a Docker image link has no
# parseable image_timestamp property (see docker_link_sort_key).
EARLIEST_DATETIME = datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)
# Errors that can occur while reading or writing the image stat cache; all of
# them are treated as "no usable cache" by save_image / prep_image_file.
STAT_CACHE_ERRORS = (IOError, OSError, ValueError)

# One parsed row of `docker images` output (see docker_images()).
DockerImage = collections.namedtuple(
    'DockerImage', ['repo', 'tag', 'hash', 'created', 'vsize'])
# Command-line options specific to arv-keepdocker.  add_help=False so this
# parser can be used as a parent of arg_parser below without a duplicate -h.
keepdocker_parser = argparse.ArgumentParser(add_help=False)
keepdocker_parser.add_argument(
    '--version', action='version', version="%s %s" % (sys.argv[0], __version__),
    help='Print version and exit.')
keepdocker_parser.add_argument(
    '-f', '--force', action='store_true', default=False,
    help="Re-upload the image even if it already exists on the server")
keepdocker_parser.add_argument(
    '--force-image-format', action='store_true', default=False,
    help="Proceed even if the image format is not supported by the server")

# --pull and --no-pull are mutually exclusive; both write to args.pull.
_group = keepdocker_parser.add_mutually_exclusive_group()
_group.add_argument(
    '--pull', action='store_true', default=False,
    help="Try to pull the latest image from Docker registry")
_group.add_argument(
    '--no-pull', action='store_false', dest='pull',
    help="Use locally installed image only, don't pull image from Docker registry (default)")

keepdocker_parser.add_argument(
    'image', nargs='?',
    help="Docker image to upload, as a repository name or hash")
keepdocker_parser.add_argument(
    'tag', nargs='?', default='latest',
    help="Tag of the Docker image to upload (default 'latest')")

# Combine keepdocker options listed above with run_opts options of arv-put.
# The options inherited from arv-put include --name, --project-uuid,
# --progress/--no-progress/--batch-progress and --resume/--no-resume.
arg_parser = argparse.ArgumentParser(
    description="Upload or list Docker images in Arvados",
    parents=[keepdocker_parser, arv_put.run_opts, arv_cmd.retry_opt])
74 class DockerError(Exception):
def popen_docker(cmd, *args, **kwargs):
    """Start a docker CLI subcommand and return its Popen object.

    Tries the `docker.io` binary first (Debian's historical name), falling
    back to `docker`.  Unless the caller supplies its own stdin, the child's
    stdin pipe is closed immediately so the subcommand cannot block on it.
    """
    manage_stdin = ('stdin' not in kwargs)
    kwargs.setdefault('stdin', subprocess.PIPE)
    kwargs.setdefault('stdout', sys.stderr)
    try:
        docker_proc = subprocess.Popen(['docker.io'] + cmd, *args, **kwargs)
    except OSError:  # No docker.io in $PATH
        docker_proc = subprocess.Popen(['docker'] + cmd, *args, **kwargs)
    if manage_stdin:
        docker_proc.stdin.close()
    return docker_proc
def check_docker(proc, description):
    """Wait for a docker subprocess and raise DockerError if it failed.

    proc.returncode is only meaningful after wait(), so wait first.
    """
    proc.wait()
    if proc.returncode != 0:
        raise DockerError("docker {} returned status code {}".
                          format(description, proc.returncode))
def docker_image_format(image_hash):
    """Return the registry format ('v1' or 'v2') of the given image."""
    cmd = popen_docker(['inspect', '--format={{.Id}}', image_hash],
                       stdout=subprocess.PIPE)
    try:
        image_id = next(cmd.stdout).strip()
        # v2-format images report an id like "sha256:<hex>"; v1 ids are a
        # bare hex string with no algorithm prefix.
        if image_id.startswith('sha256:'):
            return 'v2'
        elif ':' not in image_id:
            return 'v1'
        else:
            return 'unknown'
    finally:
        # Always reap the child and surface a non-zero exit as DockerError.
        check_docker(cmd, "inspect")
def docker_image_compatible(api, image_hash):
    """Return True if the API server accepts this image's registry format."""
    supported = api._rootDesc.get('dockerImageFormats', [])
    if not supported:
        # Old servers don't advertise formats at all; assume compatibility.
        logger.warn("server does not specify supported image formats (see docker_image_formats in server config). Continuing.")
        return True

    fmt = docker_image_format(image_hash)
    if fmt in supported:
        return True
    else:
        logger.error("image format is {!r} " \
                     "but server supports only {!r}".format(fmt, supported))
        return False
def docker_images():
    """Yield a DockerImage tuple for each installed image."""
    list_proc = popen_docker(['images', '--no-trunc'], stdout=subprocess.PIPE)
    list_output = iter(list_proc.stdout)
    next(list_output)  # Ignore the header line
    for line in list_output:
        words = line.split()
        # CREATED is free text ("2 weeks ago") and SIZE is "NN.N MB", so
        # everything between the first three columns and the last two
        # words is the creation time.
        size_index = len(words) - 2
        repo, tag, imageid = words[:3]
        ctime = ' '.join(words[3:size_index])
        vsize = ' '.join(words[size_index:])
        yield DockerImage(repo, tag, imageid, ctime, vsize)
    list_proc.stdout.close()
    check_docker(list_proc, "images")
def find_image_hashes(image_search, image_tag=None):
    # Given one argument, search for Docker images with matching hashes,
    # and return their full hashes in a set.
    # Given two arguments, also search for a Docker image with the
    # same repository and tag.  If one is found, return its hash in a
    # set; otherwise, fall back to the one-argument hash search.
    # Returns an empty set if no match is found; a multi-element set
    # means the hash search was ambiguous.
    hash_search = image_search.lower()
    hash_matches = set()
    for image in docker_images():
        if (image.repo == image_search) and (image.tag == image_tag):
            return set([image.hash])
        elif image.hash.startswith(hash_search):
            hash_matches.add(image.hash)
    return hash_matches
def find_one_image_hash(image_search, image_tag=None):
    """Return the single image hash matching the search, or raise DockerError.

    Raises DockerError when no local image matches, or when the search is
    ambiguous (more than one hash matches).
    """
    hashes = find_image_hashes(image_search, image_tag)
    hash_count = len(hashes)
    if hash_count == 1:
        return hashes.pop()
    elif hash_count == 0:
        raise DockerError("no matching image found")
    else:
        raise DockerError("{} images match {}".format(hash_count, image_search))
def stat_cache_name(image_file):
    """Return the path of the stat-cache file for the given image file.

    image_file may be an open file object (its .name is used) or a plain
    path string.
    """
    path = getattr(image_file, 'name', image_file)
    return '{}.stat'.format(path)
def pull_image(image_name, image_tag):
    """`docker pull image_name:image_tag`, raising DockerError on failure."""
    check_docker(popen_docker(['pull', '{}:{}'.format(image_name, image_tag)]),
                 "pull")
def save_image(image_hash, image_file):
    # Save the specified Docker image to image_file, then try to save its
    # stats so we can try to resume after interruption.
    check_docker(popen_docker(['save', image_hash], stdout=image_file),
                 "save")
    image_file.flush()
    try:
        with open(stat_cache_name(image_file), 'w') as statfile:
            json.dump(tuple(os.fstat(image_file.fileno())), statfile)
    except STAT_CACHE_ERRORS:
        pass  # We won't resume from this cache.  No big deal.
def prep_image_file(filename):
    # Return a file object ready to save a Docker image,
    # and a boolean indicating whether or not we need to actually save the
    # image (False if a cached save is available).
    cache_dir = arv_cmd.make_home_conf_dir(
        os.path.join('.cache', 'arvados', 'docker'), 0o700)
    if cache_dir is None:
        # No usable cache directory: anonymous temp file, always save.
        image_file = tempfile.NamedTemporaryFile(suffix='.tar')
        need_save = True
    else:
        file_path = os.path.join(cache_dir, filename)
        try:
            with open(stat_cache_name(file_path)) as statfile:
                prev_stat = json.load(statfile)
            now_stat = os.stat(file_path)
            # Re-save if the cached tar's mtime or size changed since the
            # stats were recorded.
            need_save = any(prev_stat[field] != now_stat[field]
                            for field in [ST_MTIME, ST_SIZE])
        except STAT_CACHE_ERRORS + (AttributeError, IndexError):
            need_save = True  # We couldn't compare against old stats
        image_file = open(file_path, 'w+b' if need_save else 'rb')
    return image_file, need_save
def make_link(api_client, num_retries, link_class, link_name, **link_attrs):
    """Create a link object on the API server and return its response.

    Any extra keyword arguments become fields of the link body; link_class
    and link_name overwrite 'link_class'/'name' keys supplied that way.
    """
    link_attrs['link_class'] = link_class
    link_attrs['name'] = link_name
    request = api_client.links().create(body=link_attrs)
    return request.execute(num_retries=num_retries)
def docker_link_sort_key(link):
    """Build a sort key to find the latest available Docker image.

    To find one source collection for a Docker image referenced by
    name or image id, the API server looks for a link with the most
    recent `image_timestamp` property; then the most recent
    `created_at` timestamp.  This method generates a sort key for
    Docker metadata links to sort them from least to most preferred.
    """
    try:
        image_timestamp = ciso8601.parse_datetime_unaware(
            link['properties']['image_timestamp'])
    except (KeyError, ValueError):
        # Missing or unparseable image_timestamp: sort as oldest possible.
        image_timestamp = EARLIEST_DATETIME
    return (image_timestamp,
            ciso8601.parse_datetime_unaware(link['created_at']))
def _get_docker_links(api_client, num_retries, **kwargs):
    """List all matching links, most-preferred first.

    Each returned link dict is annotated with a '_sort_key' entry
    (see docker_link_sort_key).
    """
    links = arvados.util.list_all(api_client.links().list,
                                  num_retries, **kwargs)
    for link in links:
        link['_sort_key'] = docker_link_sort_key(link)
    links.sort(key=itemgetter('_sort_key'), reverse=True)
    return links
def _new_image_listing(link, dockerhash, repo='<none>', tag='<none>'):
    """Build one image-listing dict from a Docker metadata link."""
    # When the image_timestamp fell back to EARLIEST_DATETIME, report the
    # link's created_at (sort key slot 1) as the visible timestamp instead.
    timestamp_index = 1 if (link['_sort_key'][0] is EARLIEST_DATETIME) else 0
    return {
        '_sort_key': link['_sort_key'],
        'timestamp': link['_sort_key'][timestamp_index],
        'collection': link['head_uuid'],
        'dockerhash': dockerhash,
        'repo': repo,
        'tag': tag,
        }
def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None):
    """List all Docker images known to the api_client with image_name and
    image_tag.  If no image_name is given, defaults to listing all
    Docker images.

    Returns a list of tuples representing matching Docker images,
    sorted in preference order (i.e. the first collection in the list
    is the one that the API server would use).  Each tuple is a
    (collection_uuid, collection_info) pair, where collection_info is
    a dict with fields "dockerhash", "repo", "tag", and "timestamp".
    """
    search_filters = []
    repo_links = None
    hash_links = None
    if image_name:
        # Find images with the name the user specified.
        search_links = _get_docker_links(
            api_client, num_retries,
            filters=[['link_class', '=', 'docker_image_repo+tag'],
                     ['name', '=',
                      '{}:{}'.format(image_name, image_tag or 'latest')]])
        if search_links:
            repo_links = search_links
        else:
            # Fall back to finding images with the specified image hash.
            search_links = _get_docker_links(
                api_client, num_retries,
                filters=[['link_class', '=', 'docker_image_hash'],
                         ['name', 'ilike', image_name + '%']])
            hash_links = search_links
        # Only list information about images that were found in the search.
        search_filters.append(['head_uuid', 'in',
                               [link['head_uuid'] for link in search_links]])

    # It should be reasonable to expect that each collection only has one
    # image hash (though there may be many links specifying this).  Find
    # the API server's most preferred image hash link for each collection.
    if hash_links is None:
        hash_links = _get_docker_links(
            api_client, num_retries,
            filters=search_filters + [['link_class', '=', 'docker_image_hash']])
    hash_link_map = {link['head_uuid']: link for link in reversed(hash_links)}

    # Each collection may have more than one name (though again, one name
    # may be specified more than once).  Build an image listing from name
    # tags, sorted by API server preference.
    if repo_links is None:
        repo_links = _get_docker_links(
            api_client, num_retries,
            filters=search_filters + [['link_class', '=',
                                       'docker_image_repo+tag']])
    seen_image_names = collections.defaultdict(set)
    images = []
    for link in repo_links:
        collection_uuid = link['head_uuid']
        if link['name'] in seen_image_names[collection_uuid]:
            continue
        seen_image_names[collection_uuid].add(link['name'])
        try:
            dockerhash = hash_link_map[collection_uuid]['name']
        except KeyError:
            dockerhash = '<unknown>'
        name_parts = link['name'].split(':', 1)
        images.append(_new_image_listing(link, dockerhash, *name_parts))

    # Find any image hash links that did not have a corresponding name link,
    # and add image listings for them, retaining the API server preference
    # sort order.
    images_start_size = len(images)
    # .items() instead of the Python-2-only .iteritems() keeps this callable
    # under both Python 2 and 3 with identical behavior.
    for collection_uuid, link in hash_link_map.items():
        if not seen_image_names[collection_uuid]:
            images.append(_new_image_listing(link, link['name']))
    if len(images) > images_start_size:
        images.sort(key=itemgetter('_sort_key'), reverse=True)

    # Remove any image listings that refer to unknown collections.
    existing_coll_uuids = {coll['uuid'] for coll in arvados.util.list_all(
        api_client.collections().list, num_retries,
        filters=[['uuid', 'in', [im['collection'] for im in images]]],
        select=['uuid'])}
    return [(image['collection'], image) for image in images
            if image['collection'] in existing_coll_uuids]
def items_owned_by(owner_uuid, arv_items):
    """Lazily yield the items whose 'owner_uuid' field equals owner_uuid."""
    for item in arv_items:
        if item['owner_uuid'] == owner_uuid:
            yield item
335 def _uuid2pdh(api, uuid):
336 return api.collections().list(
337 filters=[['uuid', '=', uuid]],
338 select=['portable_data_hash'],
339 ).execute()['items'][0]['portable_data_hash']
# Link class/name used to record that an old-format (pre-Docker-1.10, "v1")
# image collection has already been migrated to the v2 format.
_migration_link_class = 'docker_image_migration'
_migration_link_name = 'migrate_1.9_1.10'
def migrate19():
    """Migrate old-format (v1) Docker images in Keep to the v2 format.

    For every stored image whose hash is not sha256-prefixed and that has no
    migration link yet, run the arvados/docker19-migrate container to convert
    it, then record a docker_image_migration link from the old collection to
    the new one.

    NOTE(review): parts of this routine were reconstructed from an incomplete
    source; the exact `docker run` argument list (privilege flags and the
    in-container command) should be confirmed against the
    arvados/docker19-migrate image before relying on it.
    """
    api_client = arvados.api()

    images = list_images_in_arv(api_client, 3)

    # Predicate kept for parity with the original source (currently unused).
    is_new = lambda img: img['dockerhash'].startswith('sha256:')

    old_images = []
    for uuid, img in images:
        if img["dockerhash"].startswith("sha256:"):
            continue  # already v2 format; nothing to migrate
        # NOTE(review): `key` is computed but unused in the visible source;
        # retained in case dropped lines used it for de-duplication.
        key = (img["repo"], img["tag"], img["timestamp"])
        old_images.append(img)

    # Collections that already have a migration link recorded.
    migration_links = arvados.util.list_all(api_client.links().list, filters=[
        ['link_class', '=', _migration_link_class],
        ['name', '=', _migration_link_name],
    ])

    already_migrated = set()
    for m in migration_links:
        already_migrated.add(m["tail_uuid"])

    need_migrate = [img for img in old_images if img["collection"] not in already_migrated]

    logger.info("Already migrated %i images", len(already_migrated))
    logger.info("Need to migrate %i images", len(need_migrate))

    for old_image in need_migrate:
        logger.info("Migrating %s", old_image["collection"])

        col = CollectionReader(old_image["collection"])
        tarfile = col.keys()[0]

        varlibdocker = tempfile.mkdtemp()
        try:
            with tempfile.NamedTemporaryFile() as envfile:
                envfile.write("ARVADOS_API_HOST=%s\n" % (os.environ["ARVADOS_API_HOST"]))
                envfile.write("ARVADOS_API_TOKEN=%s\n" % (os.environ["ARVADOS_API_TOKEN"]))
                envfile.write("ARVADOS_API_HOST_INSECURE=%s\n" % (os.environ["ARVADOS_API_HOST_INSECURE"]))
                envfile.flush()

                # TODO(review): confirm flags/command against the migration
                # image; intermediate argv entries were missing from source.
                dockercmd = ["docker", "run",
                             "--rm",
                             "--env-file", envfile.name,
                             "--volume", "%s:/var/lib/docker" % varlibdocker,
                             "arvados/docker19-migrate",
                             "%s/%s" % (old_image["collection"], tarfile),
                             col.api_response()["owner_uuid"]]

                out = subprocess.check_output(dockercmd)

            # The migration container prints the new collection's uuid.
            new_collection = re.search(r"Migrated uuid is ([a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15})", out)
            api_client.links().create(body={"link": {
                'owner_uuid': col.api_response()["owner_uuid"],
                'link_class': _migration_link_class,
                'name': _migration_link_name,
                'tail_uuid': old_image["collection"],
                'head_uuid': new_collection.group(1)
            }}).execute(num_retries=3)

            logger.info("Migrated '%s' to '%s'", old_image["collection"], new_collection.group(1))
        except Exception as e:
            # Best-effort: log and continue with the remaining images.
            logger.exception("Migration failed")
        finally:
            shutil.rmtree(varlibdocker)

    logger.info("All done")
def main(arguments=None, stdout=sys.stdout):
    """Upload a Docker image to Keep, or list images already in Arvados.

    With no image argument (or the literal argument 'images'), prints a
    table of stored images and exits.  Otherwise finds the named local
    image (optionally pulling it first), saves it to a tar file, uploads
    it with arv-put, and creates docker_image_hash and
    docker_image_repo+tag links pointing at the new collection.
    """
    args = arg_parser.parse_args(arguments)
    api = arvados.api('v1')

    if args.image is None or args.image == 'images':
        fmt = "{:30} {:10} {:12} {:29} {:20}\n"
        stdout.write(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
        for i, j in list_images_in_arv(api, args.retries):
            stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
        sys.exit(0)

    # Pull the image if requested, unless the image is specified as a hash
    # that we already have.
    if args.pull and not find_image_hashes(args.image):
        pull_image(args.image, args.tag)

    try:
        image_hash = find_one_image_hash(args.image, args.tag)
    except DockerError as error:
        # str(error) instead of the Python-2-only .message attribute.
        logger.error(str(error))
        sys.exit(1)

    if not docker_image_compatible(api, image_hash):
        if args.force_image_format:
            logger.warn("forcing incompatible image")
        else:
            logger.error("refusing to store " \
                         "incompatible format (use --force-image-format to override)")
            sys.exit(1)

    # If the user specified the image by (a prefix of) its hash, there is no
    # repo:tag name to record.
    image_repo_tag = '{}:{}'.format(args.image, args.tag) if not image_hash.startswith(args.image.lower()) else None

    if args.name is None:
        if image_repo_tag:
            collection_name = 'Docker image {} {}'.format(image_repo_tag, image_hash[0:12])
        else:
            collection_name = 'Docker image {}'.format(image_hash[0:12])
    else:
        collection_name = args.name

    if not args.force:
        # Check if this image is already in Arvados.

        # Project where everything should be owned
        if args.project_uuid:
            parent_project_uuid = args.project_uuid
        else:
            parent_project_uuid = api.users().current().execute(
                num_retries=args.retries)['uuid']

        # Find image hash tags
        existing_links = _get_docker_links(
            api, args.retries,
            filters=[['link_class', '=', 'docker_image_hash'],
                     ['name', '=', image_hash]])
        if existing_links:
            # get readable collections
            collections = api.collections().list(
                filters=[['uuid', 'in', [link['head_uuid'] for link in existing_links]]],
                select=["uuid", "owner_uuid", "name", "manifest_text"]
                ).execute(num_retries=args.retries)['items']

            if collections:
                # check for repo+tag links on these collections
                if image_repo_tag:
                    existing_repo_tag = _get_docker_links(
                        api, args.retries,
                        filters=[['link_class', '=', 'docker_image_repo+tag'],
                                 ['name', '=', image_repo_tag],
                                 # Fix: the filter needs collection uuids,
                                 # not the collection records themselves.
                                 ['head_uuid', 'in',
                                  [coll['uuid'] for coll in collections]]])
                else:
                    existing_repo_tag = []

                try:
                    coll_uuid = next(items_owned_by(parent_project_uuid, collections))['uuid']
                except StopIteration:
                    # create new collection owned by the project
                    coll_uuid = api.collections().create(
                        body={"manifest_text": collections[0]['manifest_text'],
                              "name": collection_name,
                              "owner_uuid": parent_project_uuid},
                        ensure_unique_name=True
                        ).execute(num_retries=args.retries)['uuid']

                link_base = {'owner_uuid': parent_project_uuid,
                             'head_uuid': coll_uuid,
                             'properties': existing_links[0]['properties']}

                if not any(items_owned_by(parent_project_uuid, existing_links)):
                    # create image link owned by the project
                    make_link(api, args.retries,
                              'docker_image_hash', image_hash, **link_base)

                if image_repo_tag and not any(items_owned_by(parent_project_uuid, existing_repo_tag)):
                    # create repo+tag link owned by the project
                    make_link(api, args.retries, 'docker_image_repo+tag',
                              image_repo_tag, **link_base)

                stdout.write(coll_uuid + "\n")

                sys.exit(0)

    # Open a file for the saved image, and write it if needed.
    outfile_name = '{}.tar'.format(image_hash)
    image_file, need_save = prep_image_file(outfile_name)
    if need_save:
        save_image(image_hash, image_file)

    # Call arv-put with switches we inherited from it
    # (a.k.a., switches that aren't our own).
    put_args = keepdocker_parser.parse_known_args(arguments)[1]

    if args.name is None:
        put_args += ['--name', collection_name]

    coll_uuid = arv_put.main(
        put_args + ['--filename', outfile_name, image_file.name], stdout=stdout).strip()

    # Read the image metadata and make Arvados links from it.
    image_file.seek(0)
    image_tar = tarfile.open(fileobj=image_file)
    image_hash_type, _, raw_image_hash = image_hash.rpartition(':')
    if image_hash_type:
        # v2 hashes ("sha256:<hex>") store metadata as <hex>.json;
        # v1 hashes store it as <hex>/json.
        json_filename = raw_image_hash + '.json'
    else:
        json_filename = raw_image_hash + '/json'
    json_file = image_tar.extractfile(image_tar.getmember(json_filename))
    image_metadata = json.load(json_file)
    link_base = {'head_uuid': coll_uuid, 'properties': {}}
    if 'created' in image_metadata:
        link_base['properties']['image_timestamp'] = image_metadata['created']
    if args.project_uuid is not None:
        link_base['owner_uuid'] = args.project_uuid

    make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
    if image_repo_tag:
        make_link(api, args.retries,
                  'docker_image_repo+tag', image_repo_tag, **link_base)

    # Clean up.
    image_tar.close()
    # Just in case we're using a NamedTemporaryFile, or this process
    # died without a chance to clean up.
    for filename in [stat_cache_name(image_file), image_file.name]:
        try:
            os.unlink(filename)
        except OSError as error:
            if error.errno != errno.ENOENT:
                raise
570 if __name__ == '__main__':