import argparse
import collections
import datetime
import errno
import json
import os
import re
import subprocess
import sys
import tarfile
import tempfile
from operator import itemgetter
from stat import *

import ciso8601

import arvados
import arvados.util
import arvados.commands._util as arv_cmd
import arvados.commands.put as arv_put
from arvados._version import __version__
# Sentinel "oldest possible" timestamp, used when a Docker image link has no
# parseable image_timestamp property (see docker_link_sort_key below).
EARLIEST_DATETIME = datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)

# Errors that can occur while reading or writing the image stat cache.  Any
# of these simply means "no resumable cache"; none of them is fatal.
STAT_CACHE_ERRORS = (IOError, OSError, ValueError)

# One row of `docker images --no-trunc` output.
DockerImage = collections.namedtuple(
    'DockerImage', ['repo', 'tag', 'hash', 'created', 'vsize'])
# Command-line options specific to arv-keepdocker.  add_help=False because
# this parser is reused as a parent of arg_parser below.
keepdocker_parser = argparse.ArgumentParser(add_help=False)
keepdocker_parser.add_argument(
    '--version', action='version', version="%s %s" % (sys.argv[0], __version__),
    help='Print version and exit.')
keepdocker_parser.add_argument(
    '-f', '--force', action='store_true', default=False,
    help="Re-upload the image even if it already exists on the server")
keepdocker_parser.add_argument(
    '--force-image-format', action='store_true', default=False,
    help="Proceed even if the image format is not supported by the server")

# --pull and --no-pull are mutually exclusive; both write args.pull.
_group = keepdocker_parser.add_mutually_exclusive_group()
_group.add_argument(
    '--pull', action='store_true', default=False,
    help="Try to pull the latest image from Docker registry")
_group.add_argument(
    '--no-pull', action='store_false', dest='pull',
    help="Use locally installed image only, don't pull image from Docker registry (default)")

keepdocker_parser.add_argument(
    'image', nargs='?',
    help="Docker image to upload, as a repository name or hash")
keepdocker_parser.add_argument(
    'tag', nargs='?', default='latest',
    help="Tag of the Docker image to upload (default 'latest')")

# Combine keepdocker options listed above with run_opts options of arv-put.
# The options inherited from arv-put include --name, --project-uuid,
# --progress/--no-progress/--batch-progress and --resume/--no-resume.
arg_parser = argparse.ArgumentParser(
    description="Upload or list Docker images in Arvados",
    parents=[keepdocker_parser, arv_put.run_opts, arv_cmd.retry_opt])
66 class DockerError(Exception):
def popen_docker(cmd, *args, **kwargs):
    """Run a docker subcommand and return the subprocess.Popen object.

    Tries the Debian-style `docker.io` executable first, falling back to
    `docker`.  stdin defaults to a pipe and stdout to stderr (so docker's
    progress chatter never pollutes our own stdout).  If the caller did not
    supply its own stdin, the pipe is closed immediately so docker sees EOF.
    """
    manage_stdin = ('stdin' not in kwargs)
    kwargs.setdefault('stdin', subprocess.PIPE)
    kwargs.setdefault('stdout', sys.stderr)
    try:
        docker_proc = subprocess.Popen(['docker.io'] + cmd, *args, **kwargs)
    except OSError:  # No docker.io in $PATH
        docker_proc = subprocess.Popen(['docker'] + cmd, *args, **kwargs)
    if manage_stdin:
        docker_proc.stdin.close()
    return docker_proc
def check_docker(proc, description):
    """Wait for a docker subprocess and raise DockerError on failure.

    `description` names the subcommand (e.g. "pull") for the error message.
    """
    proc.wait()
    if proc.returncode != 0:
        raise DockerError("docker {} returned status code {}".
                          format(description, proc.returncode))
def docker_image_format(image_hash):
    """Return the registry format ('v1' or 'v2') of the given image."""
    cmd = popen_docker(['inspect', '--format={{.Id}}', image_hash],
                       stdout=subprocess.PIPE)
    try:
        image_id = next(cmd.stdout).strip()
        # v2 image ids are prefixed with the digest algorithm; v1 ids are a
        # bare hex string.
        if image_id.startswith('sha256:'):
            return 'v2'
        elif ':' not in image_id:
            return 'v1'
        else:
            return 'unknown'
    finally:
        check_docker(cmd, "inspect")
def docker_image_compatible(api, image_hash):
    """Return True if the server accepts this image's registry format.

    An empty/missing `dockerImageFormats` in the server's discovery document
    is treated as "anything goes" (with a warning), preserving compatibility
    with older servers.
    """
    supported = api._rootDesc.get('dockerImageFormats', [])
    if not supported:
        print("arv-keepdocker: warning: server does not specify supported image formats (see docker_image_formats in server config). Continuing.", file=sys.stderr)
        return True

    fmt = docker_image_format(image_hash)
    if fmt in supported:
        return True
    else:
        print("arv-keepdocker: image format is {!r} "
              "but server supports only {!r}".format(fmt, supported), file=sys.stderr)
        return False
def docker_images():
    # Yield a DockerImage tuple for each installed image.
    list_proc = popen_docker(['images', '--no-trunc'], stdout=subprocess.PIPE)
    list_output = iter(list_proc.stdout)
    next(list_output)  # Ignore the header line
    for line in list_output:
        words = line.split()
        # The CREATED column is free text ("2 weeks ago"); everything between
        # the image id and the two size words belongs to it.
        size_index = len(words) - 2
        repo, tag, imageid = words[:3]
        ctime = ' '.join(words[3:size_index])
        vsize = ' '.join(words[size_index:])
        yield DockerImage(repo, tag, imageid, ctime, vsize)
    list_proc.stdout.close()
    check_docker(list_proc, "images")
def find_image_hashes(image_search, image_tag=None):
    # Given one argument, search for Docker images with matching hashes,
    # and return their full hashes in a set.
    # Given two arguments, also search for a Docker image with the
    # same repository and tag.  If one is found, return its hash in a
    # set; otherwise, fall back to the one-argument hash search.
    # Returns None if no match is found, or a hash search is ambiguous.
    hash_search = image_search.lower()
    hash_matches = set()
    for image in docker_images():
        if (image.repo == image_search) and (image.tag == image_tag):
            # Exact repo:tag match wins immediately.
            return set([image.hash])
        elif image.hash.startswith(hash_search):
            hash_matches.add(image.hash)
    return hash_matches
def find_one_image_hash(image_search, image_tag=None):
    """Resolve a search string (and optional tag) to exactly one image hash.

    Raises DockerError when no image matches or the search is ambiguous.
    """
    hashes = find_image_hashes(image_search, image_tag)
    hash_count = len(hashes)
    if hash_count == 1:
        return hashes.pop()
    elif hash_count == 0:
        raise DockerError("no matching image found")
    else:
        raise DockerError("{} images match {}".format(hash_count, image_search))
def stat_cache_name(image_file):
    """Return the stat-cache path for an image file (path or file object)."""
    # Accept either an open file (use its .name) or a plain path string.
    return getattr(image_file, 'name', image_file) + '.stat'
def pull_image(image_name, image_tag):
    """Pull image_name:image_tag from the Docker registry; raise on failure."""
    check_docker(popen_docker(['pull', '{}:{}'.format(image_name, image_tag)]),
                 "pull")
def save_image(image_hash, image_file):
    # Save the specified Docker image to image_file, then try to save its
    # stats so we can try to resume after interruption.
    check_docker(popen_docker(['save', image_hash], stdout=image_file),
                 "save")
    image_file.flush()
    try:
        with open(stat_cache_name(image_file), 'w') as statfile:
            json.dump(tuple(os.fstat(image_file.fileno())), statfile)
    except STAT_CACHE_ERRORS:
        pass  # We won't resume from this cache.  No big deal.
def prep_image_file(filename):
    # Return a file object ready to save a Docker image,
    # and a boolean indicating whether or not we need to actually save the
    # image (False if a cached save is available).
    cache_dir = arv_cmd.make_home_conf_dir(
        os.path.join('.cache', 'arvados', 'docker'), 0o700)
    if cache_dir is None:
        # No usable cache directory: save into a throwaway temp file.
        image_file = tempfile.NamedTemporaryFile(suffix='.tar')
        need_save = True
    else:
        file_path = os.path.join(cache_dir, filename)
        try:
            # Compare the cached stat snapshot with the file's current stats;
            # any mismatch (or unreadable cache) forces a fresh save.
            with open(stat_cache_name(file_path)) as statfile:
                prev_stat = json.load(statfile)
            now_stat = os.stat(file_path)
            need_save = any(prev_stat[field] != now_stat[field]
                            for field in [ST_MTIME, ST_SIZE])
        except STAT_CACHE_ERRORS + (AttributeError, IndexError):
            need_save = True  # We couldn't compare against old stats
        image_file = open(file_path, 'w+b' if need_save else 'rb')
    return image_file, need_save
def make_link(api_client, num_retries, link_class, link_name, **link_attrs):
    """Create an Arvados link and return the API server's response dict.

    Extra keyword arguments (e.g. head_uuid, owner_uuid, properties) are
    passed through as link attributes; link_class and name are set here.
    """
    link_attrs.update({'link_class': link_class, 'name': link_name})
    return api_client.links().create(body=link_attrs).execute(
        num_retries=num_retries)
def docker_link_sort_key(link):
    """Build a sort key to find the latest available Docker image.

    To find one source collection for a Docker image referenced by
    name or image id, the API server looks for a link with the most
    recent `image_timestamp` property; then the most recent
    `created_at` timestamp.  This method generates a sort key for
    Docker metadata links to sort them from least to most preferred.
    """
    try:
        image_timestamp = ciso8601.parse_datetime_unaware(
            link['properties']['image_timestamp'])
    except (KeyError, ValueError):
        # Missing or malformed image_timestamp: sort as oldest possible.
        image_timestamp = EARLIEST_DATETIME
    return (image_timestamp,
            ciso8601.parse_datetime_unaware(link['created_at']))
def _get_docker_links(api_client, num_retries, **kwargs):
    """Fetch all matching links, annotated and sorted most-preferred first.

    Each returned link gains a '_sort_key' entry (see docker_link_sort_key).
    """
    links = arvados.util.list_all(api_client.links().list,
                                  num_retries, **kwargs)
    for link in links:
        link['_sort_key'] = docker_link_sort_key(link)
    links.sort(key=itemgetter('_sort_key'), reverse=True)
    return links
def _new_image_listing(link, dockerhash, repo='<none>', tag='<none>'):
    """Build one image-listing dict from a Docker metadata link.

    If the link had no usable image_timestamp (sort key holds the
    EARLIEST_DATETIME sentinel), fall back to its created_at timestamp.
    """
    timestamp_index = 1 if (link['_sort_key'][0] is EARLIEST_DATETIME) else 0
    return {
        '_sort_key': link['_sort_key'],
        'timestamp': link['_sort_key'][timestamp_index],
        'collection': link['head_uuid'],
        'dockerhash': dockerhash,
        'repo': repo,
        'tag': tag,
        }
def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None):
    """List all Docker images known to the api_client with image_name and
    image_tag.  If no image_name is given, defaults to listing all
    Docker images.

    Returns a list of tuples representing matching Docker images,
    sorted in preference order (i.e. the first collection in the list
    is the one that the API server would use). Each tuple is a
    (collection_uuid, collection_info) pair, where collection_info is
    a dict with fields "dockerhash", "repo", "tag", and "timestamp".

    """
    search_filters = []
    repo_links = None
    hash_links = None
    if image_name:
        # Find images with the name the user specified.
        search_links = _get_docker_links(
            api_client, num_retries,
            filters=[['link_class', '=', 'docker_image_repo+tag'],
                     ['name', '=',
                      '{}:{}'.format(image_name, image_tag or 'latest')]])
        if search_links:
            repo_links = search_links
        else:
            # Fall back to finding images with the specified image hash.
            search_links = _get_docker_links(
                api_client, num_retries,
                filters=[['link_class', '=', 'docker_image_hash'],
                         ['name', 'ilike', image_name + '%']])
            hash_links = search_links
        # Only list information about images that were found in the search.
        search_filters.append(['head_uuid', 'in',
                               [link['head_uuid'] for link in search_links]])

    # It should be reasonable to expect that each collection only has one
    # image hash (though there may be many links specifying this).  Find
    # the API server's most preferred image hash link for each collection.
    if hash_links is None:
        hash_links = _get_docker_links(
            api_client, num_retries,
            filters=search_filters + [['link_class', '=', 'docker_image_hash']])
    # reversed() so that the most-preferred link wins the dict insertion.
    hash_link_map = {link['head_uuid']: link for link in reversed(hash_links)}

    # Each collection may have more than one name (though again, one name
    # may be specified more than once).  Build an image listing from name
    # tags, sorted by API server preference.
    if repo_links is None:
        repo_links = _get_docker_links(
            api_client, num_retries,
            filters=search_filters + [['link_class', '=',
                                       'docker_image_repo+tag']])
    seen_image_names = collections.defaultdict(set)
    images = []
    for link in repo_links:
        collection_uuid = link['head_uuid']
        if link['name'] in seen_image_names[collection_uuid]:
            continue
        seen_image_names[collection_uuid].add(link['name'])
        try:
            dockerhash = hash_link_map[collection_uuid]['name']
        except KeyError:
            dockerhash = '<unknown>'
        name_parts = link['name'].split(':', 1)
        images.append(_new_image_listing(link, dockerhash, *name_parts))

    # Find any image hash links that did not have a corresponding name link,
    # and add image listings for them, retaining the API server preference
    # sort order.
    images_start_size = len(images)
    # NOTE: was hash_link_map.iteritems() -- Python 2 only; items() behaves
    # the same here.
    for collection_uuid, link in hash_link_map.items():
        if not seen_image_names[collection_uuid]:
            images.append(_new_image_listing(link, link['name']))
    if len(images) > images_start_size:
        images.sort(key=itemgetter('_sort_key'), reverse=True)

    # Remove any image listings that refer to unknown collections.
    existing_coll_uuids = {coll['uuid'] for coll in arvados.util.list_all(
            api_client.collections().list, num_retries,
            filters=[['uuid', 'in', [im['collection'] for im in images]]],
            select=['uuid'])}
    return [(image['collection'], image) for image in images
            if image['collection'] in existing_coll_uuids]
def items_owned_by(owner_uuid, arv_items):
    """Yield the items from arv_items whose 'owner_uuid' equals owner_uuid."""
    return (item for item in arv_items if item['owner_uuid'] == owner_uuid)
327 def _uuid2pdh(api, uuid):
328 return api.collections().list(
329 filters=[['uuid', '=', uuid]],
330 select=['portable_data_hash'],
331 ).execute()['items'][0]['portable_data_hash']
333 _migration_link_class = 'docker_image_migration'
334 _migration_link_name = 'migrate_1.9_1.10'
def _migrate19_link(api, root_uuid, old_uuid, new_uuid):
    """Create a migration link from old_uuid's PDH to new_uuid's PDH.

    Skips creation (returning None) if an identical migration link already
    exists.  Returns the newly created link dict otherwise.
    """
    old_pdh = _uuid2pdh(api, old_uuid)
    new_pdh = _uuid2pdh(api, new_uuid)
    if not api.links().list(filters=[
            ['owner_uuid', '=', root_uuid],
            ['link_class', '=', _migration_link_class],
            ['name', '=', _migration_link_name],
            ['tail_uuid', '=', old_pdh],
            ['head_uuid', '=', new_pdh]]).execute()['items']:
        print('Creating migration link {} -> {}: '.format(
            old_pdh, new_pdh), end='', file=sys.stderr)
        link = api.links().create(body={
            'owner_uuid': root_uuid,
            'link_class': _migration_link_class,
            'name': _migration_link_name,
            'tail_uuid': old_pdh,
            'head_uuid': new_pdh,
            }).execute()
        print('{}'.format(link['uuid']), file=sys.stderr)
        return link
def migrate19():
    """Create docker_image_migration links pairing old- and new-format images.

    For every repo:tag that has both a pre-1.10 image and a sha256-format
    image, link the old image's collection to the new one so the API server
    can substitute it.  Requires an admin token.
    """
    api = arvados.api('v1')
    user = api.users().current().execute()
    if not user['is_admin']:
        raise Exception("This command requires an admin token")
    # The cluster's root (system) user UUID shares the cluster prefix.
    root_uuid = user['uuid'][:12] + '000000000000000'

    images = list_images_in_arv(api, 2)
    is_new = lambda img: img['dockerhash'].startswith('sha256:')

    # First pass: record the collection UUID of each new-format image,
    # keyed by (repo, tag).  Tags that are bare 64-hex digests are the
    # "repo:hash" aliases docker creates, not real tags; skip them.
    count_new = 0
    new_image_uuids = {}
    for uuid, img in images:
        if not re.match(r'^[0-9a-f]{64}$', img["tag"]):
            continue
        key = (img["repo"], img["tag"])
        if is_new(img) and key not in new_image_uuids:
            count_new += 1
            new_image_uuids[key] = uuid

    # Second pass: link every old-format image that has a new-format twin.
    count_migrations = 0
    new_links = []
    for uuid, img in images:
        key = (img['repo'], img['tag'])
        if not is_new(img) and key in new_image_uuids:
            count_migrations += 1
            link = _migrate19_link(api, root_uuid, uuid, new_image_uuids[key])
            if link:
                new_links.append(link)

    print("=== {} new-format images, {} migrations detected, "
          "{} links added.".format(count_new, count_migrations, len(new_links)),
          file=sys.stderr)
def main(arguments=None, stdout=sys.stdout):
    """Entry point: list Docker images in Arvados, or upload one.

    With no image argument (or the literal 'images'), print a table of the
    images already stored in Arvados.  Otherwise resolve the named image
    locally, upload its tarball via arv-put (unless an identical upload is
    already present and --force was not given), and create the metadata
    links the API server uses to find it.
    """
    args = arg_parser.parse_args(arguments)
    api = arvados.api('v1')

    if args.image is None or args.image == 'images':
        fmt = "{:30} {:10} {:12} {:29} {:20}\n"
        stdout.write(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
        for i, j in list_images_in_arv(api, args.retries):
            stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
        sys.exit(0)

    # Pull the image if requested, unless the image is specified as a hash
    # that we already have.
    if args.pull and not find_image_hashes(args.image):
        pull_image(args.image, args.tag)

    try:
        image_hash = find_one_image_hash(args.image, args.tag)
    except DockerError as error:
        print("arv-keepdocker:", error, file=sys.stderr)
        sys.exit(1)

    if not docker_image_compatible(api, image_hash):
        if args.force_image_format:
            print("arv-keepdocker: forcing incompatible image", file=sys.stderr)
        else:
            print("arv-keepdocker: refusing to store "
                  "incompatible format (use --force-image-format to override)",
                  file=sys.stderr)
            sys.exit(1)

    # If the user specified the image by hash, there is no repo:tag to record.
    image_repo_tag = '{}:{}'.format(args.image, args.tag) if not image_hash.startswith(args.image.lower()) else None

    if args.name is None:
        if image_repo_tag:
            collection_name = 'Docker image {} {}'.format(image_repo_tag, image_hash[0:12])
        else:
            collection_name = 'Docker image {}'.format(image_hash[0:12])
    else:
        collection_name = args.name

    if not args.force:
        # Check if this image is already in Arvados.

        # Project where everything should be owned
        if args.project_uuid:
            parent_project_uuid = args.project_uuid
        else:
            parent_project_uuid = api.users().current().execute(
                num_retries=args.retries)['uuid']

        # Find image hash tags
        existing_links = _get_docker_links(
            api, args.retries,
            filters=[['link_class', '=', 'docker_image_hash'],
                     ['name', '=', image_hash]])
        if existing_links:
            # get readable collections
            # NOTE: this local deliberately mirrors the original and shadows
            # the `collections` module within this branch.
            collections = api.collections().list(
                filters=[['uuid', 'in', [link['head_uuid'] for link in existing_links]]],
                select=["uuid", "owner_uuid", "name", "manifest_text"]
                ).execute(num_retries=args.retries)['items']

            if collections:
                # check for repo+tag links on these collections
                if image_repo_tag:
                    existing_repo_tag = _get_docker_links(
                        api, args.retries,
                        filters=[['link_class', '=', 'docker_image_repo+tag'],
                                 ['name', '=', image_repo_tag],
                                 ['head_uuid', 'in', collections]])
                else:
                    existing_repo_tag = []

                try:
                    coll_uuid = next(items_owned_by(parent_project_uuid, collections))['uuid']
                except StopIteration:
                    # create new collection owned by the project
                    coll_uuid = api.collections().create(
                        body={"manifest_text": collections[0]['manifest_text'],
                              "name": collection_name,
                              "owner_uuid": parent_project_uuid},
                        ensure_unique_name=True
                        ).execute(num_retries=args.retries)['uuid']

                link_base = {'owner_uuid': parent_project_uuid,
                             'head_uuid': coll_uuid,
                             'properties': existing_links[0]['properties']}

                if not any(items_owned_by(parent_project_uuid, existing_links)):
                    # create image link owned by the project
                    make_link(api, args.retries,
                              'docker_image_hash', image_hash, **link_base)

                if image_repo_tag and not any(items_owned_by(parent_project_uuid, existing_repo_tag)):
                    # create repo+tag link owned by the project
                    make_link(api, args.retries, 'docker_image_repo+tag',
                              image_repo_tag, **link_base)

                stdout.write(coll_uuid + "\n")
                sys.exit(0)

    # Open a file for the saved image, and write it if needed.
    outfile_name = '{}.tar'.format(image_hash)
    image_file, need_save = prep_image_file(outfile_name)
    if need_save:
        save_image(image_hash, image_file)

    # Call arv-put with switches we inherited from it
    # (a.k.a., switches that aren't our own).
    put_args = keepdocker_parser.parse_known_args(arguments)[1]

    if args.name is None:
        put_args += ['--name', collection_name]

    coll_uuid = arv_put.main(
        put_args + ['--filename', outfile_name, image_file.name], stdout=stdout).strip()

    # Read the image metadata and make Arvados links from it.
    image_file.seek(0)
    image_tar = tarfile.open(fileobj=image_file)
    image_hash_type, _, raw_image_hash = image_hash.rpartition(':')
    if image_hash_type:
        # v2 images store metadata in <hash>.json; v1 in <hash>/json.
        json_filename = raw_image_hash + '.json'
    else:
        json_filename = raw_image_hash + '/json'
    json_file = image_tar.extractfile(image_tar.getmember(json_filename))
    image_metadata = json.load(json_file)
    json_file.close()
    image_tar.close()
    link_base = {'head_uuid': coll_uuid, 'properties': {}}
    if 'created' in image_metadata:
        link_base['properties']['image_timestamp'] = image_metadata['created']
    if args.project_uuid is not None:
        link_base['owner_uuid'] = args.project_uuid

    make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
    if image_repo_tag:
        make_link(api, args.retries,
                  'docker_image_repo+tag', image_repo_tag, **link_base)

    # Clean up: remove the saved tarball and its stat cache; a missing file
    # is fine, any other OS error is not.
    image_file.close()
    for filename in [stat_cache_name(image_file), image_file.name]:
        try:
            os.unlink(filename)
        except OSError as error:
            if error.errno != errno.ENOENT:
                raise
539 if __name__ == '__main__':