# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from builtins import next

import argparse
import collections
import datetime
import errno
import fcntl
import json
import logging
import os
import re
import subprocess
import sys
import tarfile
import tempfile
from operator import itemgetter
from stat import ST_MTIME, ST_SIZE

import ciso8601

import arvados
import arvados.config
import arvados.util
import arvados.commands._util as arv_cmd
import arvados.commands.put as arv_put
from arvados.collection import CollectionReader
from arvados._version import __version__
# Module-level logger; debug level is enabled via the ARVADOS_DEBUG
# configuration/environment setting.
logger = logging.getLogger('arvados.keepdocker')
logger.setLevel(logging.DEBUG if arvados.config.get('ARVADOS_DEBUG')
                else logging.INFO)

# Sentinel timestamp used when a Docker metadata link has no usable
# image_timestamp property (sorts before every real timestamp).
EARLIEST_DATETIME = datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)
# Exceptions that can occur while reading/parsing the stat cache file.
STAT_CACHE_ERRORS = (IOError, OSError, ValueError)

# One row of `docker images` output.
DockerImage = collections.namedtuple(
    'DockerImage', ['repo', 'tag', 'hash', 'created', 'vsize'])
# Options specific to arv-keepdocker itself.
keepdocker_parser = argparse.ArgumentParser(add_help=False)
keepdocker_parser.add_argument(
    '--version', action='version', version="%s %s" % (sys.argv[0], __version__),
    help='Print version and exit.')
keepdocker_parser.add_argument(
    '-f', '--force', action='store_true', default=False,
    help="Re-upload the image even if it already exists on the server")
keepdocker_parser.add_argument(
    '--force-image-format', action='store_true', default=False,
    help="Proceed even if the image format is not supported by the server")

# --pull and --no-pull are mutually exclusive; both write to args.pull.
_group = keepdocker_parser.add_mutually_exclusive_group()
_group.add_argument(
    '--pull', action='store_true', default=False,
    help="Try to pull the latest image from Docker registry")
_group.add_argument(
    '--no-pull', action='store_false', dest='pull',
    help="Use locally installed image only, don't pull image from Docker registry (default)")

# Positional arguments are optional: with no image, main() lists images.
keepdocker_parser.add_argument(
    'image', nargs='?',
    help="Docker image to upload: repo, repo:tag, or hash")
keepdocker_parser.add_argument(
    'tag', nargs='?',
    help="Tag of the Docker image to upload (default 'latest'), if image is given as an untagged repo name")

# Combine keepdocker options listed above with run_opts options of arv-put.
# The options inherited from arv-put include --name, --project-uuid,
# --progress/--no-progress/--batch-progress and --resume/--no-resume.
arg_parser = argparse.ArgumentParser(
    description="Upload or list Docker images in Arvados",
    parents=[keepdocker_parser, arv_put.run_opts, arv_cmd.retry_opt])
class DockerError(Exception):
    """Raised when a docker CLI invocation fails or gives ambiguous results."""
    pass
def popen_docker(cmd, *args, **kwargs):
    """Start a docker CLI command and return the subprocess.Popen object.

    Tries the Debian-style 'docker.io' executable first, falling back to
    plain 'docker'.  Unless the caller passed its own stdin, the child's
    stdin pipe is closed immediately so the command cannot hang waiting
    for input.
    """
    manage_stdin = ('stdin' not in kwargs)
    kwargs.setdefault('stdin', subprocess.PIPE)
    kwargs.setdefault('stdout', sys.stderr)
    try:
        docker_proc = subprocess.Popen(['docker.io'] + cmd, *args, **kwargs)
    except OSError:  # No docker.io in $PATH
        docker_proc = subprocess.Popen(['docker'] + cmd, *args, **kwargs)
    if manage_stdin:
        docker_proc.stdin.close()
    return docker_proc
def check_docker(proc, description):
    """Wait for a docker subprocess and raise DockerError if it failed.

    `description` names the docker subcommand (e.g. "pull", "save") for
    the error message.  Returns None on success.
    """
    proc.wait()  # must wait before returncode is meaningful
    if proc.returncode != 0:
        raise DockerError("docker {} returned status code {}".
                          format(description, proc.returncode))
def docker_image_format(image_hash):
    """Return the registry format ('v1' or 'v2') of the given image."""
    cmd = popen_docker(['inspect', '--format={{.Id}}', image_hash],
                       stdout=subprocess.PIPE)
    try:
        image_id = next(cmd.stdout).decode().strip()
        if image_id.startswith('sha256:'):
            return 'v2'
        elif ':' not in image_id:
            return 'v1'
        else:
            return 'unknown'
    finally:
        # Always reap the subprocess; raises DockerError on failure.
        check_docker(cmd, "inspect")
def docker_image_compatible(api, image_hash):
    """Return True if the server advertises support for this image's format.

    Reads the server's discovery document for `dockerImageFormats`; if the
    server does not advertise any supported formats, warns and returns False.
    """
    supported = api._rootDesc.get('dockerImageFormats', [])
    if not supported:
        logger.warning("server does not specify supported image formats (see docker_image_formats in server config).")
        return False

    fmt = docker_image_format(image_hash)
    if fmt in supported:
        return True
    else:
        logger.error("image format is {!r} " \
            "but server supports only {!r}".format(fmt, supported))
        return False
def docker_images():
    # Yield a DockerImage tuple for each installed image.
    list_proc = popen_docker(['images', '--no-trunc'], stdout=subprocess.PIPE)
    list_output = iter(list_proc.stdout)
    next(list_output)  # Ignore the header line
    for line in list_output:
        # stdout is bytes; decode so ' '.join below produces str fields.
        words = line.decode().split()
        # Last two words form the virtual size (e.g. "1.234 GB");
        # everything between the image id and size is the created time.
        size_index = len(words) - 2
        repo, tag, imageid = words[:3]
        ctime = ' '.join(words[3:size_index])
        vsize = ' '.join(words[size_index:])
        yield DockerImage(repo, tag, imageid, ctime, vsize)
    list_proc.stdout.close()
    check_docker(list_proc, "images")
def find_image_hashes(image_search, image_tag=None):
    # Given one argument, search for Docker images with matching hashes,
    # and return their full hashes in a set.
    # Given two arguments, also search for a Docker image with the
    # same repository and tag.  If one is found, return its hash in a
    # set; otherwise, fall back to the one-argument hash search.
    # Returns an empty set if no match is found.
    hash_search = image_search.lower()
    hash_matches = set()
    for image in docker_images():
        if (image.repo == image_search) and (image.tag == image_tag):
            # Exact repo:tag match takes precedence over any hash prefix match.
            return set([image.hash])
        elif image.hash.startswith(hash_search):
            hash_matches.add(image.hash)
    return hash_matches
def find_one_image_hash(image_search, image_tag=None):
    """Resolve a repo/tag/hash-prefix to exactly one image hash.

    Raises DockerError when no image matches or the search is ambiguous.
    """
    hashes = find_image_hashes(image_search, image_tag)
    hash_count = len(hashes)
    if hash_count == 1:
        return hashes.pop()
    elif hash_count == 0:
        raise DockerError("no matching image found")
    else:
        raise DockerError("{} images match {}".format(hash_count, image_search))
def stat_cache_name(image_file):
    """Return the stat-cache filename for image_file (a path or file object)."""
    path = getattr(image_file, 'name', image_file)
    return '{}.stat'.format(path)
def pull_image(image_name, image_tag):
    """Run `docker pull image_name:image_tag`; raise DockerError on failure."""
    check_docker(popen_docker(['pull', '{}:{}'.format(image_name, image_tag)]),
                 "pull")
def save_image(image_hash, image_file):
    # Save the specified Docker image to image_file, then try to save its
    # stats so we can try to resume after interruption.
    check_docker(popen_docker(['save', image_hash], stdout=image_file),
                 "save")
    image_file.flush()
    try:
        with open(stat_cache_name(image_file), 'w') as statfile:
            json.dump(tuple(os.fstat(image_file.fileno())), statfile)
    except STAT_CACHE_ERRORS:
        pass  # We won't resume from this cache.  No big deal.
def get_cache_dir():
    """Return the user's keepdocker cache directory (created mode 0700),
    or None if no home-based config directory is available."""
    return arv_cmd.make_home_conf_dir(
        os.path.join('.cache', 'arvados', 'docker'), 0o700)
def prep_image_file(filename):
    # Return a file object ready to save a Docker image,
    # and a boolean indicating whether or not we need to actually save the
    # image (False if a cached save is available).
    cache_dir = get_cache_dir()
    if cache_dir is None:
        # No cache available: dump to a throwaway temp file.
        image_file = tempfile.NamedTemporaryFile(suffix='.tar')
        need_save = True
    else:
        file_path = os.path.join(cache_dir, filename)
        try:
            with open(stat_cache_name(file_path)) as statfile:
                prev_stat = json.load(statfile)
            now_stat = os.stat(file_path)
            # Reuse the cached tarball only if size and mtime are unchanged.
            need_save = any(prev_stat[field] != now_stat[field]
                            for field in [ST_MTIME, ST_SIZE])
        except STAT_CACHE_ERRORS + (AttributeError, IndexError):
            need_save = True  # We couldn't compare against old stats
        image_file = open(file_path, 'w+b' if need_save else 'rb')
    return image_file, need_save
def make_link(api_client, num_retries, link_class, link_name, **link_attrs):
    """Create an Arvados link and return the API server's response.

    link_attrs supplies any additional link fields (e.g. head_uuid,
    owner_uuid, properties); link_class and name are filled in here.
    """
    body = dict(link_attrs, link_class=link_class, name=link_name)
    request = api_client.links().create(body=body)
    return request.execute(num_retries=num_retries)
def docker_link_sort_key(link):
    """Build a sort key to find the latest available Docker image.

    To find one source collection for a Docker image referenced by
    name or image id, the API server looks for a link with the most
    recent `image_timestamp` property; then the most recent
    `created_at` timestamp.  This method generates a sort key for
    Docker metadata links to sort them from least to most preferred.
    """
    try:
        image_timestamp = ciso8601.parse_datetime_unaware(
            link['properties']['image_timestamp'])
    except (KeyError, ValueError):
        # Missing or unparseable property: sort before all real timestamps.
        image_timestamp = EARLIEST_DATETIME
    return (image_timestamp,
            ciso8601.parse_datetime_unaware(link['created_at']))
def _get_docker_links(api_client, num_retries, **kwargs):
    """Fetch all matching links, annotated with '_sort_key' and sorted
    from most to least preferred (see docker_link_sort_key)."""
    links = arvados.util.list_all(api_client.links().list,
                                  num_retries, **kwargs)
    for link in links:
        link['_sort_key'] = docker_link_sort_key(link)
    links.sort(key=itemgetter('_sort_key'), reverse=True)
    return links
def _new_image_listing(link, dockerhash, repo='<none>', tag='<none>'):
    """Build one image-listing dict from a metadata link.

    If the link had no usable image_timestamp (sort key slot 0 is the
    EARLIEST_DATETIME sentinel), fall back to its created_at (slot 1).
    """
    timestamp_index = 1 if (link['_sort_key'][0] is EARLIEST_DATETIME) else 0
    return {
        '_sort_key': link['_sort_key'],
        'timestamp': link['_sort_key'][timestamp_index],
        'collection': link['head_uuid'],
        'dockerhash': dockerhash,
        'repo': repo,
        'tag': tag,
        }
def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None):
    """List all Docker images known to the api_client with image_name and
    image_tag.  If no image_name is given, defaults to listing all
    Docker images.

    Returns a list of tuples representing matching Docker images,
    sorted in preference order (i.e. the first collection in the list
    is the one that the API server would use).  Each tuple is a
    (collection_uuid, collection_info) pair, where collection_info is
    a dict with fields "dockerhash", "repo", "tag", and "timestamp".
    """
    search_filters = []
    repo_links = None
    hash_links = None
    if image_name:
        # Find images with the name the user specified.
        search_links = _get_docker_links(
            api_client, num_retries,
            filters=[['link_class', '=', 'docker_image_repo+tag'],
                     ['name', '=',
                      '{}:{}'.format(image_name, image_tag or 'latest')]])
        if search_links:
            repo_links = search_links
        else:
            # Fall back to finding images with the specified image hash.
            search_links = _get_docker_links(
                api_client, num_retries,
                filters=[['link_class', '=', 'docker_image_hash'],
                         ['name', 'ilike', image_name + '%']])
            hash_links = search_links
        # Only list information about images that were found in the search.
        search_filters.append(['head_uuid', 'in',
                               [link['head_uuid'] for link in search_links]])

    # It should be reasonable to expect that each collection only has one
    # image hash (though there may be many links specifying this).  Find
    # the API server's most preferred image hash link for each collection.
    if hash_links is None:
        hash_links = _get_docker_links(
            api_client, num_retries,
            filters=search_filters + [['link_class', '=', 'docker_image_hash']])
    # reversed() so that the most preferred link wins the dict insertion.
    hash_link_map = {link['head_uuid']: link for link in reversed(hash_links)}

    # Each collection may have more than one name (though again, one name
    # may be specified more than once).  Build an image listing from name
    # tags, sorted by API server preference.
    if repo_links is None:
        repo_links = _get_docker_links(
            api_client, num_retries,
            filters=search_filters + [['link_class', '=',
                                       'docker_image_repo+tag']])
    seen_image_names = collections.defaultdict(set)
    images = []
    for link in repo_links:
        collection_uuid = link['head_uuid']
        if link['name'] in seen_image_names[collection_uuid]:
            continue
        seen_image_names[collection_uuid].add(link['name'])
        try:
            dockerhash = hash_link_map[collection_uuid]['name']
        except KeyError:
            dockerhash = '<unknown>'
        name_parts = link['name'].split(':', 1)
        images.append(_new_image_listing(link, dockerhash, *name_parts))

    # Find any image hash links that did not have a corresponding name link,
    # and add image listings for them, retaining the API server preference
    # sort order.
    images_start_size = len(images)
    for collection_uuid, link in hash_link_map.items():
        if not seen_image_names[collection_uuid]:
            images.append(_new_image_listing(link, link['name']))
    if len(images) > images_start_size:
        images.sort(key=itemgetter('_sort_key'), reverse=True)

    # Remove any image listings that refer to unknown collections.
    existing_coll_uuids = {coll['uuid'] for coll in arvados.util.list_all(
        api_client.collections().list, num_retries,
        filters=[['uuid', 'in', [im['collection'] for im in images]]],
        select=['uuid'])}
    return [(image['collection'], image) for image in images
            if image['collection'] in existing_coll_uuids]
def items_owned_by(owner_uuid, arv_items):
    """Lazily yield the items whose 'owner_uuid' field equals owner_uuid."""
    def _is_owned(item):
        return item['owner_uuid'] == owner_uuid
    return filter(_is_owned, arv_items)
342 def _uuid2pdh(api, uuid):
343 return api.collections().list(
344 filters=[['uuid', '=', uuid]],
345 select=['portable_data_hash'],
346 ).execute()['items'][0]['portable_data_hash']
def main(arguments=None, stdout=sys.stdout, install_sig_handlers=True, api=None):
    """Upload a Docker image to Arvados, or list images already stored.

    With no image argument (or the literal 'images'), prints a table of
    the Docker images known to the API server and exits.  Otherwise
    resolves the local image, dumps it with `docker save`, uploads the
    tarball via arv-put, and creates docker_image_hash and (when known)
    docker_image_repo+tag links pointing at the resulting collection.
    Exits via sys.exit() in both paths.
    """
    args = arg_parser.parse_args(arguments)
    if api is None:
        api = arvados.api('v1')

    if args.image is None or args.image == 'images':
        fmt = "{:30} {:10} {:12} {:29} {:20}\n"
        stdout.write(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
        try:
            for i, j in list_images_in_arv(api, args.retries):
                stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
        except IOError as e:
            # A broken pipe (e.g. output piped into `head`) is not an error.
            if e.errno == errno.EPIPE:
                pass
            else:
                raise
        sys.exit(0)

    if re.search(r':\w[-.\w]{0,127}$', args.image):
        # image ends with :valid-tag
        if args.tag is not None:
            logger.error(
                "image %r already includes a tag, cannot add tag argument %r",
                args.image, args.tag)
            sys.exit(1)
        # rsplit() accommodates "myrepo.example:8888/repo/image:tag"
        args.image, args.tag = args.image.rsplit(':', 1)
    elif args.tag is None:
        args.tag = 'latest'

    # Pull the image if requested, unless the image is specified as a hash
    # that we already have.
    if args.pull and not find_image_hashes(args.image):
        pull_image(args.image, args.tag)

    try:
        image_hash = find_one_image_hash(args.image, args.tag)
    except DockerError as error:
        # str(error), not error.message: exceptions have no .message attribute
        # in Python 3, which would have raised AttributeError here.
        logger.error(str(error))
        sys.exit(1)

    if not docker_image_compatible(api, image_hash):
        if args.force_image_format:
            logger.warning("forcing incompatible image")
        else:
            logger.error("refusing to store " \
                "incompatible format (use --force-image-format to override)")
            sys.exit(1)

    # Only record a repo+tag when the user referred to the image by name
    # (not when the argument was itself a hash prefix).
    image_repo_tag = '{}:{}'.format(args.image, args.tag) if not image_hash.startswith(args.image.lower()) else None

    if args.name is None:
        if image_repo_tag:
            collection_name = 'Docker image {} {}'.format(image_repo_tag, image_hash[0:12])
        else:
            collection_name = 'Docker image {}'.format(image_hash[0:12])
    else:
        collection_name = args.name

    # Acquire a lock so that only one arv-keepdocker process will
    # dump/upload a particular docker image at a time.  Do this before
    # checking if the image already exists in Arvados so that if there
    # is an upload already underway, when that upload completes and
    # this process gets a turn, it will discover the Docker image is
    # already available and exit quickly.
    outfile_name = '{}.tar'.format(image_hash)
    lockfile_name = '{}.lock'.format(outfile_name)
    lockfile = None
    cache_dir = get_cache_dir()
    if cache_dir:
        lockfile = open(os.path.join(cache_dir, lockfile_name), 'w+')
        fcntl.flock(lockfile, fcntl.LOCK_EX)

    try:
        if not args.force:
            # Check if this image is already in Arvados.

            # Project where everything should be owned
            parent_project_uuid = args.project_uuid or api.users().current().execute(
                num_retries=args.retries)['uuid']

            # Find image hash tags
            existing_links = _get_docker_links(
                api, args.retries,
                filters=[['link_class', '=', 'docker_image_hash'],
                         ['name', '=', image_hash]])
            if existing_links:
                # get readable collections
                collections = api.collections().list(
                    filters=[['uuid', 'in', [link['head_uuid'] for link in existing_links]]],
                    select=["uuid", "owner_uuid", "name", "manifest_text"]
                    ).execute(num_retries=args.retries)['items']

                if collections:
                    # check for repo+tag links on these collections
                    if image_repo_tag:
                        existing_repo_tag = _get_docker_links(
                            api, args.retries,
                            filters=[['link_class', '=', 'docker_image_repo+tag'],
                                     ['name', '=', image_repo_tag],
                                     ['head_uuid', 'in', [c["uuid"] for c in collections]]])
                    else:
                        existing_repo_tag = []

                    try:
                        coll_uuid = next(items_owned_by(parent_project_uuid, collections))['uuid']
                    except StopIteration:
                        # create new collection owned by the project
                        coll_uuid = api.collections().create(
                            body={"manifest_text": collections[0]['manifest_text'],
                                  "name": collection_name,
                                  "owner_uuid": parent_project_uuid},
                            ensure_unique_name=True
                            ).execute(num_retries=args.retries)['uuid']

                    link_base = {'owner_uuid': parent_project_uuid,
                                 'head_uuid': coll_uuid,
                                 'properties': existing_links[0]['properties']}

                    if not any(items_owned_by(parent_project_uuid, existing_links)):
                        # create image link owned by the project
                        make_link(api, args.retries,
                                  'docker_image_hash', image_hash, **link_base)

                    if image_repo_tag and not any(items_owned_by(parent_project_uuid, existing_repo_tag)):
                        # create repo+tag link owned by the project
                        make_link(api, args.retries, 'docker_image_repo+tag',
                                  image_repo_tag, **link_base)

                    stdout.write(coll_uuid + "\n")
                    sys.exit(0)

        # Open a file for the saved image, and write it if needed.
        image_file, need_save = prep_image_file(outfile_name)
        if need_save:
            save_image(image_hash, image_file)

        # Call arv-put with switches we inherited from it
        # (a.k.a., switches that aren't our own).
        put_args = keepdocker_parser.parse_known_args(arguments)[1]

        if args.name is None:
            put_args += ['--name', collection_name]

        coll_uuid = arv_put.main(
            put_args + ['--filename', outfile_name, image_file.name], stdout=stdout,
            install_sig_handlers=install_sig_handlers).strip()

        # Read the image metadata and make Arvados links from it.
        image_file.seek(0)
        image_tar = tarfile.open(fileobj=image_file)
        image_hash_type, _, raw_image_hash = image_hash.rpartition(':')
        if image_hash_type:
            # v2 layout: metadata lives in <hash>.json at the archive root.
            json_filename = raw_image_hash + '.json'
        else:
            # v1 layout: metadata lives in <hash>/json.
            json_filename = raw_image_hash + '/json'
        json_file = image_tar.extractfile(image_tar.getmember(json_filename))
        image_metadata = json.load(json_file)
        json_file.close()
        image_tar.close()
        link_base = {'head_uuid': coll_uuid, 'properties': {}}
        if 'created' in image_metadata:
            link_base['properties']['image_timestamp'] = image_metadata['created']
        if args.project_uuid is not None:
            link_base['owner_uuid'] = args.project_uuid

        make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
        if image_repo_tag:
            make_link(api, args.retries,
                      'docker_image_repo+tag', image_repo_tag, **link_base)

        # Clean up.
        image_file.close()
        for filename in [stat_cache_name(image_file), image_file.name]:
            try:
                os.unlink(filename)
            except OSError as error:
                if error.errno != errno.ENOENT:
                    raise
    finally:
        if lockfile is not None:
            # Closing the lockfile unlocks it.
            lockfile.close()


if __name__ == '__main__':
    main()