1 # Copyright (C) The Arvados Authors. All rights reserved.
3 # SPDX-License-Identifier: Apache-2.0
from builtins import next

import argparse
import collections
import datetime
import errno
import fcntl
import json
import logging
import os
import re
import subprocess
import sys
import tarfile
import tempfile
from operator import itemgetter
from stat import ST_MTIME, ST_SIZE

import ciso8601

import arvados
import arvados.config
import arvados.util
import arvados.commands._util as arv_cmd
import arvados.commands.put as arv_put
from arvados.collection import CollectionReader
from arvados._version import __version__
logger = logging.getLogger('arvados.keepdocker')
# Verbose logging only when the ARVADOS_DEBUG environment/config flag is set.
logger.setLevel(logging.DEBUG if arvados.config.get('ARVADOS_DEBUG')
                else logging.INFO)

# Sentinel "oldest possible" timestamp used when a Docker link has no
# parseable image_timestamp property.
EARLIEST_DATETIME = datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)
# Errors that can occur while reading/writing the local stat cache; all of
# them are non-fatal (we just skip resuming).
STAT_CACHE_ERRORS = (IOError, OSError, ValueError)

DockerImage = collections.namedtuple(
    'DockerImage', ['repo', 'tag', 'hash', 'created', 'vsize'])

# Options specific to arv-keepdocker.  add_help=False because this parser is
# used as a parent of arg_parser below.
keepdocker_parser = argparse.ArgumentParser(add_help=False)
keepdocker_parser.add_argument(
    '--version', action='version', version="%s %s" % (sys.argv[0], __version__),
    help='Print version and exit.')
keepdocker_parser.add_argument(
    '-f', '--force', action='store_true', default=False,
    help="Re-upload the image even if it already exists on the server")
keepdocker_parser.add_argument(
    '--force-image-format', action='store_true', default=False,
    help="Proceed even if the image format is not supported by the server")

_group = keepdocker_parser.add_mutually_exclusive_group()
_group.add_argument(
    '--pull', action='store_true', default=False,
    help="Try to pull the latest image from Docker registry")
_group.add_argument(
    '--no-pull', action='store_false', dest='pull',
    help="Use locally installed image only, don't pull image from Docker registry (default)")

keepdocker_parser.add_argument(
    'image', nargs='?',
    help="Docker image to upload: repo, repo:tag, or hash")
keepdocker_parser.add_argument(
    'tag', nargs='?',
    help="Tag of the Docker image to upload (default 'latest'), if image is given as an untagged repo name")

# Combine keepdocker options listed above with run_opts options of arv-put.
# The options inherited from arv-put include --name, --project-uuid,
# --progress/--no-progress/--batch-progress and --resume/--no-resume.
arg_parser = argparse.ArgumentParser(
    description="Upload or list Docker images in Arvados",
    parents=[keepdocker_parser, arv_put.run_opts, arv_cmd.retry_opt])
class DockerError(Exception):
    """Raised when a `docker` subcommand fails or produces unusable output."""
    pass
def popen_docker(cmd, *args, **kwargs):
    """Start `docker cmd...` and return the Popen object.

    Defaults stdin to a pipe (closed immediately unless the caller passed
    its own stdin) and stdout to stderr so docker chatter doesn't pollute
    our own stdout.  Tries the Debian-era `docker.io` binary first, then
    plain `docker`.
    """
    manage_stdin = ('stdin' not in kwargs)
    kwargs.setdefault('stdin', subprocess.PIPE)
    kwargs.setdefault('stdout', sys.stderr)
    try:
        docker_proc = subprocess.Popen(['docker.io'] + cmd, *args, **kwargs)
    except OSError:  # No docker.io in $PATH
        docker_proc = subprocess.Popen(['docker'] + cmd, *args, **kwargs)
    if manage_stdin:
        # Close the pipe we opened by default so docker doesn't wait on it.
        docker_proc.stdin.close()
    return docker_proc
def check_docker(proc, description):
    """Wait for the docker subprocess `proc`; raise DockerError on nonzero exit.

    `description` names the docker subcommand for the error message.
    """
    proc.wait()
    if proc.returncode != 0:
        raise DockerError("docker {} returned status code {}".
                          format(description, proc.returncode))
def docker_image_format(image_hash):
    """Return the registry format ('v1' or 'v2') of the given image."""
    cmd = popen_docker(['inspect', '--format={{.Id}}', image_hash],
                       stdout=subprocess.PIPE)
    try:
        image_id = next(cmd.stdout).decode().strip()
        # v2 image ids look like "sha256:<hex>"; v1 ids are bare hex.
        if image_id.startswith('sha256:'):
            return 'v2'
        elif ':' not in image_id:
            return 'v1'
        else:
            return 'unknown'
    finally:
        # Always reap the subprocess; raises DockerError if inspect failed.
        check_docker(cmd, "inspect")
def docker_image_compatible(api, image_hash):
    """Return True if the server accepts the registry format of this image.

    Consults the API server's advertised `dockerImageFormats`; returns
    False (with a log message) when the server gives no list or the image's
    format is not in it.
    """
    supported = api._rootDesc.get('dockerImageFormats', [])
    if not supported:
        logger.warning("server does not specify supported image formats (see docker_image_formats in server config).")
        return False

    fmt = docker_image_format(image_hash)
    if fmt in supported:
        return True
    else:
        logger.error("image format is {!r} " \
            "but server supports only {!r}".format(fmt, supported))
        return False
def docker_images():
    # Yield a DockerImage tuple for each installed image.
    list_proc = popen_docker(['images', '--no-trunc'], stdout=subprocess.PIPE)
    list_output = iter(list_proc.stdout)
    next(list_output)  # Ignore the header line
    for line in list_output:
        # Pipe output is bytes on Python 3; decode before joining/matching.
        words = [word.decode('utf-8', errors='replace')
                 for word in line.split()]
        # The "size" column is the last two whitespace-separated words
        # (e.g. "1.2 GB"); everything between column 3 and there is the
        # human-readable creation time (e.g. "2 weeks ago").
        size_index = len(words) - 2
        repo, tag, imageid = words[:3]
        ctime = ' '.join(words[3:size_index])
        vsize = ' '.join(words[size_index:])
        yield DockerImage(repo, tag, imageid, ctime, vsize)
    list_proc.stdout.close()
    check_docker(list_proc, "images")
def find_image_hashes(image_search, image_tag=None):
    # Given one argument, search for Docker images with matching hashes,
    # and return their full hashes in a set.
    # Given two arguments, also search for a Docker image with the
    # same repository and tag.  If one is found, return its hash in a
    # set; otherwise, fall back to the one-argument hash search.
    # Returns an empty set if no match is found; multiple entries mean
    # the hash search was ambiguous.
    hash_search = image_search.lower()
    hash_matches = set()
    for image in docker_images():
        if (image.repo == image_search) and (image.tag == image_tag):
            # Exact repo:tag match wins immediately.
            return set([image.hash])
        elif image.hash.startswith(hash_search):
            hash_matches.add(image.hash)
    return hash_matches
def find_one_image_hash(image_search, image_tag=None):
    """Return the unique local image hash matching the search terms.

    Raises DockerError when no image matches or the search is ambiguous.
    """
    hashes = find_image_hashes(image_search, image_tag)
    hash_count = len(hashes)
    if hash_count == 1:
        return hashes.pop()
    elif hash_count == 0:
        raise DockerError("no matching image found")
    else:
        raise DockerError("{} images match {}".format(hash_count, image_search))
def stat_cache_name(image_file):
    # Derive the stat-cache path for an image file: use the file object's
    # .name when present, otherwise treat the argument itself as the path.
    try:
        base_path = image_file.name
    except AttributeError:
        base_path = image_file
    return base_path + '.stat'
def pull_image(image_name, image_tag):
    """Run `docker pull image_name:image_tag`; raise DockerError on failure."""
    check_docker(popen_docker(['pull', '{}:{}'.format(image_name, image_tag)]),
                 "pull")
def save_image(image_hash, image_file):
    # Save the specified Docker image to image_file, then try to save its
    # stats so we can try to resume after interruption.
    check_docker(popen_docker(['save', image_hash], stdout=image_file),
                 "save")
    image_file.flush()
    try:
        with open(stat_cache_name(image_file), 'w') as statfile:
            json.dump(tuple(os.fstat(image_file.fileno())), statfile)
    except STAT_CACHE_ERRORS:
        pass  # We won't resume from this cache.  No big deal.
def get_cache_dir():
    """Return the per-user docker image cache directory.

    Delegates to arv_cmd.make_home_conf_dir with mode 0o700; presumably
    returns None when there is no usable home directory (callers such as
    prep_image_file handle that case).
    """
    return arv_cmd.make_home_conf_dir(
        os.path.join('.cache', 'arvados', 'docker'), 0o700)
def prep_image_file(filename):
    # Return a file object ready to save a Docker image,
    # and a boolean indicating whether or not we need to actually save the
    # image (False if a cached save is available).
    cache_dir = get_cache_dir()
    if cache_dir is None:
        # No cache directory: always save into a throwaway temp file.
        image_file = tempfile.NamedTemporaryFile(suffix='.tar')
        need_save = True
    else:
        file_path = os.path.join(cache_dir, filename)
        try:
            with open(stat_cache_name(file_path)) as statfile:
                prev_stat = json.load(statfile)
            now_stat = os.stat(file_path)
            # Re-save unless the cached tar's mtime and size still match.
            need_save = any(prev_stat[field] != now_stat[field]
                            for field in [ST_MTIME, ST_SIZE])
        except STAT_CACHE_ERRORS + (AttributeError, IndexError):
            need_save = True  # We couldn't compare against old stats
        image_file = open(file_path, 'w+b' if need_save else 'rb')
    return image_file, need_save
def make_link(api_client, num_retries, link_class, link_name, **link_attrs):
    """Create an Arvados link of the given class and name.

    Any extra keyword arguments become additional link attributes in the
    request body.  Returns the API server's response dict.
    """
    body = dict(link_attrs)
    body['link_class'] = link_class
    body['name'] = link_name
    request = api_client.links().create(body=body)
    return request.execute(num_retries=num_retries)
def docker_link_sort_key(link):
    """Build a sort key to find the latest available Docker image.

    To find one source collection for a Docker image referenced by
    name or image id, the API server looks for a link with the most
    recent `image_timestamp` property; then the most recent
    `created_at` timestamp.  This method generates a sort key for
    Docker metadata links to sort them from least to most preferred.
    """
    try:
        image_timestamp = ciso8601.parse_datetime_unaware(
            link['properties']['image_timestamp'])
    except (KeyError, ValueError):
        # Missing or malformed property: sort as oldest possible.
        image_timestamp = EARLIEST_DATETIME
    return (image_timestamp,
            ciso8601.parse_datetime_unaware(link['created_at']))
def _get_docker_links(api_client, num_retries, **kwargs):
    """Fetch all matching links, annotated and sorted most-preferred first.

    Each returned link dict gains a '_sort_key' entry (see
    docker_link_sort_key); the list is sorted by it in descending order.
    """
    links = arvados.util.list_all(api_client.links().list,
                                  num_retries, **kwargs)
    for link in links:
        link['_sort_key'] = docker_link_sort_key(link)
    links.sort(key=itemgetter('_sort_key'), reverse=True)
    return links
def _new_image_listing(link, dockerhash, repo='<none>', tag='<none>'):
    """Build one image-listing dict from an annotated Docker metadata link.

    When the link's image_timestamp was missing/unparseable (sort key slot 0
    is the EARLIEST_DATETIME sentinel), report the created_at timestamp
    (slot 1) instead.
    """
    timestamp_index = 1 if (link['_sort_key'][0] is EARLIEST_DATETIME) else 0
    return {
        '_sort_key': link['_sort_key'],
        'timestamp': link['_sort_key'][timestamp_index],
        'collection': link['head_uuid'],
        'dockerhash': dockerhash,
        'repo': repo,
        'tag': tag,
        }
def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None):
    """List all Docker images known to the api_client with image_name and
    image_tag.  If no image_name is given, defaults to listing all
    Docker images.

    Returns a list of tuples representing matching Docker images,
    sorted in preference order (i.e. the first collection in the list
    is the one that the API server would use). Each tuple is a
    (collection_uuid, collection_info) pair, where collection_info is
    a dict with fields "dockerhash", "repo", "tag", and "timestamp".

    """
    search_filters = []
    repo_links = None
    hash_links = None
    if image_name:
        # Find images with the name the user specified.
        search_links = _get_docker_links(
            api_client, num_retries,
            filters=[['link_class', '=', 'docker_image_repo+tag'],
                     ['name', '=',
                      '{}:{}'.format(image_name, image_tag or 'latest')]])
        if search_links:
            repo_links = search_links
        else:
            # Fall back to finding images with the specified image hash.
            search_links = _get_docker_links(
                api_client, num_retries,
                filters=[['link_class', '=', 'docker_image_hash'],
                         ['name', 'ilike', image_name + '%']])
            hash_links = search_links
        # Only list information about images that were found in the search.
        search_filters.append(['head_uuid', 'in',
                               [link['head_uuid'] for link in search_links]])

    # It should be reasonable to expect that each collection only has one
    # image hash (though there may be many links specifying this).  Find
    # the API server's most preferred image hash link for each collection.
    if hash_links is None:
        hash_links = _get_docker_links(
            api_client, num_retries,
            filters=search_filters + [['link_class', '=', 'docker_image_hash']])
    # Iterate reversed so the most-preferred link wins the dict slot.
    hash_link_map = {link['head_uuid']: link for link in reversed(hash_links)}

    # Each collection may have more than one name (though again, one name
    # may be specified more than once).  Build an image listing from name
    # tags, sorted by API server preference.
    if repo_links is None:
        repo_links = _get_docker_links(
            api_client, num_retries,
            filters=search_filters + [['link_class', '=',
                                       'docker_image_repo+tag']])
    seen_image_names = collections.defaultdict(set)
    images = []
    for link in repo_links:
        collection_uuid = link['head_uuid']
        if link['name'] in seen_image_names[collection_uuid]:
            continue
        seen_image_names[collection_uuid].add(link['name'])
        try:
            dockerhash = hash_link_map[collection_uuid]['name']
        except KeyError:
            dockerhash = '<unknown>'
        name_parts = link['name'].split(':', 1)
        images.append(_new_image_listing(link, dockerhash, *name_parts))

    # Find any image hash links that did not have a corresponding name link,
    # and add image listings for them, retaining the API server preference
    # sort order.
    images_start_size = len(images)
    for collection_uuid, link in hash_link_map.items():
        if not seen_image_names[collection_uuid]:
            images.append(_new_image_listing(link, link['name']))
    if len(images) > images_start_size:
        images.sort(key=itemgetter('_sort_key'), reverse=True)

    # Remove any image listings that refer to unknown collections.
    existing_coll_uuids = {coll['uuid'] for coll in arvados.util.list_all(
            api_client.collections().list, num_retries,
            filters=[['uuid', 'in', [im['collection'] for im in images]]],
            select=['uuid'])}
    return [(image['collection'], image) for image in images
            if image['collection'] in existing_coll_uuids]
def items_owned_by(owner_uuid, arv_items):
    """Lazily yield the items whose 'owner_uuid' field equals owner_uuid."""
    for arv_item in arv_items:
        if arv_item['owner_uuid'] == owner_uuid:
            yield arv_item
342 def _uuid2pdh(api, uuid):
343 return api.collections().list(
344 filters=[['uuid', '=', uuid]],
345 select=['portable_data_hash'],
346 ).execute()['items'][0]['portable_data_hash']
def main(arguments=None, stdout=sys.stdout):
    """Entry point: list Docker images in Arvados, or upload one.

    With no image argument (or the literal 'images'), prints a table of
    known images and exits.  Otherwise finds the local Docker image,
    optionally pulls it, saves it to a tar file, uploads it via arv-put,
    and creates docker_image_hash / docker_image_repo+tag links.
    """
    args = arg_parser.parse_args(arguments)
    api = arvados.api('v1')

    if args.image is None or args.image == 'images':
        fmt = "{:30}  {:10}  {:12}  {:29}  {:20}\n"
        stdout.write(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
        try:
            for i, j in list_images_in_arv(api, args.retries):
                stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
        except IOError as e:
            # A broken pipe (e.g. piping into `head`) is not an error.
            if e.errno == errno.EPIPE:
                pass
            else:
                raise
        sys.exit(0)

    if re.search(r':\w[-.\w]{0,127}$', args.image):
        # image ends with :valid-tag
        if args.tag is not None:
            logger.error(
                "image %r already includes a tag, cannot add tag argument %r",
                args.image, args.tag)
            sys.exit(1)
        # rsplit() accommodates "myrepo.example:8888/repo/image:tag"
        args.image, args.tag = args.image.rsplit(':', 1)
    elif args.tag is None:
        args.tag = 'latest'

    # Pull the image if requested, unless the image is specified as a hash
    # that we already have.
    if args.pull and not find_image_hashes(args.image):
        pull_image(args.image, args.tag)

    try:
        image_hash = find_one_image_hash(args.image, args.tag)
    except DockerError as error:
        # str(error): Exception has no .message attribute on Python 3.
        logger.error(str(error))
        sys.exit(1)

    if not docker_image_compatible(api, image_hash):
        if args.force_image_format:
            logger.warning("forcing incompatible image")
        else:
            logger.error("refusing to store " \
                "incompatible format (use --force-image-format to override)")
            sys.exit(1)

    # No repo+tag link when the user addressed the image by hash prefix.
    image_repo_tag = '{}:{}'.format(args.image, args.tag) if not image_hash.startswith(args.image.lower()) else None

    if args.name is None:
        if image_repo_tag:
            collection_name = 'Docker image {} {}'.format(image_repo_tag, image_hash[0:12])
        else:
            collection_name = 'Docker image {}'.format(image_hash[0:12])
    else:
        collection_name = args.name

    # Acquire a lock so that only one arv-keepdocker process will
    # dump/upload a particular docker image at a time.  Do this before
    # checking if the image already exists in Arvados so that if there
    # is an upload already underway, when that upload completes and
    # this process gets a turn, it will discover the Docker image is
    # already available and exit quickly.
    outfile_name = '{}.tar'.format(image_hash)
    lockfile_name = '{}.lock'.format(outfile_name)
    lockfile = None
    cache_dir = get_cache_dir()
    if cache_dir:
        lockfile = open(os.path.join(cache_dir, lockfile_name), 'w+')
        fcntl.flock(lockfile, fcntl.LOCK_EX)

    try:
        if not args.force:
            # Check if this image is already in Arvados.

            # Project where everything should be owned
            parent_project_uuid = args.project_uuid or api.users().current().execute(
                num_retries=args.retries)['uuid']

            # Find image hash tags
            existing_links = _get_docker_links(
                api, args.retries,
                filters=[['link_class', '=', 'docker_image_hash'],
                         ['name', '=', image_hash]])
            if existing_links:
                # get readable collections
                collections = api.collections().list(
                    filters=[['uuid', 'in', [link['head_uuid'] for link in existing_links]]],
                    select=["uuid", "owner_uuid", "name", "manifest_text"]
                    ).execute(num_retries=args.retries)['items']

                if collections:
                    # check for repo+tag links on these collections
                    if image_repo_tag:
                        existing_repo_tag = _get_docker_links(
                            api, args.retries,
                            filters=[['link_class', '=', 'docker_image_repo+tag'],
                                     ['name', '=', image_repo_tag],
                                     ['head_uuid', 'in', [c["uuid"] for c in collections]]])
                    else:
                        existing_repo_tag = []

                    try:
                        coll_uuid = next(items_owned_by(parent_project_uuid, collections))['uuid']
                    except StopIteration:
                        # create new collection owned by the project
                        coll_uuid = api.collections().create(
                            body={"manifest_text": collections[0]['manifest_text'],
                                  "name": collection_name,
                                  "owner_uuid": parent_project_uuid},
                            ensure_unique_name=True
                            ).execute(num_retries=args.retries)['uuid']

                    link_base = {'owner_uuid': parent_project_uuid,
                                 'head_uuid':  coll_uuid,
                                 'properties': existing_links[0]['properties']}

                    if not any(items_owned_by(parent_project_uuid, existing_links)):
                        # create image link owned by the project
                        make_link(api, args.retries,
                                  'docker_image_hash', image_hash, **link_base)

                    if image_repo_tag and not any(items_owned_by(parent_project_uuid, existing_repo_tag)):
                        # create repo+tag link owned by the project
                        make_link(api, args.retries, 'docker_image_repo+tag',
                                  image_repo_tag, **link_base)

                    stdout.write(coll_uuid + "\n")
                    sys.exit(0)

        # Open a file for the saved image, and write it if needed.
        image_file, need_save = prep_image_file(outfile_name)
        if need_save:
            save_image(image_hash, image_file)

        # Call arv-put with switches we inherited from it
        # (a.k.a., switches that aren't our own).
        put_args = keepdocker_parser.parse_known_args(arguments)[1]

        if args.name is None:
            put_args += ['--name', collection_name]

        coll_uuid = arv_put.main(
            put_args + ['--filename', outfile_name, image_file.name], stdout=stdout).strip()

        # Read the image metadata and make Arvados links from it.
        image_file.seek(0)
        image_tar = tarfile.open(fileobj=image_file)
        image_hash_type, _, raw_image_hash = image_hash.rpartition(':')
        if image_hash_type:
            # v2 layout: metadata lives in "<hash>.json" at the tar root.
            json_filename = raw_image_hash + '.json'
        else:
            # v1 layout: metadata lives in "<hash>/json".
            json_filename = raw_image_hash + '/json'
        json_file = image_tar.extractfile(image_tar.getmember(json_filename))
        image_metadata = json.load(json_file)
        json_file.close()
        image_tar.close()
        link_base = {'head_uuid': coll_uuid, 'properties': {}}
        if 'created' in image_metadata:
            link_base['properties']['image_timestamp'] = image_metadata['created']
        if args.project_uuid is not None:
            link_base['owner_uuid'] = args.project_uuid

        make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
        if image_repo_tag:
            make_link(api, args.retries,
                      'docker_image_repo+tag', image_repo_tag, **link_base)

        # Clean up.
        image_file.close()
        for filename in [stat_cache_name(image_file), image_file.name]:
            try:
                os.unlink(filename)
            except OSError as error:
                if error.errno != errno.ENOENT:
                    raise
    finally:
        if lockfile is not None:
            # Closing the lockfile unlocks it.
            lockfile.close()
if __name__ == '__main__':
    main()