17 from operator import itemgetter
22 import arvados.commands._util as arv_cmd
23 import arvados.commands.put as arv_put
24 from arvados.collection import CollectionReader
29 from arvados._version import __version__
# Module logger; verbosity follows the ARVADOS_DEBUG configuration flag.
31 logger = logging.getLogger('arvados.keepdocker')
32 logger.setLevel(logging.DEBUG if arvados.config.get('ARVADOS_DEBUG')
# Sentinel "oldest possible" timestamp, used by docker_link_sort_key() when a
# link has no parseable image_timestamp property.
35 EARLIEST_DATETIME = datetime.datetime(datetime.MINYEAR, 1, 1, 0, 0, 0)
# Exceptions tolerated while reading/writing the image stat cache (see
# save_image/prep_image_file): any of these just disables resume support.
36 STAT_CACHE_ERRORS = (IOError, OSError, ValueError)
# One parsed row of `docker images --no-trunc` output (see docker_images()).
38 DockerImage = collections.namedtuple(
39 'DockerImage', ['repo', 'tag', 'hash', 'created', 'vsize'])
# Options owned by arv-keepdocker itself; combined below with arv-put's
# options so unrecognized switches can be forwarded to arv_put.main().
41 keepdocker_parser = argparse.ArgumentParser(add_help=False)
42 keepdocker_parser.add_argument(
43 '--version', action='version', version="%s %s" % (sys.argv[0], __version__),
44 help='Print version and exit.')
45 keepdocker_parser.add_argument(
46 '-f', '--force', action='store_true', default=False,
47 help="Re-upload the image even if it already exists on the server")
48 keepdocker_parser.add_argument(
49 '--force-image-format', action='store_true', default=False,
50 help="Proceed even if the image format is not supported by the server")
# --pull / --no-pull are mutually exclusive; both write to args.pull.
# NOTE(review): the `_group.add_argument(` call lines are not visible in this
# excerpt; the argument bodies below belong to _group.
52 _group = keepdocker_parser.add_mutually_exclusive_group()
54 '--pull', action='store_true', default=False,
55 help="Try to pull the latest image from Docker registry"
57 '--no-pull', action='store_false', dest='pull',
58 help="Use locally installed image only, don't pull image from Docker registry (default)")
# Positional arguments: the image (name or hash) and an optional tag.
60 keepdocker_parser.add_argument(
62 help="Docker image to upload, as a repository name or hash")
63 keepdocker_parser.add_argument(
64 'tag', nargs='?', default='latest',
65 help="Tag of the Docker image to upload (default 'latest')")
67 # Combine keepdocker options listed above with run_opts options of arv-put.
68 # The options inherited from arv-put include --name, --project-uuid,
69 # --progress/--no-progress/--batch-progress and --resume/--no-resume.
70 arg_parser = argparse.ArgumentParser(
71 description="Upload or list Docker images in Arvados",
72 parents=[keepdocker_parser, arv_put.run_opts, arv_cmd.retry_opt])
# Raised when an invoked `docker` subcommand fails or when an image lookup
# is ambiguous or finds nothing (see check_docker / find_one_image_hash).
74 class DockerError(Exception):
# Run a `docker` subcommand via subprocess.Popen and return the process.
# Tries the Debian-era binary name `docker.io` first, then falls back to
# plain `docker` when that is not on $PATH.
78 def popen_docker(cmd, *args, **kwargs):
# Remember whether the caller supplied stdin; if we open the PIPE default
# ourselves we also close it (closing is done at line 87 below — the guard
# `if manage_stdin:` is not visible in this excerpt).
79 manage_stdin = ('stdin' not in kwargs)
80 kwargs.setdefault('stdin', subprocess.PIPE)
# Default docker's stdout to our stderr so it never pollutes this
# program's own stdout (which carries the collection UUID).
81 kwargs.setdefault('stdout', sys.stderr)
83 docker_proc = subprocess.Popen(['docker.io'] + cmd, *args, **kwargs)
84 except OSError: # No docker.io in $PATH
85 docker_proc = subprocess.Popen(['docker'] + cmd, *args, **kwargs)
87 docker_proc.stdin.close()
# Wait for a docker subprocess and raise DockerError if it exited non-zero.
# `description` names the subcommand for the error message (e.g. "pull").
90 def check_docker(proc, description):
92 if proc.returncode != 0:
93 raise DockerError("docker {} returned status code {}".
94 format(description, proc.returncode))
96 def docker_image_format(image_hash):
97 """Return the registry format ('v1' or 'v2') of the given image."""
# Ask docker for the image's canonical ID; v2-format images report a
# 'sha256:'-prefixed ID, v1 images a bare hex ID.
98 cmd = popen_docker(['inspect', '--format={{.Id}}', image_hash],
99 stdout=subprocess.PIPE)
101 image_id = next(cmd.stdout).strip()
# NOTE(review): the return statements for each branch are not visible in
# this excerpt; per the docstring they yield 'v2', 'v1', or an unknown value.
102 if image_id.startswith('sha256:'):
104 elif ':' not in image_id:
# Reap the subprocess and raise if `docker inspect` itself failed.
109 check_docker(cmd, "inspect")
# Return whether the server advertises support for this image's format.
# Reads the API server's discovery document (dockerImageFormats).
111 def docker_image_compatible(api, image_hash):
112 supported = api._rootDesc.get('dockerImageFormats', [])
# Server config omits the list entirely: warn and (per the missing branch)
# treat the image as acceptable.  NOTE(review): logger.warn is deprecated
# in favor of logger.warning.
114 logger.warn("server does not specify supported image formats (see docker_image_formats in server config).")
117 fmt = docker_image_format(image_hash)
121 logger.error("image format is {!r} " \
122 "but server supports only {!r}".format(fmt, supported))
# Generator body of docker_images() (the `def` line is not visible in this
# excerpt): parses `docker images --no-trunc` into DockerImage tuples.
126 # Yield a DockerImage tuple for each installed image.
127 list_proc = popen_docker(['images', '--no-trunc'], stdout=subprocess.PIPE)
128 list_output = iter(list_proc.stdout)
129 next(list_output) # Ignore the header line
130 for line in list_output:
# The CREATED column may contain spaces ("2 weeks ago") while SIZE is the
# final two whitespace-separated words, so split from both ends.
132 size_index = len(words) - 2
133 repo, tag, imageid = words[:3]
134 ctime = ' '.join(words[3:size_index])
135 vsize = ' '.join(words[size_index:])
136 yield DockerImage(repo, tag, imageid, ctime, vsize)
# Close the pipe before reaping so check_docker sees the real exit code.
137 list_proc.stdout.close()
138 check_docker(list_proc, "images")
140 def find_image_hashes(image_search, image_tag=None):
141 # Given one argument, search for Docker images with matching hashes,
142 # and return their full hashes in a set.
143 # Given two arguments, also search for a Docker image with the
144 # same repository and tag. If one is found, return its hash in a
145 # set; otherwise, fall back to the one-argument hash search.
146 # Returns None if no match is found, or a hash search is ambiguous.
# Hash prefixes are matched case-insensitively against the lowercased input.
147 hash_search = image_search.lower()
# NOTE(review): the `hash_matches = set()` initialization and the final
# return are not visible in this excerpt.
149 for image in docker_images():
# Exact repo:tag match wins immediately and is unambiguous.
150 if (image.repo == image_search) and (image.tag == image_tag):
151 return set([image.hash])
152 elif image.hash.startswith(hash_search):
153 hash_matches.add(image.hash)
# Resolve `image_search` (and optional tag) to exactly one image hash.
# Raises DockerError when zero or more than one image matches; the
# single-match success branch (lines 159-160) is not visible in this excerpt.
156 def find_one_image_hash(image_search, image_tag=None):
157 hashes = find_image_hashes(image_search, image_tag)
158 hash_count = len(hashes)
161 elif hash_count == 0:
162 raise DockerError("no matching image found")
164 raise DockerError("{} images match {}".format(hash_count, image_search))
def stat_cache_name(image_file):
    """Return the path of the stat-cache file paired with *image_file*.

    *image_file* may be an open file object (its ``name`` attribute is
    used) or a plain path string; either way the cache path is the same
    name with a ``.stat`` suffix appended.
    """
    base_path = getattr(image_file, 'name', image_file)
    return base_path + '.stat'
# Run `docker pull name:tag` and raise DockerError if it fails.
# (The closing description argument to check_docker, original line 171,
# is not visible in this excerpt.)
169 def pull_image(image_name, image_tag):
170 check_docker(popen_docker(['pull', '{}:{}'.format(image_name, image_tag)]),
173 def save_image(image_hash, image_file):
174 # Save the specified Docker image to image_file, then try to save its
175 # stats so we can try to resume after interruption.
176 check_docker(popen_docker(['save', image_hash], stdout=image_file),
# Record the tar's stat() as JSON next to it so a later run can detect an
# unchanged cached save (see prep_image_file).  The `try:` introducing this
# best-effort write is among the lines not visible in this excerpt.
180 with open(stat_cache_name(image_file), 'w') as statfile:
181 json.dump(tuple(os.fstat(image_file.fileno())), statfile)
182 except STAT_CACHE_ERRORS:
183 pass # We won't resume from this cache. No big deal.
185 def prep_image_file(filename):
186 # Return a file object ready to save a Docker image,
187 # and a boolean indicating whether or not we need to actually save the
188 # image (False if a cached save is available).
189 cache_dir = arv_cmd.make_home_conf_dir(
190 os.path.join('.cache', 'arvados', 'docker'), 0o700)
# No usable cache directory: fall back to an anonymous temp file and
# (per the missing else branch) always re-save the image.
191 if cache_dir is None:
192 image_file = tempfile.NamedTemporaryFile(suffix='.tar')
195 file_path = os.path.join(cache_dir, filename)
# Compare the cached stat snapshot written by save_image() against the
# file's current stat; any mismatch (or missing/corrupt cache) forces a
# fresh save.  The `try:` opening this block is not visible in this excerpt.
197 with open(stat_cache_name(file_path)) as statfile:
198 prev_stat = json.load(statfile)
199 now_stat = os.stat(file_path)
200 need_save = any(prev_stat[field] != now_stat[field]
201 for field in [ST_MTIME, ST_SIZE])
202 except STAT_CACHE_ERRORS + (AttributeError, IndexError):
203 need_save = True # We couldn't compare against old stats
# 'w+b' truncates for a fresh save; 'rb' reuses the cached tar.
204 image_file = open(file_path, 'w+b' if need_save else 'rb')
205 return image_file, need_save
def make_link(api_client, num_retries, link_class, link_name, **link_attrs):
    """Create an Arvados link record and return the API response.

    `link_class` and `link_name` become the link's ``link_class`` and
    ``name`` fields; any extra keyword arguments (e.g. ``head_uuid``,
    ``owner_uuid``, ``properties``) are passed through as additional
    fields of the link body.
    """
    body = dict(link_attrs)
    body['link_class'] = link_class
    body['name'] = link_name
    request = api_client.links().create(body=body)
    return request.execute(num_retries=num_retries)
212 def docker_link_sort_key(link):
213 """Build a sort key to find the latest available Docker image.
215 To find one source collection for a Docker image referenced by
216 name or image id, the API server looks for a link with the most
217 recent `image_timestamp` property; then the most recent
218 `created_at` timestamp. This method generates a sort key for
219 Docker metadata links to sort them from least to most preferred.
# Missing/invalid image_timestamp falls back to the EARLIEST_DATETIME
# sentinel so such links sort last among timestamps (the opening `try:`
# is among the lines not visible in this excerpt).
# NOTE(review): parse_datetime_unaware belongs to old ciso8601 releases;
# newer releases renamed it parse_datetime — confirm the pinned version.
222 image_timestamp = ciso8601.parse_datetime_unaware(
223 link['properties']['image_timestamp'])
224 except (KeyError, ValueError):
225 image_timestamp = EARLIEST_DATETIME
# Two-element key: (image_timestamp, created_at), least to most preferred.
226 return (image_timestamp,
227 ciso8601.parse_datetime_unaware(link['created_at']))
# Fetch all link records matching **kwargs filters, annotate each with a
# '_sort_key' field, and sort most-preferred first.  (The `for` statement
# and final `return links` are not visible in this excerpt.)
229 def _get_docker_links(api_client, num_retries, **kwargs):
230 links = arvados.util.list_all(api_client.links().list,
231 num_retries, **kwargs)
233 link['_sort_key'] = docker_link_sort_key(link)
# reverse=True puts the API server's preferred link first.
234 links.sort(key=itemgetter('_sort_key'), reverse=True)
# Build one listing dict for list_images_in_arv from a repo+tag or hash link.
# (The `return {` line and the trailing 'repo'/'tag' keys are not visible
# in this excerpt.)
237 def _new_image_listing(link, dockerhash, repo='<none>', tag='<none>'):
# If the link had no usable image_timestamp (sort key slot 0 is the
# EARLIEST_DATETIME sentinel), report created_at (slot 1) instead.
238 timestamp_index = 1 if (link['_sort_key'][0] is EARLIEST_DATETIME) else 0
240 '_sort_key': link['_sort_key'],
241 'timestamp': link['_sort_key'][timestamp_index],
242 'collection': link['head_uuid'],
243 'dockerhash': dockerhash,
248 def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None):
249 """List all Docker images known to the api_client with image_name and
250 image_tag. If no image_name is given, defaults to listing all
253 Returns a list of tuples representing matching Docker images,
254 sorted in preference order (i.e. the first collection in the list
255 is the one that the API server would use). Each tuple is a
256 (collection_uuid, collection_info) pair, where collection_info is
257 a dict with fields "dockerhash", "repo", "tag", and "timestamp".
# Phase 1: locate candidate links.  (Initialization of search_filters,
# repo_links, hash_links and images — original lines 258-263 — is not
# visible in this excerpt.)
264 # Find images with the name the user specified.
265 search_links = _get_docker_links(
266 api_client, num_retries,
267 filters=[['link_class', '=', 'docker_image_repo+tag'],
269 '{}:{}'.format(image_name, image_tag or 'latest')]])
271 repo_links = search_links
273 # Fall back to finding images with the specified image hash.
274 search_links = _get_docker_links(
275 api_client, num_retries,
276 filters=[['link_class', '=', 'docker_image_hash'],
277 ['name', 'ilike', image_name + '%']])
278 hash_links = search_links
279 # Only list information about images that were found in the search.
280 search_filters.append(['head_uuid', 'in',
281 [link['head_uuid'] for link in search_links]])
283 # It should be reasonable to expect that each collection only has one
284 # image hash (though there may be many links specifying this). Find
285 # the API server's most preferred image hash link for each collection.
286 if hash_links is None:
287 hash_links = _get_docker_links(
288 api_client, num_retries,
289 filters=search_filters + [['link_class', '=', 'docker_image_hash']])
# reversed() so that, after dict construction, the most-preferred link
# (first in hash_links) is the one that wins for each head_uuid.
290 hash_link_map = {link['head_uuid']: link for link in reversed(hash_links)}
292 # Each collection may have more than one name (though again, one name
293 # may be specified more than once). Build an image listing from name
294 # tags, sorted by API server preference.
295 if repo_links is None:
296 repo_links = _get_docker_links(
297 api_client, num_retries,
298 filters=search_filters + [['link_class', '=',
299 'docker_image_repo+tag']])
300 seen_image_names = collections.defaultdict(set)
302 for link in repo_links:
303 collection_uuid = link['head_uuid']
# Skip duplicate name links for the same collection.
304 if link['name'] in seen_image_names[collection_uuid]:
306 seen_image_names[collection_uuid].add(link['name'])
308 dockerhash = hash_link_map[collection_uuid]['name']
310 dockerhash = '<unknown>'
# Link names look like "repo:tag"; split feeds the repo/tag parameters.
311 name_parts = link['name'].split(':', 1)
312 images.append(_new_image_listing(link, dockerhash, *name_parts))
314 # Find any image hash links that did not have a corresponding name link,
315 # and add image listings for them, retaining the API server preference
317 images_start_size = len(images)
# NOTE(review): dict.iteritems() is Python-2-only; under Python 3 this must
# be .items().
318 for collection_uuid, link in hash_link_map.iteritems():
319 if not seen_image_names[collection_uuid]:
320 images.append(_new_image_listing(link, link['name']))
# Re-sort only if hash-only listings were appended out of order.
321 if len(images) > images_start_size:
322 images.sort(key=itemgetter('_sort_key'), reverse=True)
324 # Remove any image listings that refer to unknown collections.
325 existing_coll_uuids = {coll['uuid'] for coll in arvados.util.list_all(
326 api_client.collections().list, num_retries,
327 filters=[['uuid', 'in', [im['collection'] for im in images]]],
329 return [(image['collection'], image) for image in images
330 if image['collection'] in existing_coll_uuids]
def items_owned_by(owner_uuid, arv_items):
    """Lazily yield the records in *arv_items* owned by *owner_uuid*.

    Each item is expected to be a dict carrying an 'owner_uuid' key;
    the result is an iterator, not a list.
    """
    for record in arv_items:
        if record['owner_uuid'] == owner_uuid:
            yield record
335 def _uuid2pdh(api, uuid):
336 return api.collections().list(
337 filters=[['uuid', '=', uuid]],
338 select=['portable_data_hash'],
339 ).execute()['items'][0]['portable_data_hash']
# Command entry point: list images, or save one local Docker image to Keep
# and tag it with docker_image_hash / docker_image_repo+tag links.
# Many interior lines (try:/else:/return statements) are not visible in
# this excerpt.
341 def main(arguments=None, stdout=sys.stdout):
342 args = arg_parser.parse_args(arguments)
343 api = arvados.api('v1')
# --- List mode: no image argument (or the literal word 'images'). ---
345 if args.image is None or args.image == 'images':
346 fmt = "{:30} {:10} {:12} {:29} {:20}\n"
347 stdout.write(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
349 for i, j in list_images_in_arv(api, args.retries):
350 stdout.write(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
# Broken pipe (e.g. piped into `head`) is tolerated.
352 if e.errno == errno.EPIPE:
# --- Upload mode. ---
358 # Pull the image if requested, unless the image is specified as a hash
359 # that we already have.
360 if args.pull and not find_image_hashes(args.image):
361 pull_image(args.image, args.tag)
364 image_hash = find_one_image_hash(args.image, args.tag)
365 except DockerError as error:
# NOTE(review): Exception.message is Python-2-only; Python 3 needs str(error).
366 logger.error(error.message)
369 if not docker_image_compatible(api, image_hash):
370 if args.force_image_format:
371 logger.warn("forcing incompatible image")
372
373 logger.error("refusing to store " \
374 "incompatible format (use --force-image-format to override)")
# repo:tag is only meaningful if the user gave a name, not a hash prefix.
377 image_repo_tag = '{}:{}'.format(args.image, args.tag) if not image_hash.startswith(args.image.lower()) else None
379 if args.name is None:
381 collection_name = 'Docker image {} {}'.format(image_repo_tag, image_hash[0:12])
383 collection_name = 'Docker image {}'.format(image_hash[0:12])
385 collection_name = args.name
388 # Check if this image is already in Arvados.
390 # Project where everything should be owned
391 if args.project_uuid:
392 parent_project_uuid = args.project_uuid
394 parent_project_uuid = api.users().current().execute(
395 num_retries=args.retries)['uuid']
397 # Find image hash tags
398 existing_links = _get_docker_links(
400 filters=[['link_class', '=', 'docker_image_hash'],
401 ['name', '=', image_hash]])
403 # get readable collections
404 collections = api.collections().list(
405 filters=[['uuid', 'in', [link['head_uuid'] for link in existing_links]]],
406 select=["uuid", "owner_uuid", "name", "manifest_text"]
407 ).execute(num_retries=args.retries)['items']
410 # check for repo+tag links on these collections
412 existing_repo_tag = _get_docker_links(
414 filters=[['link_class', '=', 'docker_image_repo+tag'],
415 ['name', '=', image_repo_tag],
416 ['head_uuid', 'in', [c["uuid"] for c in collections]]])
418 existing_repo_tag = []
# Reuse a collection already owned by the target project if one exists.
421 coll_uuid = next(items_owned_by(parent_project_uuid, collections))['uuid']
422 except StopIteration:
423 # create new collection owned by the project
424 coll_uuid = api.collections().create(
425 body={"manifest_text": collections[0]['manifest_text'],
426 "name": collection_name,
427 "owner_uuid": parent_project_uuid},
428 ensure_unique_name=True
429 ).execute(num_retries=args.retries)['uuid']
431 link_base = {'owner_uuid': parent_project_uuid,
432 'head_uuid': coll_uuid,
433 'properties': existing_links[0]['properties']}
435 if not any(items_owned_by(parent_project_uuid, existing_links)):
436 # create image link owned by the project
437 make_link(api, args.retries,
438 'docker_image_hash', image_hash, **link_base)
440 if image_repo_tag and not any(items_owned_by(parent_project_uuid, existing_repo_tag)):
441 # create repo+tag link owned by the project
442 make_link(api, args.retries, 'docker_image_repo+tag',
443 image_repo_tag, **link_base)
# Print the resulting collection UUID — the command's primary output.
445 stdout.write(coll_uuid + "\n")
# --- Image not in Arvados yet: save locally, then upload via arv-put. ---
449 # Open a file for the saved image, and write it if needed.
450 outfile_name = '{}.tar'.format(image_hash)
451 image_file, need_save = prep_image_file(outfile_name)
453 save_image(image_hash, image_file)
455 # Call arv-put with switches we inherited from it
456 # (a.k.a., switches that aren't our own).
457 put_args = keepdocker_parser.parse_known_args(arguments)[1]
459 if args.name is None:
460 put_args += ['--name', collection_name]
462 coll_uuid = arv_put.main(
463 put_args + ['--filename', outfile_name, image_file.name], stdout=stdout).strip()
465 # Read the image metadata and make Arvados links from it.
467 image_tar = tarfile.open(fileobj=image_file)
468 image_hash_type, _, raw_image_hash = image_hash.rpartition(':')
# v2 images store metadata as <hash>.json; v1 as <hash>/json.
470 json_filename = raw_image_hash + '.json'
472 json_filename = raw_image_hash + '/json'
473 json_file = image_tar.extractfile(image_tar.getmember(json_filename))
474 image_metadata = json.load(json_file)
477 link_base = {'head_uuid': coll_uuid, 'properties': {}}
478 if 'created' in image_metadata:
479 link_base['properties']['image_timestamp'] = image_metadata['created']
480 if args.project_uuid is not None:
481 link_base['owner_uuid'] = args.project_uuid
483 make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
485 make_link(api, args.retries,
486 'docker_image_repo+tag', image_repo_tag, **link_base)
# Best-effort cleanup of the cached tar and its stat file; a missing file
# (ENOENT) is fine, any other OSError presumably propagates.
490 for filename in [stat_cache_name(image_file), image_file.name]:
493 except OSError as error:
494 if error.errno != errno.ENOENT:
# Script entry point (the call to main() on the following original line is
# not visible in this excerpt).
497 if __name__ == '__main__':