#!/usr/bin/env python

import argparse
import datetime
import errno
import json
import os
import subprocess
import sys
import tarfile
import tempfile

from collections import namedtuple
from stat import *

import arvados
import arvados.commands._util as arv_cmd
import arvados.commands.put as arv_put

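# Errors tolerated when reading or writing the image stat cache: IOError and
# OSError for missing or unreadable cache files, ValueError for corrupt cache
# contents (json.load() raises ValueError in Python 2).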
STAT_CACHE_ERRORS = (IOError, OSError, ValueError)

DockerImage = namedtuple('DockerImage',
                         ['repo', 'tag', 'hash', 'created', 'vsize'])

keepdocker_parser = argparse.ArgumentParser(add_help=False)
keepdocker_parser.add_argument(
    '-f', '--force', action='store_true', default=False,
    help="Re-upload the image even if it already exists on the server")

_group = keepdocker_parser.add_mutually_exclusive_group()
_group.add_argument(
    '--pull', action='store_true', default=False,
    help="Try to pull the latest image from Docker registry")
_group.add_argument(
    '--no-pull', action='store_false', dest='pull',
    help="Use locally installed image only, don't pull image from Docker registry (default)")

keepdocker_parser.add_argument(
    'image', nargs='?',
    help="Docker image to upload, as a repository name or hash")
keepdocker_parser.add_argument(
    'tag', nargs='?', default='latest',
    help="Tag of the Docker image to upload (default 'latest')")

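# Example invocations (the command-line entry point is arv-keepdocker; image
# and project names below are placeholders):
#   arv-keepdocker                   # no image argument: list images in Arvados
#   arv-keepdocker debian 7          # upload the local debian:7 image
#   arv-keepdocker --pull --project-uuid <uuid> myrepo/myimage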
# Combine keepdocker options listed above with run_opts options of arv-put.
# The options inherited from arv-put include --name, --project-uuid,
# --progress/--no-progress/--batch-progress and --resume/--no-resume.
arg_parser = argparse.ArgumentParser(
        description="Upload or list Docker images in Arvados",
        parents=[keepdocker_parser, arv_put.run_opts, arv_cmd.retry_opt])

class DockerError(Exception):
    pass


def popen_docker(cmd, *args, **kwargs):
    manage_stdin = ('stdin' not in kwargs)
    kwargs.setdefault('stdin', subprocess.PIPE)
    kwargs.setdefault('stdout', sys.stderr)
    try:
        docker_proc = subprocess.Popen(['docker.io'] + cmd, *args, **kwargs)
    except OSError:  # No docker.io in $PATH
        docker_proc = subprocess.Popen(['docker'] + cmd, *args, **kwargs)
    if manage_stdin:
        docker_proc.stdin.close()
    return docker_proc

def check_docker(proc, description):
    proc.wait()
    if proc.returncode != 0:
        raise DockerError("docker {} returned status code {}".
                          format(description, proc.returncode))

def docker_images():
    # Yield a DockerImage tuple for each installed image.
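    # The parsing below assumes the classic `docker images --no-trunc` table,
    # with columns roughly REPOSITORY, TAG, IMAGE ID, CREATED, VIRTUAL SIZE.
    # "CREATED" (e.g. "2 weeks ago") and "VIRTUAL SIZE" (e.g. "85.1 MB") can
    # each span several words, hence the index arithmetic on the split line.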
    list_proc = popen_docker(['images', '--no-trunc'], stdout=subprocess.PIPE)
    list_output = iter(list_proc.stdout)
    next(list_output)  # Ignore the header line
    for line in list_output:
        words = line.split()
        size_index = len(words) - 2
        repo, tag, imageid = words[:3]
        ctime = ' '.join(words[3:size_index])
        vsize = ' '.join(words[size_index:])
        yield DockerImage(repo, tag, imageid, ctime, vsize)
    list_proc.stdout.close()
    check_docker(list_proc, "images")

def find_image_hashes(image_search, image_tag=None):
    # Given one argument, search for Docker images with matching hashes,
    # and return their full hashes in a set.
    # Given two arguments, also search for a Docker image with the
    # same repository and tag.  If one is found, return its hash in a
    # set; otherwise, fall back to the one-argument hash search.
    # Returns a set of matching hashes, which may be empty or contain
    # more than one entry if the search is ambiguous.
    hash_search = image_search.lower()
    hash_matches = set()
    for image in docker_images():
        if (image.repo == image_search) and (image.tag == image_tag):
            return set([image.hash])
        elif image.hash.startswith(hash_search):
            hash_matches.add(image.hash)
    return hash_matches

def find_one_image_hash(image_search, image_tag=None):
    hashes = find_image_hashes(image_search, image_tag)
    hash_count = len(hashes)
    if hash_count == 1:
        return hashes.pop()
    elif hash_count == 0:
        raise DockerError("no matching image found")
    else:
        raise DockerError("{} images match {}".format(hash_count, image_search))

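# Name the sidecar file that caches an image tarball's stat() result; the
# argument may be a file object or a path, e.g. '<hash>.tar' -> '<hash>.tar.stat'.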
def stat_cache_name(image_file):
    return getattr(image_file, 'name', image_file) + '.stat'

def pull_image(image_name, image_tag):
    check_docker(popen_docker(['pull', '{}:{}'.format(image_name, image_tag)]),
                 "pull")

def save_image(image_hash, image_file):
    # Save the specified Docker image to image_file, then try to save its
    # stats so we can try to resume after interruption.
    check_docker(popen_docker(['save', image_hash], stdout=image_file),
                 "save")
    image_file.flush()
    try:
        with open(stat_cache_name(image_file), 'w') as statfile:
            json.dump(tuple(os.fstat(image_file.fileno())), statfile)
    except STAT_CACHE_ERRORS:
        pass  # We won't resume from this cache.  No big deal.

def prep_image_file(filename):
    # Return a file object ready to save a Docker image,
    # and a boolean indicating whether or not we need to actually save the
    # image (False if a cached save is available).
    cache_dir = arv_cmd.make_home_conf_dir(
        os.path.join('.cache', 'arvados', 'docker'), 0o700)
    if cache_dir is None:
        image_file = tempfile.NamedTemporaryFile(suffix='.tar')
        need_save = True
    else:
        file_path = os.path.join(cache_dir, filename)
        try:
            with open(stat_cache_name(file_path)) as statfile:
                prev_stat = json.load(statfile)
            now_stat = os.stat(file_path)
            need_save = any(prev_stat[field] != now_stat[field]
                            for field in [ST_MTIME, ST_SIZE])
        except STAT_CACHE_ERRORS + (AttributeError, IndexError):
            need_save = True  # We couldn't compare against old stats
        image_file = open(file_path, 'w+b' if need_save else 'rb')
    return image_file, need_save

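# Create an Arvados link record.  As used below, link_class is
# 'docker_image_hash' or 'docker_image_repo+tag', link_name is the image hash
# or 'repo:tag' string, and link_attrs carries fields such as head_uuid (the
# collection containing the image), owner_uuid, and properties.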
def make_link(api_client, num_retries, link_class, link_name, **link_attrs):
    link_attrs.update({'link_class': link_class, 'name': link_name})
    return api_client.links().create(body=link_attrs).execute(
        num_retries=num_retries)

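# Parse an API-style timestamp, dropping any fractional seconds but keeping
# the trailing 'Z': both '2014-01-15T20:35:36.123456Z' and
# '2014-01-15T20:35:36Z' parse to datetime(2014, 1, 15, 20, 35, 36).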
def ptimestamp(t):
    s = t.split(".")
    if len(s) == 2:
        t = s[0] + s[1][-1:]
    return datetime.datetime.strptime(t, "%Y-%m-%dT%H:%M:%SZ")

def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None):
    """List all Docker images known to the api_client with image_name and
    image_tag.  If no image_name is given, defaults to listing all
    Docker images.

    Returns a list of tuples representing matching Docker images,
    sorted in preference order (i.e. the first collection in the list
    is the one that the API server would use). Each tuple is a
    (collection_uuid, collection_info) pair, where collection_info is
    a dict with fields "dockerhash", "repo", "tag", and "timestamp".

    """
    docker_image_filters = [['link_class', 'in', ['docker_image_hash', 'docker_image_repo+tag']]]
    if image_name:
        image_link_name = "{}:{}".format(image_name, image_tag or 'latest')
        docker_image_filters.append(['name', '=', image_link_name])

    existing_links = api_client.links().list(
        filters=docker_image_filters
        ).execute(num_retries=num_retries)['items']
    images = {}
    for link in existing_links:
        collection_uuid = link["head_uuid"]
        if collection_uuid not in images:
            images[collection_uuid] = {"dockerhash": "<none>",
                                       "repo": "<none>",
                                       "tag": "<none>",
                                       "timestamp": ptimestamp("1970-01-01T00:00:01Z")}

        if link["link_class"] == "docker_image_hash":
            images[collection_uuid]["dockerhash"] = link["name"]

        if link["link_class"] == "docker_image_repo+tag":
            r = link["name"].split(":")
            images[collection_uuid]["repo"] = r[0]
            if len(r) > 1:
                images[collection_uuid]["tag"] = r[1]

        if "image_timestamp" in link["properties"]:
            images[collection_uuid]["timestamp"] = ptimestamp(link["properties"]["image_timestamp"])
        else:
            images[collection_uuid]["timestamp"] = ptimestamp(link["created_at"])

    return sorted(images.items(), key=lambda i: i[1]["timestamp"], reverse=True)


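# With no image argument (or the literal argument 'images'), list the Docker
# images already registered in Arvados.  Otherwise: optionally pull the image,
# resolve it to a single image hash, reuse an existing Arvados copy unless
# --force is given, and if needed `docker save` the image, upload the tarball
# via arv-put, and record docker_image_hash / docker_image_repo+tag links
# pointing at the resulting collection.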
def main(arguments=None):
    args = arg_parser.parse_args(arguments)
    api = arvados.api('v1')

    if args.image is None or args.image == 'images':
        fmt = "{:30}  {:10}  {:12}  {:29}  {:20}"
        print(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
        for i, j in list_images_in_arv(api, args.retries):
            print(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
        sys.exit(0)

    # Pull the image if requested, unless the image is specified as a hash
    # that we already have.
    if args.pull and not find_image_hashes(args.image):
        pull_image(args.image, args.tag)

    try:
        image_hash = find_one_image_hash(args.image, args.tag)
    except DockerError as error:
        print >>sys.stderr, "arv-keepdocker:", error.message
        sys.exit(1)

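    # If the user identified the image by (a prefix of) its hash rather than a
    # repository name, there is no repo:tag string to record alongside it.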
    image_repo_tag = ('{}:{}'.format(args.image, args.tag)
                      if not image_hash.startswith(args.image.lower()) else None)

    if args.name is None:
        if image_repo_tag:
            collection_name = 'Docker image {} {}'.format(image_repo_tag, image_hash[0:12])
        else:
            collection_name = 'Docker image {}'.format(image_hash[0:12])
    else:
        collection_name = args.name

    if not args.force:
        # Check if this image is already in Arvados.
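        # Reuse logic: find docker_image_hash links that already point at this
        # hash, fetch the readable collections they reference, then either
        # adopt a collection already owned by the target project or copy its
        # manifest into a new one, and make sure the project owns hash and
        # repo+tag links for it.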

        # Project where everything should be owned
        if args.project_uuid:
            parent_project_uuid = args.project_uuid
        else:
            parent_project_uuid = api.users().current().execute(
                num_retries=args.retries)['uuid']

        # Find image hash tags
        existing_links = api.links().list(
            filters=[['link_class', '=', 'docker_image_hash'],
                     ['name', '=', image_hash]]
            ).execute(num_retries=args.retries)['items']
        if existing_links:
            # get readable collections
            collections = api.collections().list(
                filters=[['uuid', 'in', [link['head_uuid'] for link in existing_links]]],
                select=["uuid", "owner_uuid", "name", "manifest_text"]
                ).execute(num_retries=args.retries)['items']

            if collections:
                # check for repo+tag links on these collections
                existing_repo_tag = (api.links().list(
                    filters=[['link_class', '=', 'docker_image_repo+tag'],
                             ['name', '=', image_repo_tag],
                             ['head_uuid', 'in', [c['uuid'] for c in collections]]]
                    ).execute(num_retries=args.retries)['items']) if image_repo_tag else []

                # Filter on elements owned by the parent project
                owned_col = [c for c in collections if c['owner_uuid'] == parent_project_uuid]
                owned_img = [c for c in existing_links if c['owner_uuid'] == parent_project_uuid]
                owned_rep = [c for c in existing_repo_tag if c['owner_uuid'] == parent_project_uuid]

                if owned_col:
                    # already have a collection owned by this project
                    coll_uuid = owned_col[0]['uuid']
                else:
                    # create new collection owned by the project
                    coll_uuid = api.collections().create(
                        body={"manifest_text": collections[0]['manifest_text'],
                              "name": collection_name,
                              "owner_uuid": parent_project_uuid},
                        ensure_unique_name=True
                        ).execute(num_retries=args.retries)['uuid']

                link_base = {'owner_uuid': parent_project_uuid,
                             'head_uuid': coll_uuid}

                if not owned_img:
                    # create image link owned by the project
                    make_link(api, args.retries,
                              'docker_image_hash', image_hash, **link_base)

                if not owned_rep and image_repo_tag:
                    # create repo+tag link owned by the project
                    make_link(api, args.retries, 'docker_image_repo+tag',
                              image_repo_tag, **link_base)

                print(coll_uuid)

                sys.exit(0)

    # Open a file for the saved image, and write it if needed.
    outfile_name = '{}.tar'.format(image_hash)
    image_file, need_save = prep_image_file(outfile_name)
    if need_save:
        save_image(image_hash, image_file)

    # Call arv-put with switches we inherited from it
    # (a.k.a., switches that aren't our own).
    put_args = keepdocker_parser.parse_known_args(arguments)[1]

    if args.name is None:
        put_args += ['--name', collection_name]

    coll_uuid = arv_put.main(
        put_args + ['--filename', outfile_name, image_file.name]).strip()

    # Read the image metadata and make Arvados links from it.
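    # (A `docker save` tarball is expected to contain a '<image hash>/json'
    # member with the image's metadata, including its 'created' timestamp.)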
    image_file.seek(0)
    image_tar = tarfile.open(fileobj=image_file)
    json_file = image_tar.extractfile(image_tar.getmember(image_hash + '/json'))
    image_metadata = json.load(json_file)
    json_file.close()
    image_tar.close()
    link_base = {'head_uuid': coll_uuid, 'properties': {}}
    if 'created' in image_metadata:
        link_base['properties']['image_timestamp'] = image_metadata['created']
    if args.project_uuid is not None:
        link_base['owner_uuid'] = args.project_uuid

    make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
    if image_repo_tag:
        make_link(api, args.retries,
                  'docker_image_repo+tag', image_repo_tag, **link_base)

    # Clean up.
    image_file.close()
    for filename in [stat_cache_name(image_file), image_file.name]:
        try:
            os.unlink(filename)
        except OSError as error:
            if error.errno != errno.ENOENT:
                raise

if __name__ == '__main__':
    main()