Merge branch 'master' into 4823-python-sdk-writable-collection-api
[arvados.git] / sdk / python / arvados / commands / keepdocker.py
1 #!/usr/bin/env python
2
3 import argparse
4 import datetime
5 import errno
6 import json
7 import os
8 import subprocess
9 import sys
10 import tarfile
11 import tempfile
12 import _strptime
13
14 from collections import namedtuple
15 from stat import *
16
17 import arvados
18 import arvados.commands._util as arv_cmd
19 import arvados.commands.put as arv_put
20
# Exception types that are tolerated when reading or writing the stat
# cache; hitting one of them only means the cached image is not reused.
STAT_CACHE_ERRORS = (IOError, OSError, ValueError)

# One row of `docker images` output, as yielded by docker_images().
DockerImage = namedtuple(
    'DockerImage',
    'repo tag hash created vsize')
25
# Options that belong to arv-keepdocker itself.  Help is disabled here
# because this parser is only used as a parent of arg_parser below.
keepdocker_parser = argparse.ArgumentParser(add_help=False)
keepdocker_parser.add_argument(
    '-f', '--force',
    action='store_true',
    default=False,
    help="Re-upload the image even if it already exists on the server")

# --pull and --no-pull both store into `pull` and cannot be combined.
_group = keepdocker_parser.add_mutually_exclusive_group()
_group.add_argument(
    '--pull',
    action='store_true',
    default=False,
    help="Try to pull the latest image from Docker registry")
_group.add_argument(
    '--no-pull',
    action='store_false',
    dest='pull',
    help="Use locally installed image only, don't pull image from Docker registry (default)")

# Both positionals are optional; main() lists the known images when no
# image is given.
keepdocker_parser.add_argument(
    'image',
    nargs='?',
    help="Docker image to upload, as a repository name or hash")
keepdocker_parser.add_argument(
    'tag',
    nargs='?',
    default='latest',
    help="Tag of the Docker image to upload (default 'latest')")

# Combine keepdocker options listed above with run_opts options of arv-put.
# The options inherited from arv-put include --name, --project-uuid,
# --progress/--no-progress/--batch-progress and --resume/--no-resume.
arg_parser = argparse.ArgumentParser(
    parents=[keepdocker_parser, arv_put.run_opts, arv_cmd.retry_opt],
    description="Upload or list Docker images in Arvados")
52
class DockerError(Exception):
    """Raised when a docker command fails or no single image can be chosen."""
55
56
def popen_docker(cmd, *args, **kwargs):
    """Start a docker subprocess and return its Popen object.

    cmd is the list of docker arguments (without the executable name).
    Extra positional and keyword arguments are passed on to
    subprocess.Popen.  Unless the caller supplies them, stdin is a pipe
    that is closed as soon as the process has started, and stdout is
    sent to this process's stderr.
    """
    caller_owns_stdin = 'stdin' in kwargs
    if not caller_owns_stdin:
        kwargs['stdin'] = subprocess.PIPE
    if 'stdout' not in kwargs:
        kwargs['stdout'] = sys.stderr
    try:
        proc = subprocess.Popen(['docker.io'] + cmd, *args, **kwargs)
    except OSError:
        # No docker.io in $PATH; try the plain `docker` name instead.
        proc = subprocess.Popen(['docker'] + cmd, *args, **kwargs)
    if not caller_owns_stdin:
        # We created the pipe ourselves and have nothing to send.
        proc.stdin.close()
    return proc
68
def check_docker(proc, description):
    """Wait for a docker process to finish and raise DockerError on failure.

    description names the docker subcommand and is only used in the
    error message.
    """
    proc.wait()
    if proc.returncode == 0:
        return
    message = "docker {} returned status code {}".format(
        description, proc.returncode)
    raise DockerError(message)
74
def docker_images():
    """Yield a DockerImage tuple for each locally installed image.

    Runs `docker images --no-trunc` and parses each output line after
    the header.  The first three whitespace-separated words are taken as
    repository, tag and image id, the last two as the size, and whatever
    lies in between as the creation time.

    Raises DockerError (via check_docker) once the output has been fully
    consumed if the docker command exited with a non-zero status.
    """
    list_proc = popen_docker(['images', '--no-trunc'], stdout=subprocess.PIPE)
    try:
        list_output = iter(list_proc.stdout)
        # Skip the header line.  A default is passed so that empty output
        # (for example when docker itself failed) does not raise
        # StopIteration here: inside a generator that would end the
        # iteration early and skip the exit status check below.
        next(list_output, None)
        for line in list_output:
            words = line.split()
            size_index = len(words) - 2
            repo, tag, imageid = words[:3]
            ctime = ' '.join(words[3:size_index])
            vsize = ' '.join(words[size_index:])
            yield DockerImage(repo, tag, imageid, ctime, vsize)
    finally:
        # Also runs when the consumer stops iterating early, so the pipe
        # is not left open.
        list_proc.stdout.close()
    check_docker(list_proc, "images")
89
def find_image_hashes(image_search, image_tag=None):
    """Return the set of full hashes of local images matching the search.

    An image whose repository equals image_search and whose tag equals
    image_tag wins outright: a one-element set with its hash is returned
    as soon as it is seen.  Otherwise the result holds the hash of every
    image whose hash starts with the lower-cased image_search.  The set
    is empty when nothing matches and has several entries when the hash
    prefix is ambiguous.
    """
    hash_prefix = image_search.lower()
    prefix_matches = set()
    for candidate in docker_images():
        exact_repo_tag = (candidate.repo == image_search
                          and candidate.tag == image_tag)
        if exact_repo_tag:
            return set([candidate.hash])
        if candidate.hash.startswith(hash_prefix):
            prefix_matches.add(candidate.hash)
    return prefix_matches
105
def find_one_image_hash(image_search, image_tag=None):
    """Return the hash of the single local image matching the search.

    Raises DockerError when find_image_hashes() reports no match or more
    than one match.
    """
    matches = find_image_hashes(image_search, image_tag)
    match_count = len(matches)
    if match_count == 0:
        raise DockerError("no matching image found")
    if match_count != 1:
        raise DockerError(
            "{} images match {}".format(match_count, image_search))
    return matches.pop()
115
def stat_cache_name(image_file):
    """Return the path of the stat cache that belongs to image_file.

    image_file may be a path string or an object with a `name`
    attribute (such as an open file); the result is that path with
    '.stat' appended.
    """
    image_path = getattr(image_file, 'name', image_file)
    return image_path + '.stat'
118
def pull_image(image_name, image_tag):
    """Run `docker pull` for image_name:image_tag.

    Raises DockerError (via check_docker) if the command exits with a
    non-zero status.
    """
    image_ref = '{}:{}'.format(image_name, image_tag)
    pull_proc = popen_docker(['pull', image_ref])
    check_docker(pull_proc, "pull")
122
def save_image(image_hash, image_file):
    """Write the Docker image image_hash into the open file image_file.

    After `docker save` has succeeded, the file's stat fields are stored
    as JSON next to it (see stat_cache_name) so that a later run can
    tell whether the saved file is still intact and resume from it.
    Failing to write that stat file is not an error.

    Raises DockerError (via check_docker) if `docker save` fails.
    """
    save_proc = popen_docker(['save', image_hash], stdout=image_file)
    check_docker(save_proc, "save")
    image_file.flush()
    stat_path = stat_cache_name(image_file)
    try:
        with open(stat_path, 'w') as statfile:
            stat_fields = tuple(os.fstat(image_file.fileno()))
            json.dump(stat_fields, statfile)
    except STAT_CACHE_ERRORS:
        # We won't resume from this cache.  No big deal.
        pass
134
def prep_image_file(filename):
    """Return (image_file, need_save) for storing a saved Docker image.

    image_file is an open file object.  need_save is True when the
    caller still has to write the image into it, and False when a
    previously saved copy in the cache directory can be reused.

    When arv_cmd.make_home_conf_dir() returns None, a temporary file is
    used and the image always has to be saved.  Otherwise the file lives
    in the cache directory under `filename`, and the cached copy is
    reused only if its modification time and size match the values
    recorded in its stat cache file.
    """
    cache_dir = arv_cmd.make_home_conf_dir(
        os.path.join('.cache', 'arvados', 'docker'), 0o700)
    if cache_dir is None:
        image_file = tempfile.NamedTemporaryFile(suffix='.tar')
        need_save = True
    else:
        file_path = os.path.join(cache_dir, filename)
        try:
            with open(stat_cache_name(file_path)) as statfile:
                prev_stat = json.load(statfile)
            now_stat = os.stat(file_path)
            need_save = any(prev_stat[field] != now_stat[field]
                            for field in [ST_MTIME, ST_SIZE])
        except STAT_CACHE_ERRORS + (AttributeError, IndexError,
                                    KeyError, TypeError):
            # We couldn't compare against old stats.  KeyError and
            # TypeError cover a stat file holding valid JSON that is not
            # a list (e.g. an object, a number or null); treat that like
            # any other unusable cache instead of crashing.
            need_save = True
        # 'w+b' truncates the stale file but keeps it readable, so the
        # caller can read the image back after saving it.
        image_file = open(file_path, 'w+b' if need_save else 'rb')
    return image_file, need_save
156
def make_link(api_client, num_retries, link_class, link_name, **link_attrs):
    """Create a link through api_client and return the API response.

    The link body consists of link_attrs plus 'link_class' and 'name',
    which are set from link_class and link_name.  num_retries is passed
    to the request's execute() call.
    """
    body = dict(link_attrs, link_class=link_class, name=link_name)
    request = api_client.links().create(body=body)
    return request.execute(num_retries=num_retries)
161
def ptimestamp(t):
    """Parse a 'YYYY-MM-DDTHH:MM:SSZ' timestamp string into a datetime.

    If t contains exactly one '.', everything after it except the final
    character is dropped before parsing, which removes fractional
    seconds while keeping the trailing 'Z'.
    """
    pieces = t.split(".")
    if len(pieces) == 2:
        whole_seconds, remainder = pieces
        t = whole_seconds + remainder[-1:]
    return datetime.datetime.strptime(t, "%Y-%m-%dT%H:%M:%SZ")
167
def list_images_in_arv(api_client, num_retries, image_name=None, image_tag=None):
    """List all Docker images known to the api_client with image_name and
    image_tag.  If no image_name is given, defaults to listing all
    Docker images.

    Returns a list of tuples representing matching Docker images,
    sorted in preference order (i.e. the first collection in the list
    is the one that the API server would use). Each tuple is a
    (collection_uuid, collection_info) pair, where collection_info is
    a dict with fields "dockerhash", "repo", "tag", and "timestamp".

    Fields for which no link was found keep the placeholder "<none>".
    The list is ordered by timestamp, newest first.
    """
    docker_image_filters = [['link_class', 'in', ['docker_image_hash', 'docker_image_repo+tag']]]
    if image_name:
        image_link_name = "{}:{}".format(image_name, image_tag or 'latest')
        docker_image_filters.append(['name', '=', image_link_name])

    existing_links = api_client.links().list(
        filters=docker_image_filters
        ).execute(num_retries=num_retries)['items']

    # Merge all links that point at the same collection into one record.
    images = {}
    for link in existing_links:
        collection_uuid = link["head_uuid"]
        if collection_uuid not in images:
            images[collection_uuid] = {
                "dockerhash": "<none>",
                "repo": "<none>",
                "tag": "<none>",
                "timestamp": ptimestamp("1970-01-01T00:00:01Z"),
            }
        info = images[collection_uuid]

        if link["link_class"] == "docker_image_hash":
            info["dockerhash"] = link["name"]

        if link["link_class"] == "docker_image_repo+tag":
            # Split on the last colon only: a repository name may itself
            # contain one (a registry host with a port), and the tag is
            # whatever follows the final colon.
            r = link["name"].rsplit(":", 1)
            info["repo"] = r[0]
            if len(r) > 1:
                info["tag"] = r[1]

        # Prefer the image's own creation time when the link records it.
        if "image_timestamp" in link["properties"]:
            info["timestamp"] = ptimestamp(link["properties"]["image_timestamp"])
        else:
            info["timestamp"] = ptimestamp(link["created_at"])

    # key= with reverse=True yields the same stable newest-first order as
    # the old cmp-function sort, without relying on the Python 2 only
    # cmp() builtin and positional cmp argument.
    return sorted(images.items(),
                  key=lambda item: item[1]["timestamp"],
                  reverse=True)
212
213
def main(arguments=None):
    """Command-line entry point: list Docker images or upload one to Arvados.

    arguments is the list of command-line arguments to parse; None is
    passed on to argparse unchanged.

    With no image argument (or the literal image name 'images'), prints
    a table of the Docker images known to Arvados and exits.

    Otherwise the named image is looked up locally (after an optional
    `docker pull`).  Unless --force is given, an existing upload of the
    same image hash is reused: missing collection and links are created
    for the target project, the collection UUID is printed and the
    program exits.  If there is nothing to reuse, the image is saved to
    a file, uploaded via arv-put, and docker_image_hash /
    docker_image_repo+tag links are created for the new collection.

    Exits with status 1 if no single local image matches.
    """
    args = arg_parser.parse_args(arguments)
    api = arvados.api('v1')

    if args.image is None or args.image == 'images':
        fmt = "{:30}  {:10}  {:12}  {:29}  {:20}"
        print(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
        for i, j in list_images_in_arv(api, args.retries):
            print(fmt.format(j["repo"], j["tag"], j["dockerhash"][0:12], i, j["timestamp"].strftime("%c")))
        sys.exit(0)

    # Pull the image if requested, unless the image is specified as a hash
    # that we already have.
    if args.pull and not find_image_hashes(args.image):
        pull_image(args.image, args.tag)

    try:
        image_hash = find_one_image_hash(args.image, args.tag)
    except DockerError as error:
        sys.stderr.write("arv-keepdocker: {}\n".format(error))
        sys.exit(1)

    # When the image was given as a hash prefix there is no repo:tag name
    # to record for it.
    image_repo_tag = '{}:{}'.format(args.image, args.tag) if not image_hash.startswith(args.image.lower()) else None

    if args.name is None:
        if image_repo_tag:
            collection_name = 'Docker image {} {}'.format(image_repo_tag, image_hash[0:12])
        else:
            collection_name = 'Docker image {}'.format(image_hash[0:12])
    else:
        collection_name = args.name

    if not args.force:
        # Check if this image is already in Arvados.

        # Project where everything should be owned
        if args.project_uuid:
            parent_project_uuid = args.project_uuid
        else:
            parent_project_uuid = api.users().current().execute(
                num_retries=args.retries)['uuid']

        # Find image hash tags
        existing_links = api.links().list(
            filters=[['link_class', '=', 'docker_image_hash'],
                     ['name', '=', image_hash]]
            ).execute(num_retries=args.retries)['items']
        if existing_links:
            # get readable collections
            collections = api.collections().list(
                filters=[['uuid', 'in', [link['head_uuid'] for link in existing_links]]],
                select=["uuid", "owner_uuid", "name", "manifest_text"]
                ).execute(num_retries=args.retries)['items']

            if collections:
                # check for repo+tag links on these collections.  The
                # head_uuid filter must be given the collection UUIDs,
                # not the full collection records.
                collection_uuids = [c['uuid'] for c in collections]
                existing_repo_tag = (api.links().list(
                    filters=[['link_class', '=', 'docker_image_repo+tag'],
                             ['name', '=', image_repo_tag],
                             ['head_uuid', 'in', collection_uuids]]
                    ).execute(num_retries=args.retries)['items']) if image_repo_tag else []

                # Filter on elements owned by the parent project
                owned_col = [c for c in collections if c['owner_uuid'] == parent_project_uuid]
                owned_img = [c for c in existing_links if c['owner_uuid'] == parent_project_uuid]
                owned_rep = [c for c in existing_repo_tag if c['owner_uuid'] == parent_project_uuid]

                if owned_col:
                    # already have a collection owned by this project
                    coll_uuid = owned_col[0]['uuid']
                else:
                    # create new collection owned by the project
                    coll_uuid = api.collections().create(
                        body={"manifest_text": collections[0]['manifest_text'],
                              "name": collection_name,
                              "owner_uuid": parent_project_uuid},
                        ensure_unique_name=True
                        ).execute(num_retries=args.retries)['uuid']

                link_base = {'owner_uuid': parent_project_uuid,
                             'head_uuid':  coll_uuid }

                if not owned_img:
                    # create image link owned by the project
                    make_link(api, args.retries,
                              'docker_image_hash', image_hash, **link_base)

                if not owned_rep and image_repo_tag:
                    # create repo+tag link owned by the project
                    make_link(api, args.retries, 'docker_image_repo+tag',
                              image_repo_tag, **link_base)

                print(coll_uuid)

                sys.exit(0)

    # Open a file for the saved image, and write it if needed.
    outfile_name = '{}.tar'.format(image_hash)
    image_file, need_save = prep_image_file(outfile_name)
    if need_save:
        save_image(image_hash, image_file)

    # Call arv-put with switches we inherited from it
    # (a.k.a., switches that aren't our own).
    put_args = keepdocker_parser.parse_known_args(arguments)[1]

    if args.name is None:
        put_args += ['--name', collection_name]

    coll_uuid = arv_put.main(
        put_args + ['--filename', outfile_name, image_file.name]).strip()

    # Read the image metadata and make Arvados links from it.
    image_file.seek(0)
    image_tar = tarfile.open(fileobj=image_file)
    json_file = image_tar.extractfile(image_tar.getmember(image_hash + '/json'))
    image_metadata = json.load(json_file)
    json_file.close()
    image_tar.close()
    link_base = {'head_uuid': coll_uuid, 'properties': {}}
    if 'created' in image_metadata:
        link_base['properties']['image_timestamp'] = image_metadata['created']
    if args.project_uuid is not None:
        link_base['owner_uuid'] = args.project_uuid

    make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
    if image_repo_tag:
        make_link(api, args.retries,
                  'docker_image_repo+tag', image_repo_tag, **link_base)

    # Clean up.  The saved image and its stat cache are only removed
    # after a successful upload; a file that is already gone is fine.
    image_file.close()
    for filename in [stat_cache_name(image_file), image_file.name]:
        try:
            os.unlink(filename)
        except OSError as error:
            if error.errno != errno.ENOENT:
                raise
352
# Run the command-line entry point when this file is executed as a script.
if __name__ == '__main__':
    main()