Merge branch '3609-arv-ws' refs #3609
[arvados.git] / sdk / python / arvados / commands / keepdocker.py
1 #!/usr/bin/env python
2
3 import argparse
4 import datetime
5 import errno
6 import json
7 import os
8 import subprocess
9 import sys
10 import tarfile
11 import tempfile
12
13 from collections import namedtuple
14 from stat import *
15
16 import arvados
17 import arvados.commands._util as arv_cmd
18 import arvados.commands.put as arv_put
19
# Exceptions raised while reading or writing the stat cache.  Callers treat
# any of them as "no usable cache" instead of a fatal error.
STAT_CACHE_ERRORS = (IOError, OSError, ValueError)

# One parsed row of `docker images` output.
DockerImage = namedtuple('DockerImage', 'repo tag hash created vsize')
24
# Options that belong to arv-keepdocker itself.  Help is disabled so that
# this parser can be used as a parent of another parser.
keepdocker_parser = argparse.ArgumentParser(add_help=False)
keepdocker_parser.add_argument(
    '-f', '--force',
    action='store_true',
    default=False,
    help="Re-upload the image even if it already exists on the server")

# --pull and --no-pull store into the same destination and may not be
# given together.
_group = keepdocker_parser.add_mutually_exclusive_group()
_group.add_argument(
    '--pull',
    action='store_true',
    default=False,
    help="Try to pull the latest image from Docker registry")
_group.add_argument(
    '--no-pull',
    dest='pull',
    action='store_false',
    help="Use locally installed image only, don't pull image from Docker registry (default)")

# Both positional arguments are optional.
keepdocker_parser.add_argument(
    'image',
    nargs='?',
    help="Docker image to upload, as a repository name or hash")
keepdocker_parser.add_argument(
    'tag',
    nargs='?',
    default='latest',
    help="Tag of the Docker image to upload (default 'latest')")
44
# Full command-line parser: the keepdocker options plus the run_opts
# options of arv-put (including --name, --project-uuid,
# --progress/--no-progress/--batch-progress and --resume/--no-resume)
# and the retry option from arvados.commands._util.
arg_parser = argparse.ArgumentParser(
    parents=[keepdocker_parser, arv_put.run_opts, arv_cmd.retry_opt],
    description="Upload or list Docker images in Arvados")
51
class DockerError(Exception):
    """Raised when a docker command fails or no single image can be chosen."""
54
55
def popen_docker(cmd, *args, **kwargs):
    """Start a docker subprocess and return its Popen object.

    cmd is the list of docker arguments, without the executable name.
    Any further positional and keyword arguments are handed to
    subprocess.Popen unchanged.  Unless the caller supplies them, stdin is
    a pipe that is closed as soon as the process has started, and stdout
    is redirected to this process's stderr.
    """
    caller_supplied_stdin = 'stdin' in kwargs
    kwargs.setdefault('stdin', subprocess.PIPE)
    kwargs.setdefault('stdout', sys.stderr)
    primary_argv = ['docker.io'] + cmd
    fallback_argv = ['docker'] + cmd
    try:
        docker_proc = subprocess.Popen(primary_argv, *args, **kwargs)
    except OSError:
        # No docker.io in $PATH; try the plain `docker` name instead.
        docker_proc = subprocess.Popen(fallback_argv, *args, **kwargs)
    if not caller_supplied_stdin:
        # We created the pipe ourselves and have nothing to send.
        docker_proc.stdin.close()
    return docker_proc
67
def check_docker(proc, description):
    """Wait for a docker process and raise DockerError if it failed.

    proc is the process object to wait on; description names the docker
    subcommand and is only used in the error message.
    """
    proc.wait()
    if proc.returncode == 0:
        return
    message = "docker {} returned status code {}".format(
        description, proc.returncode)
    raise DockerError(message)
73
def docker_images():
    """Yield a DockerImage tuple for each image docker lists.

    Runs `docker images --no-trunc` and parses each whitespace-separated
    output row.  Once every row has been consumed, raises DockerError
    (through check_docker) if the command exited with a non-zero status.
    """
    list_proc = popen_docker(['images', '--no-trunc'], stdout=subprocess.PIPE)
    try:
        list_output = iter(list_proc.stdout)
        # Skip the header line.  The default value matters: with no output
        # at all (e.g. docker failed right away) a bare next() would raise
        # StopIteration inside this generator, ending it before
        # check_docker below could report the failure.
        next(list_output, None)
        for line in list_output:
            words = line.split()
            # The last two words are taken as the size (presumably a value
            # and its unit); everything between the image id and the size
            # is the creation time.
            size_index = len(words) - 2
            repo, tag, imageid = words[:3]
            ctime = ' '.join(words[3:size_index])
            vsize = ' '.join(words[size_index:])
            yield DockerImage(repo, tag, imageid, ctime, vsize)
    finally:
        # Also runs when the generator is closed before it is exhausted
        # (a consumer that returns on the first match), so the pipe is
        # closed and the child process is reaped in every case.
        list_proc.stdout.close()
        list_proc.wait()
    check_docker(list_proc, "images")
88
def find_image_hashes(image_search, image_tag=None):
    """Return the set of full hashes of local images matching a search.

    An image whose repository equals image_search and whose tag equals
    image_tag takes priority: a one-element set holding its hash is
    returned as soon as it is seen.  Otherwise the result is the set of
    every image hash that starts with the lowercased image_search.  That
    set is empty when nothing matches and holds several hashes when the
    prefix is ambiguous.
    """
    hash_prefix = image_search.lower()
    prefix_matches = set()
    for image in docker_images():
        if image.repo == image_search and image.tag == image_tag:
            return {image.hash}
        if image.hash.startswith(hash_prefix):
            prefix_matches.add(image.hash)
    return prefix_matches
104
def find_one_image_hash(image_search, image_tag=None):
    """Return the hash of the one image matching the search.

    Raises DockerError when no image matches or when more than one does.
    """
    hashes = find_image_hashes(image_search, image_tag)
    match_count = len(hashes)
    if match_count == 0:
        raise DockerError("no matching image found")
    if match_count > 1:
        raise DockerError(
            "{} images match {}".format(match_count, image_search))
    return hashes.pop()
114
def stat_cache_name(image_file):
    """Return the path of the stat cache file that goes with an image file.

    image_file is either a path string or an object with a `name`
    attribute (such as an open file).
    """
    try:
        image_path = image_file.name
    except AttributeError:
        image_path = image_file
    return image_path + '.stat'
117
def pull_image(image_name, image_tag):
    """Run `docker pull` for image_name:image_tag.

    Raises DockerError if the command exits with a non-zero status.
    """
    image_ref = '{}:{}'.format(image_name, image_tag)
    pull_proc = popen_docker(['pull', image_ref])
    check_docker(pull_proc, "pull")
121
def save_image(image_hash, image_file):
    """Write a Docker image into image_file and record the file's stats.

    Runs `docker save` with image_file as its stdout and raises
    DockerError if that fails.  Afterwards the file's stat tuple is
    written as JSON next to it, so a later run can tell whether the saved
    image is still intact and resume after an interruption.
    """
    save_proc = popen_docker(['save', image_hash], stdout=image_file)
    check_docker(save_proc, "save")
    image_file.flush()
    try:
        with open(stat_cache_name(image_file), 'w') as statfile:
            current_stat = tuple(os.fstat(image_file.fileno()))
            json.dump(current_stat, statfile)
    except STAT_CACHE_ERRORS:
        # Best effort only: without the stats we simply won't resume
        # from this cache.
        pass
133
def prep_image_file(filename):
    """Return (image_file, need_save) for a Docker image tarball.

    image_file is an open file object ready to hold the saved image.
    need_save is False only when a cached copy exists whose modification
    time and size still match the stats recorded when it was written;
    in that case the file is opened read-only.
    """
    cache_dir = arv_cmd.make_home_conf_dir(
        os.path.join('.cache', 'arvados', 'docker'), 0o700)
    if cache_dir is None:
        # make_home_conf_dir gave us no directory (presumably it could not
        # be created), so fall back to a temporary file.
        image_file = tempfile.NamedTemporaryFile(suffix='.tar')
        need_save = True
    else:
        file_path = os.path.join(cache_dir, filename)
        try:
            with open(stat_cache_name(file_path)) as statfile:
                prev_stat = json.load(statfile)
            now_stat = os.stat(file_path)
            need_save = any(prev_stat[field] != now_stat[field]
                            for field in [ST_MTIME, ST_SIZE])
        except STAT_CACHE_ERRORS + (AttributeError, IndexError,
                                    KeyError, TypeError):
            # We couldn't compare against old stats.  KeyError and
            # TypeError cover a cache file holding valid JSON that is not
            # a list (an object, null, a number): that is a cache miss,
            # not a crash.
            need_save = True
        image_file = open(file_path, 'w+b' if need_save else 'rb')
    return image_file, need_save
155
def make_link(api_client, num_retries, link_class, link_name, **link_attrs):
    """Create a link through the API client and return the response.

    link_class and link_name become the link's 'link_class' and 'name'
    attributes; every extra keyword argument is sent along as a further
    attribute of the link.
    """
    link_body = dict(link_attrs, link_class=link_class, name=link_name)
    create_request = api_client.links().create(body=link_body)
    return create_request.execute(num_retries=num_retries)
160
def ptimestamp(t):
    """Parse a 'YYYY-MM-DDTHH:MM:SSZ' timestamp into a datetime.

    If the string contains exactly one '.', everything after it except
    its final character is dropped first, so a fractional-seconds part
    such as '.665397279Z' is reduced to 'Z' before parsing.
    """
    if t.count(".") == 1:
        whole, fraction = t.split(".")
        t = whole + fraction[-1:]
    return datetime.datetime.strptime(t, "%Y-%m-%dT%H:%M:%SZ")
166
def list_images_in_arv(api_client, num_retries):
    """Print a table of the Docker images recorded in Arvados, newest first.

    Fetches every link of class 'docker_image_hash' or
    'docker_image_repo+tag', groups them by the collection they point at
    (head_uuid), and prints one row per collection with repository, tag,
    the first 12 characters of the image hash, collection UUID and
    timestamp.
    """
    existing_links = api_client.links().list(
        filters=[['link_class', 'in', ['docker_image_hash', 'docker_image_repo+tag']]]
        ).execute(num_retries=num_retries)['items']
    images = {}
    for link in existing_links:
        collection_uuid = link["head_uuid"]
        if collection_uuid not in images:
            images[collection_uuid] = {
                "dockerhash": "<none>",
                "repo": "<none>",
                "tag": "<none>",
                "timestamp": ptimestamp("1970-01-01T00:00:01Z"),
            }
        image = images[collection_uuid]

        if link["link_class"] == "docker_image_hash":
            image["dockerhash"] = link["name"]
        elif link["link_class"] == "docker_image_repo+tag":
            # Split on the LAST colon only: a repository may itself contain
            # a colon (registry host with a port, "host:5000/repo:tag").
            repo, colon, tag = link["name"].rpartition(":")
            if colon:
                image["repo"] = repo
                image["tag"] = tag
            else:
                image["repo"] = link["name"]

        # Prefer the image's own timestamp when the link carries one;
        # otherwise use the time the link was created.
        if "image_timestamp" in link["properties"]:
            image["timestamp"] = ptimestamp(link["properties"]["image_timestamp"])
        else:
            image["timestamp"] = ptimestamp(link["created_at"])

    # Newest first.  A key function with reverse=True keeps the stable
    # ordering of equal timestamps and works on Python 2 and 3 alike.
    newest_first = sorted(images.items(),
                          key=lambda item: item[1]["timestamp"],
                          reverse=True)

    fmt = "{:30}  {:10}  {:12}  {:29}  {:20}"
    print(fmt.format("REPOSITORY", "TAG", "IMAGE ID", "COLLECTION", "CREATED"))
    for collection_uuid, image in newest_first:
        print(fmt.format(image["repo"], image["tag"],
                         image["dockerhash"][0:12], collection_uuid,
                         image["timestamp"].strftime("%c")))
200
def main(arguments=None):
    """Command-line entry point: upload or list Docker images in Arvados.

    arguments is the list of command-line strings to parse; None lets
    argparse read them from sys.argv.

    Steps:
      1. With no image argument (or the word 'images'), print the images
         already recorded in Arvados and exit.
      2. Optionally pull the image, then resolve it to one local hash.
      3. Unless --force is given, look for a readable collection already
         linked to that hash.  If one exists, make sure the target
         project owns a collection and the links, print the collection
         UUID and exit without uploading.
      4. Otherwise save the image to a tar file, upload it with arv-put,
         create the hash and repo+tag links, and delete the saved tar
         file and its stat cache.

    Exits with status 1 if no single matching Docker image is found.
    """
    args = arg_parser.parse_args(arguments)
    api = arvados.api('v1')

    if args.image is None or args.image == 'images':
        list_images_in_arv(api, args.retries)
        sys.exit(0)

    # Pull the image if requested, unless the image is specified as a hash
    # that we already have.
    if args.pull and not find_image_hashes(args.image):
        pull_image(args.image, args.tag)

    try:
        image_hash = find_one_image_hash(args.image, args.tag)
    except DockerError as error:
        sys.stderr.write("arv-keepdocker: {}\n".format(error))
        sys.exit(1)

    # When the image was named by (a prefix of) its hash there is no
    # repository:tag pair to record.
    if image_hash.startswith(args.image.lower()):
        image_repo_tag = None
    else:
        image_repo_tag = '{}:{}'.format(args.image, args.tag)

    if args.name is None:
        if image_repo_tag:
            collection_name = 'Docker image {} {}'.format(image_repo_tag, image_hash[0:12])
        else:
            collection_name = 'Docker image {}'.format(image_hash[0:12])
    else:
        collection_name = args.name

    if not args.force:
        # Check if this image is already in Arvados.

        # Project where everything should be owned
        if args.project_uuid:
            parent_project_uuid = args.project_uuid
        else:
            parent_project_uuid = api.users().current().execute(
                num_retries=args.retries)['uuid']

        # Find image hash tags
        existing_links = api.links().list(
            filters=[['link_class', '=', 'docker_image_hash'],
                     ['name', '=', image_hash]]
            ).execute(num_retries=args.retries)['items']
        if existing_links:
            # get readable collections
            collections = api.collections().list(
                filters=[['uuid', 'in', [link['head_uuid'] for link in existing_links]]],
                select=["uuid", "owner_uuid", "name", "manifest_text"]
                ).execute(num_retries=args.retries)['items']

            if collections:
                # Check for repo+tag links on these collections.  The
                # filter operand must be a list of UUIDs, not the whole
                # collection records (which carry the manifest text).
                collection_uuids = [c['uuid'] for c in collections]
                existing_repo_tag = (api.links().list(
                    filters=[['link_class', '=', 'docker_image_repo+tag'],
                             ['name', '=', image_repo_tag],
                             ['head_uuid', 'in', collection_uuids]]
                    ).execute(num_retries=args.retries)['items']) if image_repo_tag else []

                # Filter on elements owned by the parent project
                owned_col = [c for c in collections if c['owner_uuid'] == parent_project_uuid]
                owned_img = [c for c in existing_links if c['owner_uuid'] == parent_project_uuid]
                owned_rep = [c for c in existing_repo_tag if c['owner_uuid'] == parent_project_uuid]

                if owned_col:
                    # already have a collection owned by this project
                    coll_uuid = owned_col[0]['uuid']
                else:
                    # create new collection owned by the project
                    coll_uuid = api.collections().create(
                        body={"manifest_text": collections[0]['manifest_text'],
                              "name": collection_name,
                              "owner_uuid": parent_project_uuid},
                        ensure_unique_name=True
                        ).execute(num_retries=args.retries)['uuid']

                link_base = {'owner_uuid': parent_project_uuid,
                             'head_uuid':  coll_uuid }

                if not owned_img:
                    # create image link owned by the project
                    make_link(api, args.retries,
                              'docker_image_hash', image_hash, **link_base)

                if not owned_rep and image_repo_tag:
                    # create repo+tag link owned by the project
                    make_link(api, args.retries, 'docker_image_repo+tag',
                              image_repo_tag, **link_base)

                print(coll_uuid)

                sys.exit(0)

    # Open a file for the saved image, and write it if needed.
    outfile_name = '{}.tar'.format(image_hash)
    image_file, need_save = prep_image_file(outfile_name)
    if need_save:
        save_image(image_hash, image_file)

    # Call arv-put with switches we inherited from it
    # (a.k.a., switches that aren't our own).
    put_args = keepdocker_parser.parse_known_args(arguments)[1]

    if args.name is None:
        put_args += ['--name', collection_name]

    coll_uuid = arv_put.main(
        put_args + ['--filename', outfile_name, image_file.name]).strip()

    # Read the image metadata and make Arvados links from it.
    image_file.seek(0)
    image_tar = tarfile.open(fileobj=image_file)
    json_file = image_tar.extractfile(image_tar.getmember(image_hash + '/json'))
    image_metadata = json.load(json_file)
    json_file.close()
    image_tar.close()
    link_base = {'head_uuid': coll_uuid, 'properties': {}}
    if 'created' in image_metadata:
        link_base['properties']['image_timestamp'] = image_metadata['created']
    if args.project_uuid is not None:
        link_base['owner_uuid'] = args.project_uuid

    make_link(api, args.retries, 'docker_image_hash', image_hash, **link_base)
    if image_repo_tag:
        make_link(api, args.retries,
                  'docker_image_repo+tag', image_repo_tag, **link_base)

    # Clean up.  This is only reached after a successful upload, so an
    # interrupted run keeps the saved tar file and its stat cache.
    image_file.close()
    for filename in [stat_cache_name(image_file), image_file.name]:
        try:
            os.unlink(filename)
        except OSError as error:
            # A file that is already gone is fine; anything else is not.
            if error.errno != errno.ENOENT:
                raise

if __name__ == '__main__':
    main()