#!/usr/bin/env python

# TODO:
# --md5sum - display md5 of each file as read from disk

import argparse
import arvados
import base64
import datetime
import errno
import fcntl
import hashlib
import json
import os
import pwd
import signal
import socket
import sys
import tempfile
from apiclient import errors as apiclient_errors

import arvados.commands._util as arv_cmd

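# Signals trapped while arv-put is running; exit_signal_handler converts them
# into a SystemExit so main() can unwind instead of dying abruptly.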
CAUGHT_SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]
api_client = None

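# Command-line options are split across two parsers: upload_opts describes
# what to upload and how to store it, run_opts describes this particular run
# (project, naming, progress, resume).  arg_parser below combines both.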
upload_opts = argparse.ArgumentParser(add_help=False)

upload_opts.add_argument('paths', metavar='path', type=str, nargs='*',
                         help="""
Local file or directory. Default: read from standard input.
""")

_group = upload_opts.add_mutually_exclusive_group()

_group.add_argument('--max-manifest-depth', type=int, metavar='N',
                    default=-1, help="""
Maximum depth of directory tree to represent in the manifest
structure. A directory structure deeper than this will be represented
as a single stream in the manifest. If N=0, the manifest will contain
a single stream. Default: -1 (unlimited), i.e., exactly one manifest
stream per filesystem directory that contains files.
""")

_group.add_argument('--normalize', action='store_true',
                    help="""
Normalize the manifest by re-ordering files and streams after writing
data.
""")

_group = upload_opts.add_mutually_exclusive_group()

_group.add_argument('--as-stream', action='store_true', dest='stream',
                    help="""
Synonym for --stream.
""")

_group.add_argument('--stream', action='store_true',
                    help="""
Store the file content and display the resulting manifest on
stdout. Do not write the manifest to Keep or save a Collection object
in Arvados.
""")

_group.add_argument('--as-manifest', action='store_true', dest='manifest',
                    help="""
Synonym for --manifest.
""")

_group.add_argument('--in-manifest', action='store_true', dest='manifest',
                    help="""
Synonym for --manifest.
""")

_group.add_argument('--manifest', action='store_true',
                    help="""
Store the file data and resulting manifest in Keep, save a Collection
object in Arvados, and display the manifest locator (Collection uuid)
on stdout. This is the default behavior.
""")

_group.add_argument('--as-raw', action='store_true', dest='raw',
                    help="""
Synonym for --raw.
""")

_group.add_argument('--raw', action='store_true',
                    help="""
Store the file content and display the data block locators on stdout,
separated by commas, with a trailing newline. Do not store a
manifest.
""")

upload_opts.add_argument('--use-filename', type=str, default=None,
                         dest='filename', help="""
Synonym for --filename.
""")

upload_opts.add_argument('--filename', type=str, default=None,
                         help="""
Use the given filename in the manifest, instead of the name of the
local file. This is useful when "-" or "/dev/stdin" is given as an
input file. It can be used only if there is exactly one path given and
it is not a directory. Implies --manifest.
""")

upload_opts.add_argument('--portable-data-hash', action='store_true',
                         help="""
Print the portable data hash instead of the Arvados UUID for the collection
created by the upload.
""")

upload_opts.add_argument('--replication', type=int, metavar='N', default=None,
                         help="""
Set the replication level for the new collection: how many different
physical storage devices (e.g., disks) should have a copy of each data
block. Default is to use the server-provided default (if any) or 2.
""")

run_opts = argparse.ArgumentParser(add_help=False)

run_opts.add_argument('--project-uuid', metavar='UUID', help="""
Store the collection in the specified project, instead of your Home
project.
""")

run_opts.add_argument('--name', help="""
Save the collection with the specified name.
""")

_group = run_opts.add_mutually_exclusive_group()
_group.add_argument('--progress', action='store_true',
                    help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when
stderr is a tty.
""")

_group.add_argument('--no-progress', action='store_true',
                    help="""
Do not display human-readable progress on stderr, even if stderr is a
tty.
""")

_group.add_argument('--batch-progress', action='store_true',
                    help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")

_group = run_opts.add_mutually_exclusive_group()
_group.add_argument('--resume', action='store_true', default=True,
                    help="""
Continue interrupted uploads from cached state (default).
""")
_group.add_argument('--no-resume', action='store_false', dest='resume',
                    help="""
Do not continue interrupted uploads from cached state.
""")

arg_parser = argparse.ArgumentParser(
    description='Copy data from the local filesystem to Keep.',
    parents=[upload_opts, run_opts, arv_cmd.retry_opt])

def parse_arguments(arguments):
    args = arg_parser.parse_args(arguments)

    if len(args.paths) == 0:
        args.paths += ['/dev/stdin']

    if len(args.paths) != 1 or os.path.isdir(args.paths[0]):
        if args.filename:
            arg_parser.error("""
    --filename argument cannot be used when storing a directory or
    multiple files.
    """)

    # Turn on --progress by default if stderr is a tty.
    if (not (args.batch_progress or args.no_progress)
        and os.isatty(sys.stderr.fileno())):
        args.progress = True

    if args.paths == ['-']:
        args.paths = ['/dev/stdin']
        if not args.filename:
            args.filename = '-'

    return args

class ResumeCacheConflict(Exception):
    pass


class ResumeCache(object):
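    # A JSON file under ~/.cache/arvados/arv-put that records the writer's
    # state, so an interrupted upload can be resumed later.  The file is
    # flock()ed while in use to keep concurrent arv-put runs from clobbering
    # each other's state.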
    CACHE_DIR = '.cache/arvados/arv-put'

    def __init__(self, file_spec):
        self.cache_file = open(file_spec, 'a+')
        self._lock_file(self.cache_file)
        self.filename = self.cache_file.name

    @classmethod
    def make_path(cls, args):
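        # Build a cache file path unique to this upload: an MD5 of the API
        # host, the sorted real paths, and (for directory uploads) the
        # --max-manifest-depth setting or (for single files) --filename.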
        md5 = hashlib.md5()
        md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost'))
        realpaths = sorted(os.path.realpath(path) for path in args.paths)
        md5.update('\0'.join(realpaths))
        if any(os.path.isdir(path) for path in realpaths):
            md5.update(str(max(args.max_manifest_depth, -1)))
        elif args.filename:
            md5.update(args.filename)
        return os.path.join(
            arv_cmd.make_home_conf_dir(cls.CACHE_DIR, 0o700, 'raise'),
            md5.hexdigest())

    def _lock_file(self, fileobj):
        try:
            fcntl.flock(fileobj, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            raise ResumeCacheConflict("{} locked".format(fileobj.name))

    def load(self):
        self.cache_file.seek(0)
        return json.load(self.cache_file)

    def save(self, data):
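        # Write the new state to a temporary file in the same directory and
        # rename it over the old cache file, so the cache is replaced
        # atomically.  Failures are swallowed: caching is best-effort.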
        try:
            new_cache_fd, new_cache_name = tempfile.mkstemp(
                dir=os.path.dirname(self.filename))
            self._lock_file(new_cache_fd)
            new_cache = os.fdopen(new_cache_fd, 'r+')
            json.dump(data, new_cache)
            os.rename(new_cache_name, self.filename)
        except (IOError, OSError, ResumeCacheConflict) as error:
            try:
                os.unlink(new_cache_name)
            except NameError:  # mkstemp failed.
                pass
        else:
            self.cache_file.close()
            self.cache_file = new_cache

    def close(self):
        self.cache_file.close()

    def destroy(self):
        try:
            os.unlink(self.filename)
        except OSError as error:
            if error.errno != errno.ENOENT:  # That's what we wanted anyway.
                raise
        self.close()

    def restart(self):
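        # Throw away the cached state and reopen a fresh cache file at the
        # same path.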
        self.destroy()
        self.__init__(self.filename)


class ArvPutCollectionWriter(arvados.ResumableCollectionWriter):
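    # A ResumableCollectionWriter that also tracks bytes written and inputs
    # already handled, reports progress through a reporter callback, and
    # checkpoints its state to a ResumeCache so an interrupted upload can be
    # resumed.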
    STATE_PROPS = (arvados.ResumableCollectionWriter.STATE_PROPS +
                   ['bytes_written', '_seen_inputs'])

    def __init__(self, cache=None, reporter=None, bytes_expected=None, **kwargs):
        self.bytes_written = 0
        self._seen_inputs = []
        self.cache = cache
        self.reporter = reporter
        self.bytes_expected = bytes_expected
        super(ArvPutCollectionWriter, self).__init__(**kwargs)

    @classmethod
    def from_cache(cls, cache, reporter=None, bytes_expected=None,
                   num_retries=0, replication=0):
        try:
            state = cache.load()
            state['_data_buffer'] = [base64.decodestring(state['_data_buffer'])]
            writer = cls.from_state(state, cache, reporter, bytes_expected,
                                    num_retries=num_retries,
                                    replication=replication)
        except (TypeError, ValueError,
                arvados.errors.StaleWriterStateError) as error:
            return cls(cache, reporter, bytes_expected, num_retries=num_retries)
        else:
            return writer

    def cache_state(self):
        if self.cache is None:
            return
        state = self.dump_state()
        # Transform attributes for serialization.
        for attr, value in state.items():
            if attr == '_data_buffer':
                state[attr] = base64.encodestring(''.join(value))
            elif hasattr(value, 'popleft'):
                state[attr] = list(value)
        self.cache.save(state)

    def report_progress(self):
        if self.reporter is not None:
            self.reporter(self.bytes_written, self.bytes_expected)

    def flush_data(self):
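        # Flush buffered data to Keep, then update the running byte count,
        # report progress, and checkpoint the cache whenever a full Keep
        # block boundary has been crossed.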
        start_buffer_len = self._data_buffer_len
        start_block_count = self.bytes_written / self.KEEP_BLOCK_SIZE
        super(ArvPutCollectionWriter, self).flush_data()
        if self._data_buffer_len < start_buffer_len:  # We actually PUT data.
            self.bytes_written += (start_buffer_len - self._data_buffer_len)
            self.report_progress()
            if (self.bytes_written / self.KEEP_BLOCK_SIZE) > start_block_count:
                self.cache_state()

    def _record_new_input(self, input_type, source_name, dest_name):
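        # Return True the first time a given input is seen, False if it was
        # already recorded.  _seen_inputs is part of the cached state, so a
        # resumed upload skips inputs that were already queued.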
        # The key needs to be a list because that's what we'll get back
        # from JSON deserialization.
        key = [input_type, source_name, dest_name]
        if key in self._seen_inputs:
            return False
        self._seen_inputs.append(key)
        return True

    def write_file(self, source, filename=None):
        if self._record_new_input('file', source, filename):
            super(ArvPutCollectionWriter, self).write_file(source, filename)

    def write_directory_tree(self,
                             path, stream_name='.', max_manifest_depth=-1):
        if self._record_new_input('directory', path, stream_name):
            super(ArvPutCollectionWriter, self).write_directory_tree(
                path, stream_name, max_manifest_depth)


def expected_bytes_for(pathlist):
    # Walk the given directory trees and stat files, adding up file sizes,
    # so we can display progress as percent
    bytesum = 0
    for path in pathlist:
        if os.path.isdir(path):
            for filename in arvados.util.listdir_recursive(path):
                bytesum += os.path.getsize(os.path.join(path, filename))
        elif not os.path.isfile(path):
            return None
        else:
            bytesum += os.path.getsize(path)
    return bytesum

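# Each machine-readable progress line has the form
# "<program> <pid>: <bytes written> written <bytes expected or -1> total".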
_machine_format = "{} {}: {{}} written {{}} total\n".format(sys.argv[0],
                                                            os.getpid())
def machine_progress(bytes_written, bytes_expected):
    return _machine_format.format(
        bytes_written, -1 if (bytes_expected is None) else bytes_expected)

def human_progress(bytes_written, bytes_expected):
    if bytes_expected:
        return "\r{}M / {}M {:.1%} ".format(
            bytes_written >> 20, bytes_expected >> 20,
            float(bytes_written) / bytes_expected)
    else:
        return "\r{} ".format(bytes_written)

def progress_writer(progress_func, outfile=sys.stderr):
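    # Return a reporter function that formats progress with progress_func
    # and writes it to outfile (stderr by default).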
    def write_progress(bytes_written, bytes_expected):
        outfile.write(progress_func(bytes_written, bytes_expected))
    return write_progress

def exit_signal_handler(sigcode, frame):
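    # Exit with a status derived from the signal number, so callers can tell
    # the upload was interrupted rather than completed.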
    sys.exit(-sigcode)

def desired_project_uuid(api_client, project_uuid, num_retries):
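    # Return the UUID of the project that should own the new collection:
    # the current user's Home project if project_uuid is empty, otherwise
    # the given user or group (project) UUID after looking it up.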
    if not project_uuid:
        query = api_client.users().current()
    elif arvados.util.user_uuid_pattern.match(project_uuid):
        query = api_client.users().get(uuid=project_uuid)
    elif arvados.util.group_uuid_pattern.match(project_uuid):
        query = api_client.groups().get(uuid=project_uuid)
    else:
        raise ValueError("Not a valid project UUID: {}".format(project_uuid))
    return query.execute(num_retries=num_retries)['uuid']

def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    global api_client

    args = parse_arguments(arguments)
    status = 0
    if api_client is None:
        api_client = arvados.api('v1')

    # Determine the name to use
    if args.name:
        if args.stream or args.raw:
            print >>stderr, "Cannot use --name with --stream or --raw"
            sys.exit(1)
        collection_name = args.name
    else:
        collection_name = "Saved at {} by {}@{}".format(
            datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC"),
            pwd.getpwuid(os.getuid()).pw_name,
            socket.gethostname())

    if args.project_uuid and (args.stream or args.raw):
        print >>stderr, "Cannot use --project-uuid with --stream or --raw"
        sys.exit(1)

    # Determine the parent project
    try:
        project_uuid = desired_project_uuid(api_client, args.project_uuid,
                                            args.retries)
    except (apiclient_errors.Error, ValueError) as error:
        print >>stderr, error
        sys.exit(1)

    # write_copies diverges from args.replication here.
    # args.replication is how many copies we will instruct Arvados to
    # maintain (by passing it in collections().create()) after all
    # data is written -- and if None was given, we'll use None there.
    # Meanwhile, write_copies is how many copies of each data block we
    # write to Keep, which has to be a number.
    #
    # If we simply changed args.replication from None to a default
    # here, we'd end up erroneously passing the default replication
    # level (instead of None) to collections().create().
    write_copies = (args.replication or
                    api_client._rootDesc.get('defaultCollectionReplication', 2))

    if args.progress:
        reporter = progress_writer(human_progress)
    elif args.batch_progress:
        reporter = progress_writer(machine_progress)
    else:
        reporter = None
    bytes_expected = expected_bytes_for(args.paths)

    resume_cache = None
    if args.resume:
        try:
            resume_cache = ResumeCache(ResumeCache.make_path(args))
        except (IOError, OSError, ValueError):
            pass  # Couldn't open cache directory/file.  Continue without it.
        except ResumeCacheConflict:
            print >>stderr, "\n".join([
                "arv-put: Another process is already uploading this data.",
                "         Use --no-resume if this is really what you want."])
            sys.exit(1)

    if resume_cache is None:
        writer = ArvPutCollectionWriter(
            resume_cache, reporter, bytes_expected,
            num_retries=args.retries,
            replication=write_copies)
    else:
        writer = ArvPutCollectionWriter.from_cache(
            resume_cache, reporter, bytes_expected,
            num_retries=args.retries,
            replication=write_copies)

    # Install our signal handler for each code in CAUGHT_SIGNALS, and save
    # the originals.
    orig_signal_handlers = {sigcode: signal.signal(sigcode, exit_signal_handler)
                            for sigcode in CAUGHT_SIGNALS}

    if writer.bytes_written > 0:  # We're resuming a previous upload.
        print >>stderr, "\n".join([
                "arv-put: Resuming previous upload from last checkpoint.",
                "         Use the --no-resume option to start over."])

    writer.report_progress()
    writer.do_queued_work()  # Do work resumed from cache.
    for path in args.paths:  # Copy file data to Keep.
        if os.path.isdir(path):
            writer.write_directory_tree(
                path, max_manifest_depth=args.max_manifest_depth)
        else:
            writer.start_new_stream()
            writer.write_file(path, args.filename or os.path.basename(path))
    writer.finish_current_stream()

    if args.progress:  # Print newline to split stderr from stdout for humans.
        print >>stderr

    if args.stream:
        output = writer.manifest_text()
        if args.normalize:
            output = arvados.CollectionReader(output).manifest_text(normalize=True)
    elif args.raw:
        output = ','.join(writer.data_locators())
    else:
        try:
            manifest_text = writer.manifest_text()
            if args.normalize:
                manifest_text = arvados.CollectionReader(manifest_text).manifest_text(normalize=True)
            replication_attr = 'replication_desired'
            if api_client._schema.schemas['Collection']['properties'].get(replication_attr, None) is None:
                # API called it 'redundancy' before #3410.
                replication_attr = 'redundancy'
            # Register the resulting collection in Arvados.
            collection = api_client.collections().create(
                body={
                    'owner_uuid': project_uuid,
                    'name': collection_name,
                    'manifest_text': manifest_text,
                    replication_attr: args.replication,
                    },
                ensure_unique_name=True
                ).execute(num_retries=args.retries)

            print >>stderr, "Collection saved as '%s'" % collection['name']

            if args.portable_data_hash and 'portable_data_hash' in collection and collection['portable_data_hash']:
                output = collection['portable_data_hash']
            else:
                output = collection['uuid']

        except apiclient_errors.Error as error:
            print >>stderr, (
                "arv-put: Error creating Collection on project: {}.".format(
                    error))
            status = 1
    if status == 0:
        # Print the locator (uuid) of the new collection.  If creating the
        # collection failed, there is no output to print.
        stdout.write(output)
        if not output.endswith('\n'):
            stdout.write('\n')

    for sigcode, orig_handler in orig_signal_handlers.items():
        signal.signal(sigcode, orig_handler)

    if status != 0:
        sys.exit(status)

    if resume_cache is not None:
        resume_cache.destroy()

    return output

if __name__ == '__main__':
    main()