Merge branch '9438-http-default-timeout'
[arvados.git] / sdk / python / arvados / commands / put.py
1 #!/usr/bin/env python
2
3 # TODO:
4 # --md5sum - display md5 of each file as read from disk
5
6 import argparse
7 import arvados
8 import arvados.collection
9 import base64
10 import datetime
11 import errno
12 import fcntl
13 import hashlib
14 import json
15 import os
16 import pwd
17 import signal
18 import socket
19 import sys
20 import tempfile
21 from apiclient import errors as apiclient_errors
22
23 import arvados.commands._util as arv_cmd
24
# Signals that main() intercepts so an interrupted upload exits through
# sys.exit() instead of being killed outright.
CAUGHT_SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]

# Arvados API client shared by main(); created on first use.
api_client = None

# ---------------------------------------------------------------------
# Options that describe what to upload and how to report the result.
# ---------------------------------------------------------------------
upload_opts = argparse.ArgumentParser(add_help=False)

upload_opts.add_argument(
    'paths',
    nargs='*',
    type=str,
    metavar='path',
    help="""
Local file or directory. Default: read from standard input.
""")

# --max-manifest-depth and --normalize cannot be combined.
_group = upload_opts.add_mutually_exclusive_group()

_group.add_argument(
    '--max-manifest-depth',
    metavar='N',
    type=int,
    default=-1,
    help="""
Maximum depth of directory tree to represent in the manifest
structure. A directory structure deeper than this will be represented
as a single stream in the manifest. If N=0, the manifest will contain
a single stream. Default: -1 (unlimited), i.e., exactly one manifest
stream per filesystem directory that contains files.
""")

_group.add_argument(
    '--normalize',
    action='store_true',
    help="""
Normalize the manifest by re-ordering files and streams after writing
data.
""")

# Output mode: at most one of stream / manifest / raw (and synonyms).
_group = upload_opts.add_mutually_exclusive_group()

_group.add_argument(
    '--as-stream',
    dest='stream',
    action='store_true',
    help="""
Synonym for --stream.
""")

_group.add_argument(
    '--stream',
    action='store_true',
    help="""
Store the file content and display the resulting manifest on
stdout. Do not write the manifest to Keep or save a Collection object
in Arvados.
""")

_group.add_argument(
    '--as-manifest',
    dest='manifest',
    action='store_true',
    help="""
Synonym for --manifest.
""")

_group.add_argument(
    '--in-manifest',
    dest='manifest',
    action='store_true',
    help="""
Synonym for --manifest.
""")

_group.add_argument(
    '--manifest',
    action='store_true',
    help="""
Store the file data and resulting manifest in Keep, save a Collection
object in Arvados, and display the manifest locator (Collection uuid)
on stdout. This is the default behavior.
""")

_group.add_argument(
    '--as-raw',
    dest='raw',
    action='store_true',
    help="""
Synonym for --raw.
""")

_group.add_argument(
    '--raw',
    action='store_true',
    help="""
Store the file content and display the data block locators on stdout,
separated by commas, with a trailing newline. Do not store a
manifest.
""")

upload_opts.add_argument(
    '--use-filename',
    dest='filename',
    type=str,
    default=None,
    help="""
Synonym for --filename.
""")

upload_opts.add_argument(
    '--filename',
    type=str,
    default=None,
    help="""
Use the given filename in the manifest, instead of the name of the
local file. This is useful when "-" or "/dev/stdin" is given as an
input file. It can be used only if there is exactly one path given and
it is not a directory. Implies --manifest.
""")

upload_opts.add_argument(
    '--portable-data-hash',
    action='store_true',
    help="""
Print the portable data hash instead of the Arvados UUID for the collection
created by the upload.
""")

upload_opts.add_argument(
    '--replication',
    metavar='N',
    type=int,
    default=None,
    help="""
Set the replication level for the new collection: how many different
physical storage devices (e.g., disks) should have a copy of each data
block. Default is to use the server-provided default (if any) or 2.
""")
120
# ---------------------------------------------------------------------
# Options that control how the upload runs (destination, progress
# display, resuming) rather than what gets uploaded.
# ---------------------------------------------------------------------
run_opts = argparse.ArgumentParser(add_help=False)

run_opts.add_argument(
    '--project-uuid',
    metavar='UUID',
    help="""
Store the collection in the specified project, instead of your Home
project.
""")

run_opts.add_argument(
    '--name',
    help="""
Save the collection with the specified name.
""")

# Progress display: at most one of these may be given.
_group = run_opts.add_mutually_exclusive_group()

_group.add_argument(
    '--progress',
    action='store_true',
    help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when
stderr is a tty.
""")

_group.add_argument(
    '--no-progress',
    action='store_true',
    help="""
Do not display human-readable progress on stderr, even if stderr is a
tty.
""")

_group.add_argument(
    '--batch-progress',
    action='store_true',
    help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")

# --resume / --no-resume both write to args.resume, which defaults to True.
_group = run_opts.add_mutually_exclusive_group()

_group.add_argument(
    '--resume',
    default=True,
    action='store_true',
    help="""
Continue interrupted uploads from cached state (default).
""")

_group.add_argument(
    '--no-resume',
    dest='resume',
    action='store_false',
    help="""
Do not continue interrupted uploads from cached state.
""")
161
# Full arv-put command line: the upload options, the run options and the
# shared retry option from arvados.commands._util.
arg_parser = argparse.ArgumentParser(
    parents=[upload_opts, run_opts, arv_cmd.retry_opt],
    description='Copy data from the local filesystem to Keep.')
165
def parse_arguments(arguments):
    """Parse the arv-put command line and fill in derived defaults.

    After parsing with arg_parser:
      * no paths means "read standard input", spelled '-';
      * '/dev/stdin' is rewritten to '-';
      * --filename is rejected (via arg_parser.error) unless exactly one
        non-directory path was given;
      * --progress is switched on when neither --batch-progress nor
        --no-progress was given and stderr is a tty;
      * when the only input is stdin, resuming is disabled and the
        filename defaults to 'stdin'.

    Returns the resulting argparse namespace.
    """
    args = arg_parser.parse_args(arguments)

    requested = args.paths if args.paths else ['-']
    args.paths = ['-' if path == '/dev/stdin' else path for path in requested]

    single_file = (len(args.paths) == 1) and not os.path.isdir(args.paths[0])
    if args.filename and not single_file:
        arg_parser.error("""
    --filename argument cannot be used when storing a directory or
    multiple files.
    """)

    # Turn on --progress by default if stderr is a tty.
    progress_overridden = args.batch_progress or args.no_progress
    if not progress_overridden and os.isatty(sys.stderr.fileno()):
        args.progress = True

    reading_stdin_only = (args.paths == ['-'])
    if reading_stdin_only:
        # There is nothing on disk to resume from when reading a pipe.
        args.resume = False
        args.filename = args.filename or 'stdin'

    return args
192
class ResumeCacheConflict(Exception):
    """Raised when a resume cache file cannot be locked exclusively."""
195
196
class ResumeCache(object):
    """JSON state file, held under an exclusive flock, used to resume uploads.

    The file is opened and locked for the lifetime of the object.  save()
    replaces it by writing a new temporary file in the same directory
    and renaming it over the old one.
    """
    CACHE_DIR = '.cache/arvados/arv-put'

    def __init__(self, file_spec):
        """Open (creating if necessary) and lock the cache file at file_spec.

        Raises ResumeCacheConflict if the file is already locked.
        """
        self.cache_file = open(file_spec, 'a+')
        try:
            self._lock_file(self.cache_file)
        except ResumeCacheConflict:
            # Don't leave the descriptor open when we can't use the cache.
            self.cache_file.close()
            raise
        self.filename = self.cache_file.name

    @classmethod
    def make_path(cls, args):
        """Return the cache file path for the upload described by args.

        The file name is an MD5 digest of the API host, the sorted real
        paths being uploaded, and either the manifest depth (when any
        path is a directory) or the --filename value (when given).
        """
        md5 = hashlib.md5()
        md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost'))
        realpaths = sorted(os.path.realpath(path) for path in args.paths)
        md5.update('\0'.join(realpaths))
        if any(os.path.isdir(path) for path in realpaths):
            md5.update(str(max(args.max_manifest_depth, -1)))
        elif args.filename:
            md5.update(args.filename)
        return os.path.join(
            arv_cmd.make_home_conf_dir(cls.CACHE_DIR, 0o700, 'raise'),
            md5.hexdigest())

    def _lock_file(self, fileobj):
        """Take a non-blocking exclusive flock on fileobj.

        fileobj may be a file object or a raw file descriptor.  Raises
        ResumeCacheConflict if the lock cannot be acquired.
        """
        try:
            fcntl.flock(fileobj, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            # save() passes a bare descriptor, which has no .name.
            raise ResumeCacheConflict(
                "{} locked".format(getattr(fileobj, 'name', fileobj)))

    def load(self):
        """Return the cached state parsed from JSON.

        Raises ValueError if the file does not contain valid JSON.
        """
        self.cache_file.seek(0)
        return json.load(self.cache_file)

    def check_cache(self, api_client=None, num_retries=0):
        """Discard the cached state if its first block locator is unusable.

        Takes the first locator recorded under "_finished_streams" or
        "_current_stream_locators" and issues a KeepClient.head() for
        it.  Any failure while doing so restarts the cache.  A cache
        that cannot be parsed as JSON is left alone.
        """
        try:
            state = self.load()
            locator = None
            try:
                if "_finished_streams" in state and len(state["_finished_streams"]) > 0:
                    locator = state["_finished_streams"][0][1][0]
                elif "_current_stream_locators" in state and len(state["_current_stream_locators"]) > 0:
                    locator = state["_current_stream_locators"][0]
                if locator is not None:
                    kc = arvados.keep.KeepClient(api_client=api_client)
                    kc.head(locator, num_retries=num_retries)
            except Exception:
                # Deliberately broad: whatever went wrong, the cached
                # state can't be trusted, so start over with an empty cache.
                self.restart()
        except (ValueError):
            pass

    def save(self, data):
        """Replace the cache contents with data, serialized as JSON.

        This is best-effort: I/O errors and lock conflicts are swallowed
        and the previous cache file is left in place.
        """
        new_cache_fd = None
        new_cache_name = None
        new_cache = None
        try:
            new_cache_fd, new_cache_name = tempfile.mkstemp(
                dir=os.path.dirname(self.filename))
            self._lock_file(new_cache_fd)
            new_cache = os.fdopen(new_cache_fd, 'r+')
            json.dump(data, new_cache)
            # Make sure the state is really in the file before it
            # replaces the old cache; otherwise a crash can leave an
            # empty or truncated cache behind.
            new_cache.flush()
            os.rename(new_cache_name, self.filename)
        except (IOError, OSError, ResumeCacheConflict):
            # Release whatever we managed to open, then remove the
            # temporary file.
            try:
                if new_cache is not None:
                    new_cache.close()
                elif new_cache_fd is not None:
                    os.close(new_cache_fd)
            except (IOError, OSError):
                pass
            if new_cache_name is not None:  # None means mkstemp failed.
                try:
                    os.unlink(new_cache_name)
                except OSError:
                    pass
        else:
            self.cache_file.close()
            self.cache_file = new_cache

    def close(self):
        """Close the cache file (which also drops the lock held on it)."""
        self.cache_file.close()

    def destroy(self):
        """Delete the cache file from disk and close it."""
        try:
            os.unlink(self.filename)
        except OSError as error:
            if error.errno != errno.ENOENT:  # That's what we wanted anyway.
                raise
        self.close()

    def restart(self):
        """Throw away the current cache and reopen an empty one in its place."""
        self.destroy()
        self.__init__(self.filename)
277
278
class ArvPutCollectionWriter(arvados.ResumableCollectionWriter):
    """ResumableCollectionWriter that reports progress and checkpoints state.

    On top of the base writer this class keeps a running count of bytes
    written, remembers which inputs have already been queued (so a
    resumed run does not queue them twice), calls an optional progress
    reporter, and saves its state to an optional cache object.
    """
    STATE_PROPS = (arvados.ResumableCollectionWriter.STATE_PROPS +
                   ['bytes_written', '_seen_inputs'])

    def __init__(self, cache=None, reporter=None, bytes_expected=None, **kwargs):
        """Set up progress tracking, then initialize the base writer.

        cache: object with save(state), or None to disable checkpoints.
        reporter: callable(bytes_written, bytes_expected), or None.
        bytes_expected: total upload size passed through to reporter.
        kwargs: forwarded to the base class constructor.
        """
        self.bytes_written = 0
        self._seen_inputs = []
        self.cache = cache
        self.reporter = reporter
        self.bytes_expected = bytes_expected
        super(ArvPutCollectionWriter, self).__init__(**kwargs)

    @classmethod
    def from_cache(cls, cache, reporter=None, bytes_expected=None,
                   num_retries=0, replication=0):
        """Build a writer from the state stored in cache.

        If the cached state is missing, malformed or stale, a fresh
        writer using the same cache is returned instead.
        """
        try:
            state = cache.load()
            state['_data_buffer'] = [base64.decodestring(state['_data_buffer'])]
            writer = cls.from_state(state, cache, reporter, bytes_expected,
                                    num_retries=num_retries,
                                    replication=replication)
        except (KeyError, TypeError, ValueError,
                arvados.errors.StaleWriterStateError):
            # KeyError covers valid JSON that lacks the expected keys
            # (e.g. no '_data_buffer'); treat it like any other unusable
            # cache rather than crashing.
            return cls(cache, reporter, bytes_expected,
                       num_retries=num_retries,
                       replication=replication)
        else:
            return writer

    def cache_state(self):
        """Serialize the writer's state and hand it to the cache, if any."""
        if self.cache is None:
            return
        state = self.dump_state()
        # Transform attributes for serialization.
        for attr, value in state.items():
            if attr == '_data_buffer':
                state[attr] = base64.encodestring(''.join(value))
            elif hasattr(value, 'popleft'):
                # Anything with popleft (deque-like) is stored as a list.
                state[attr] = list(value)
        self.cache.save(state)

    def report_progress(self):
        """Call the reporter, if any, with the current byte counts."""
        if self.reporter is not None:
            self.reporter(self.bytes_written, self.bytes_expected)

    def flush_data(self):
        """Flush buffered data, then update progress and checkpoint state.

        The state is cached only when the flush carried bytes_written
        across a KEEP_BLOCK_SIZE boundary.
        """
        start_buffer_len = self._data_buffer_len
        # Explicit floor division: we want a whole number of blocks.
        start_block_count = self.bytes_written // arvados.config.KEEP_BLOCK_SIZE
        super(ArvPutCollectionWriter, self).flush_data()
        if self._data_buffer_len < start_buffer_len:  # We actually PUT data.
            self.bytes_written += (start_buffer_len - self._data_buffer_len)
            self.report_progress()
            if (self.bytes_written // arvados.config.KEEP_BLOCK_SIZE) > start_block_count:
                self.cache_state()

    def _record_new_input(self, input_type, source_name, dest_name):
        """Remember an input; return False if it was already recorded."""
        # The key needs to be a list because that's what we'll get back
        # from JSON deserialization.
        key = [input_type, source_name, dest_name]
        if key in self._seen_inputs:
            return False
        self._seen_inputs.append(key)
        return True

    def write_file(self, source, filename=None):
        """Queue a file for writing unless the same input was seen before."""
        if self._record_new_input('file', source, filename):
            super(ArvPutCollectionWriter, self).write_file(source, filename)

    def write_directory_tree(self,
                             path, stream_name='.', max_manifest_depth=-1):
        """Queue a directory tree unless the same input was seen before."""
        if self._record_new_input('directory', path, stream_name):
            super(ArvPutCollectionWriter, self).write_directory_tree(
                path, stream_name, max_manifest_depth)
352
353
def expected_bytes_for(pathlist):
    """Return the total size in bytes of everything named in pathlist.

    Directories are walked with arvados.util.listdir_recursive and the
    sizes of the files found are added up; regular files contribute
    their own size.  The total lets progress be shown as a percentage.

    Returns None as soon as a path is neither a directory nor a regular
    file, since the total is then unknown.
    """
    total = 0
    for path in pathlist:
        if os.path.isdir(path):
            for relname in arvados.util.listdir_recursive(path):
                total += os.path.getsize(os.path.join(path, relname))
        elif os.path.isfile(path):
            total += os.path.getsize(path)
        else:
            return None
    return total
367
# Template for --batch-progress lines.  The program name and pid are
# filled in once at import time; the doubled braces survive as the two
# placeholders filled in per report.
_machine_format = "{} {}: {{}} written {{}} total\n".format(
    sys.argv[0], os.getpid())


def machine_progress(bytes_written, bytes_expected):
    """Format a machine-readable progress line.

    An unknown total (bytes_expected is None) is reported as -1.
    """
    if bytes_expected is None:
        total = -1
    else:
        total = bytes_expected
    return _machine_format.format(bytes_written, total)
373
def human_progress(bytes_written, bytes_expected):
    """Format a human-readable progress string.

    With a known, non-zero total the result shows mebibytes written,
    mebibytes expected and the percentage done; otherwise it shows only
    the raw byte count.  The string starts with a carriage return so
    each report overwrites the previous one on a terminal.
    """
    if not bytes_expected:
        return "\r{} ".format(bytes_written)
    written_mib = bytes_written >> 20
    expected_mib = bytes_expected >> 20
    fraction = float(bytes_written) / bytes_expected
    return "\r{}M / {}M {:.1%} ".format(written_mib, expected_mib, fraction)
381
def progress_writer(progress_func, outfile=sys.stderr):
    """Return a reporter that formats progress and writes it to outfile.

    progress_func(bytes_written, bytes_expected) must return the text to
    write, e.g. human_progress or machine_progress.
    """
    def write_progress(bytes_written, bytes_expected):
        message = progress_func(bytes_written, bytes_expected)
        outfile.write(message)
    return write_progress
386
def exit_signal_handler(sigcode, frame):
    """Signal handler that exits the process via sys.exit.

    The exit status passed to sys.exit is the negated signal number.
    frame is unused; it is part of the signal handler signature.
    """
    exit_status = -sigcode
    sys.exit(exit_status)
389
def desired_project_uuid(api_client, project_uuid, num_retries):
    """Return the UUID of the owner the new collection should be stored under.

    With no project_uuid the current user is looked up.  Otherwise the
    UUID must match the user or group UUID pattern and the corresponding
    record is fetched through the API.

    Raises ValueError if project_uuid matches neither pattern.
    """
    if project_uuid:
        if arvados.util.user_uuid_pattern.match(project_uuid):
            request = api_client.users().get(uuid=project_uuid)
        elif arvados.util.group_uuid_pattern.match(project_uuid):
            request = api_client.groups().get(uuid=project_uuid)
        else:
            raise ValueError("Not a valid project UUID: {}".format(project_uuid))
    else:
        request = api_client.users().current()
    response = request.execute(num_retries=num_retries)
    return response['uuid']
400
def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    """Run arv-put: upload the given paths to Keep and report the result.

    arguments: command line argument list handed to parse_arguments().
    stdout: stream that receives the final output (manifest text, block
        locators, or the collection's UUID / portable data hash).
    stderr: stream that receives error and status messages.

    Returns the string that was written to stdout.  Calls sys.exit()
    with a non-zero status on invalid option combinations, on failure to
    look up the parent project, when another process holds the resume
    cache, and when no output could be produced.
    """
    global api_client

    args = parse_arguments(arguments)
    status = 0
    # Reuse a client already stored in the module global, if any.
    if api_client is None:
        api_client = arvados.api('v1')

    # Determine the name to use
    if args.name:
        if args.stream or args.raw:
            print >>stderr, "Cannot use --name with --stream or --raw"
            sys.exit(1)
        collection_name = args.name
    else:
        collection_name = "Saved at {} by {}@{}".format(
            datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC"),
            pwd.getpwuid(os.getuid()).pw_name,
            socket.gethostname())

    if args.project_uuid and (args.stream or args.raw):
        print >>stderr, "Cannot use --project-uuid with --stream or --raw"
        sys.exit(1)

    # Determine the parent project
    try:
        project_uuid = desired_project_uuid(api_client, args.project_uuid,
                                            args.retries)
    except (apiclient_errors.Error, ValueError) as error:
        print >>stderr, error
        sys.exit(1)

    # write_copies diverges from args.replication here.
    # args.replication is how many copies we will instruct Arvados to
    # maintain (by passing it in collections().create()) after all
    # data is written -- and if None was given, we'll use None there.
    # Meanwhile, write_copies is how many copies of each data block we
    # write to Keep, which has to be a number.
    #
    # If we simply changed args.replication from None to a default
    # here, we'd end up erroneously passing the default replication
    # level (instead of None) to collections().create().
    write_copies = (args.replication or
                    api_client._rootDesc.get('defaultCollectionReplication', 2))

    # Pick the progress reporter matching the requested display mode.
    if args.progress:
        reporter = progress_writer(human_progress)
    elif args.batch_progress:
        reporter = progress_writer(machine_progress)
    else:
        reporter = None
    bytes_expected = expected_bytes_for(args.paths)

    # Open (and lock) the resume cache.  If it cannot be opened the
    # upload simply proceeds without one; if it is locked by someone
    # else we refuse to run.
    resume_cache = None
    if args.resume:
        try:
            resume_cache = ResumeCache(ResumeCache.make_path(args))
            resume_cache.check_cache(api_client=api_client, num_retries=args.retries)
        except (IOError, OSError, ValueError):
            pass  # Couldn't open cache directory/file.  Continue without it.
        except ResumeCacheConflict:
            print >>stderr, "\n".join([
                "arv-put: Another process is already uploading this data.",
                "         Use --no-resume if this is really what you want."])
            sys.exit(1)

    # Without a cache start a fresh writer; with one, try to restore the
    # writer from the cached state.
    if resume_cache is None:
        writer = ArvPutCollectionWriter(
            resume_cache, reporter, bytes_expected,
            num_retries=args.retries,
            replication=write_copies)
    else:
        writer = ArvPutCollectionWriter.from_cache(
            resume_cache, reporter, bytes_expected,
            num_retries=args.retries,
            replication=write_copies)

    # Install our signal handler for each code in CAUGHT_SIGNALS, and save
    # the originals.
    orig_signal_handlers = {sigcode: signal.signal(sigcode, exit_signal_handler)
                            for sigcode in CAUGHT_SIGNALS}

    if writer.bytes_written > 0:  # We're resuming a previous upload.
        print >>stderr, "\n".join([
                "arv-put: Resuming previous upload from last checkpoint.",
                "         Use the --no-resume option to start over."])

    writer.report_progress()
    writer.do_queued_work()  # Do work resumed from cache.
    for path in args.paths:  # Copy file data to Keep.
        if path == '-':
            # Standard input is streamed in 64 KiB chunks.
            writer.start_new_stream()
            writer.start_new_file(args.filename)
            r = sys.stdin.read(64*1024)
            while r:
                # Need to bypass _queued_file check in ResumableCollectionWriter.write() to get
                # CollectionWriter.write().
                super(arvados.collection.ResumableCollectionWriter, writer).write(r)
                r = sys.stdin.read(64*1024)
        elif os.path.isdir(path):
            writer.write_directory_tree(
                path, max_manifest_depth=args.max_manifest_depth)
        else:
            writer.start_new_stream()
            writer.write_file(path, args.filename or os.path.basename(path))
    writer.finish_current_stream()

    if args.progress:  # Print newline to split stderr from stdout for humans.
        print >>stderr

    # Produce the output for the selected mode: manifest text (--stream),
    # block locators (--raw), or the identifier of a newly created
    # collection (default).
    output = None
    if args.stream:
        output = writer.manifest_text()
        if args.normalize:
            output = arvados.collection.CollectionReader(output).manifest_text(normalize=True)
    elif args.raw:
        output = ','.join(writer.data_locators())
    else:
        try:
            manifest_text = writer.manifest_text()
            if args.normalize:
                manifest_text = arvados.collection.CollectionReader(manifest_text).manifest_text(normalize=True)
            replication_attr = 'replication_desired'
            if api_client._schema.schemas['Collection']['properties'].get(replication_attr, None) is None:
                # API called it 'redundancy' before #3410.
                replication_attr = 'redundancy'
            # Register the resulting collection in Arvados.
            collection = api_client.collections().create(
                body={
                    'owner_uuid': project_uuid,
                    'name': collection_name,
                    'manifest_text': manifest_text,
                    replication_attr: args.replication,
                    },
                ensure_unique_name=True
                ).execute(num_retries=args.retries)

            print >>stderr, "Collection saved as '%s'" % collection['name']

            # Prefer the portable data hash only when asked for and present.
            if args.portable_data_hash and 'portable_data_hash' in collection and collection['portable_data_hash']:
                output = collection['portable_data_hash']
            else:
                output = collection['uuid']

        except apiclient_errors.Error as error:
            print >>stderr, (
                "arv-put: Error creating Collection on project: {}.".format(
                    error))
            status = 1

    # Print the locator (uuid) of the new collection.
    if output is None:
        # Nothing to print counts as failure even if no error was recorded.
        status = status or 1
    else:
        stdout.write(output)
        if not output.endswith('\n'):
            stdout.write('\n')

    # Put back the signal handlers that were in place before the upload.
    for sigcode, orig_handler in orig_signal_handlers.items():
        signal.signal(sigcode, orig_handler)

    # On failure we exit before destroying the cache, so the cached
    # state stays on disk.
    if status != 0:
        sys.exit(status)

    if resume_cache is not None:
        resume_cache.destroy()

    return output

# Run the uploader when this file is executed as a script.
if __name__ == '__main__':
    main()