closes #7399
[arvados.git] / sdk / python / arvados / commands / put.py
1 #!/usr/bin/env python
2
3 # TODO:
4 # --md5sum - display md5 of each file as read from disk
5
6 import argparse
7 import arvados
8 import arvados.collection
9 import base64
10 import datetime
11 import errno
12 import fcntl
13 import hashlib
14 import json
15 import os
16 import pwd
17 import signal
18 import socket
19 import sys
20 import tempfile
21 from apiclient import errors as apiclient_errors
22
23 import arvados.commands._util as arv_cmd
24
# Signals for which main() installs exit_signal_handler while an upload is
# running; the previous handlers are restored once the upload has finished.
CAUGHT_SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]
# Module-wide API client.  main() fills it in with arvados.api('v1') the
# first time it runs while this is still None.
api_client = None
27
# Options describing what to upload and how the result is represented.
# add_help=False because this parser is only used as a parent of arg_parser.
upload_opts = argparse.ArgumentParser(add_help=False)

# Zero or more input paths; parse_arguments() substitutes '-' (stdin) when
# none are given.
upload_opts.add_argument('paths', metavar='path', type=str, nargs='*',
                         help="""
Local file or directory. Default: read from standard input.
""")

# --max-manifest-depth and --normalize are mutually exclusive.
_group = upload_opts.add_mutually_exclusive_group()

_group.add_argument('--max-manifest-depth', type=int, metavar='N',
                    default=-1, help="""
Maximum depth of directory tree to represent in the manifest
structure. A directory structure deeper than this will be represented
as a single stream in the manifest. If N=0, the manifest will contain
a single stream. Default: -1 (unlimited), i.e., exactly one manifest
stream per filesystem directory that contains files.
""")

_group.add_argument('--normalize', action='store_true',
                    help="""
Normalize the manifest by re-ordering files and streams after writing
data.
""")

# Output mode: stream, manifest or raw.  All spellings live in a single
# mutually exclusive group (the name _group is rebound here), and each
# synonym stores into the same dest as its canonical option.
_group = upload_opts.add_mutually_exclusive_group()

_group.add_argument('--as-stream', action='store_true', dest='stream',
                    help="""
Synonym for --stream.
""")

_group.add_argument('--stream', action='store_true',
                    help="""
Store the file content and display the resulting manifest on
stdout. Do not write the manifest to Keep or save a Collection object
in Arvados.
""")

_group.add_argument('--as-manifest', action='store_true', dest='manifest',
                    help="""
Synonym for --manifest.
""")

_group.add_argument('--in-manifest', action='store_true', dest='manifest',
                    help="""
Synonym for --manifest.
""")

_group.add_argument('--manifest', action='store_true',
                    help="""
Store the file data and resulting manifest in Keep, save a Collection
object in Arvados, and display the manifest locator (Collection uuid)
on stdout. This is the default behavior.
""")

_group.add_argument('--as-raw', action='store_true', dest='raw',
                    help="""
Synonym for --raw.
""")

_group.add_argument('--raw', action='store_true',
                    help="""
Store the file content and display the data block locators on stdout,
separated by commas, with a trailing newline. Do not store a
manifest.
""")

# --use-filename and --filename both store into args.filename.
upload_opts.add_argument('--use-filename', type=str, default=None,
                         dest='filename', help="""
Synonym for --filename.
""")

upload_opts.add_argument('--filename', type=str, default=None,
                         help="""
Use the given filename in the manifest, instead of the name of the
local file. This is useful when "-" or "/dev/stdin" is given as an
input file. It can be used only if there is exactly one path given and
it is not a directory. Implies --manifest.
""")

upload_opts.add_argument('--portable-data-hash', action='store_true',
                         help="""
Print the portable data hash instead of the Arvados UUID for the collection
created by the upload.
""")

# Left as None when not given; main() derives the number of copies to write
# separately so that None can still be passed to collections().create().
upload_opts.add_argument('--replication', type=int, metavar='N', default=None,
                         help="""
Set the replication level for the new collection: how many different
physical storage devices (e.g., disks) should have a copy of each data
block. Default is to use the server-provided default (if any) or 2.
""")
120
# Options controlling where the collection is saved and how the run behaves
# (progress reporting, resuming).  Used only as a parent of arg_parser,
# hence add_help=False.
run_opts = argparse.ArgumentParser(add_help=False)

run_opts.add_argument('--project-uuid', metavar='UUID', help="""
Store the collection in the specified project, instead of your Home
project.
""")

run_opts.add_argument('--name', help="""
Save the collection with the specified name.
""")

# At most one progress-reporting mode may be requested.
_group = run_opts.add_mutually_exclusive_group()
_group.add_argument('--progress', action='store_true',
                    help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when
stderr is a tty.
""")

_group.add_argument('--no-progress', action='store_true',
                    help="""
Do not display human-readable progress on stderr, even if stderr is a
tty.
""")

_group.add_argument('--batch-progress', action='store_true',
                    help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")

# --resume / --no-resume share the dest 'resume'.  Because --resume is a
# store_true option with default=True, args.resume is True unless
# --no-resume is given.
_group = run_opts.add_mutually_exclusive_group()
_group.add_argument('--resume', action='store_true', default=True,
                    help="""
Continue interrupted uploads from cached state (default).
""")
_group.add_argument('--no-resume', action='store_false', dest='resume',
                    help="""
Do not continue interrupted uploads from cached state.
""")

# The complete command-line parser: upload options, run options and the
# shared retry option provided by arvados.commands._util.
arg_parser = argparse.ArgumentParser(
    description='Copy data from the local filesystem to Keep.',
    parents=[upload_opts, run_opts, arv_cmd.retry_opt])
165
def parse_arguments(arguments):
    """Parse the arv-put command line and apply the implied defaults.

    arguments is the argument list handed to arg_parser.parse_args().
    Returns the resulting namespace after these adjustments:

    * no paths at all means "read standard input" ('-');
    * "/dev/stdin" is rewritten to '-';
    * --filename is rejected (via arg_parser.error) unless exactly one
      non-directory path was given;
    * --progress is switched on when stderr is a tty and neither
      --batch-progress nor --no-progress was requested;
    * when the only input is stdin, resuming is disabled and the file
      name defaults to 'stdin'.
    """
    args = arg_parser.parse_args(arguments)

    if not args.paths:
        args.paths = ['-']

    args.paths = ["-" if path == "/dev/stdin" else path
                  for path in args.paths]

    if args.filename and (len(args.paths) != 1 or
                          os.path.isdir(args.paths[0])):
        arg_parser.error("""
    --filename argument cannot be used when storing a directory or
    multiple files.
    """)

    # Turn on --progress by default if stderr is a tty.
    if (not args.batch_progress and not args.no_progress
            and os.isatty(sys.stderr.fileno())):
        args.progress = True

    if args.paths == ['-']:
        # Data read from stdin cannot be re-read, so never try to resume.
        args.resume = False
        if not args.filename:
            args.filename = 'stdin'

    return args
192
class ResumeCacheConflict(Exception):
    """Raised when a resume cache file cannot be locked for exclusive use."""
195
196
class ResumeCache(object):
    """A JSON state file, protected by an exclusive flock, for resuming uploads.

    The file is opened (and created if necessary) and locked when the
    object is constructed.  save() replaces the file's contents by writing
    a new temporary file in the same directory and renaming it over the
    old one.
    """

    # Directory handed to arv_cmd.make_home_conf_dir() by make_path().
    CACHE_DIR = '.cache/arvados/arv-put'

    def __init__(self, file_spec):
        """Open file_spec for read/append and take an exclusive lock on it.

        Raises ResumeCacheConflict if the lock cannot be acquired, and
        whatever open() raises if the file cannot be opened.
        """
        self.cache_file = open(file_spec, 'a+')
        try:
            self._lock_file(self.cache_file)
        except Exception:
            # Don't leak the descriptor when we could not get the lock.
            self.cache_file.close()
            raise
        self.filename = self.cache_file.name

    @classmethod
    def make_path(cls, args):
        """Return the cache file path to use for the parsed arguments args.

        The file name is an MD5 digest of the configured API host, the
        sorted real paths of the inputs and, depending on the inputs,
        either the effective --max-manifest-depth (when any input is a
        directory) or the --filename value.
        """
        md5 = hashlib.md5()
        md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost'))
        realpaths = sorted(os.path.realpath(path) for path in args.paths)
        md5.update('\0'.join(realpaths))
        if any(os.path.isdir(path) for path in realpaths):
            md5.update(str(max(args.max_manifest_depth, -1)))
        elif args.filename:
            md5.update(args.filename)
        return os.path.join(
            arv_cmd.make_home_conf_dir(cls.CACHE_DIR, 0o700, 'raise'),
            md5.hexdigest())

    def _lock_file(self, fileobj):
        """Take a non-blocking exclusive flock on fileobj.

        Raises ResumeCacheConflict if the lock cannot be acquired.
        """
        try:
            fcntl.flock(fileobj, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            # fileobj may be a bare descriptor, which has no .name.
            raise ResumeCacheConflict(
                "{} locked".format(getattr(fileobj, 'name', fileobj)))

    @staticmethod
    def _discard_temp(fileobj, fd, name):
        """Best-effort cleanup of a temporary cache file that was not installed."""
        try:
            if fileobj is not None:
                fileobj.close()
            elif fd is not None:
                os.close(fd)
        except (IOError, OSError):
            pass
        if name is not None:
            try:
                os.unlink(name)
            except OSError:
                pass

    def save(self, data):
        """Atomically replace the cache contents with data serialized as JSON.

        Saving is best effort: I/O errors and lock conflicts are swallowed
        and leave the previous cache file (and its lock) in place.  Other
        errors, such as data that cannot be serialized, propagate after the
        temporary file has been cleaned up.
        """
        new_cache = new_cache_fd = new_cache_name = None
        try:
            new_cache_fd, new_cache_name = tempfile.mkstemp(
                dir=os.path.dirname(self.filename))
            new_cache = os.fdopen(new_cache_fd, 'r+')
            self._lock_file(new_cache)
            json.dump(data, new_cache)
            # Make sure the data is actually in the file before it
            # replaces the old cache; the handle stays open afterwards.
            new_cache.flush()
            os.rename(new_cache_name, self.filename)
        except (IOError, OSError, ResumeCacheConflict):
            self._discard_temp(new_cache, new_cache_fd, new_cache_name)
        except Exception:
            self._discard_temp(new_cache, new_cache_fd, new_cache_name)
            raise
        else:
            self.cache_file.close()
            self.cache_file = new_cache

    def load(self):
        """Return the JSON-decoded contents of the cache file."""
        self.cache_file.seek(0)
        return json.load(self.cache_file)

    def close(self):
        """Close the cache file."""
        self.cache_file.close()

    def destroy(self):
        """Delete the cache file from disk and close it."""
        try:
            os.unlink(self.filename)
        except OSError as error:
            if error.errno != errno.ENOENT:  # That's what we wanted anyway.
                raise
        self.close()

    def restart(self):
        """Throw away the current cache file and start a fresh, empty one."""
        self.destroy()
        self.__init__(self.filename)
260
261
class ArvPutCollectionWriter(arvados.ResumableCollectionWriter):
    """Resumable collection writer that tracks progress and checkpoints state.

    On top of the base class it counts the bytes written, reports
    progress through an optional callback, saves its state to an optional
    resume cache, and remembers which inputs were already queued so they
    are not queued twice.
    """

    # State attributes to persist: the base class's plus our own.
    STATE_PROPS = (arvados.ResumableCollectionWriter.STATE_PROPS +
                   ['bytes_written', '_seen_inputs'])

    def __init__(self, cache=None, reporter=None, bytes_expected=None, **kwargs):
        """Set up a writer.

        cache is an object with load()/save() (or None for no
        checkpointing); reporter is called as
        reporter(bytes_written, bytes_expected) (or None for no progress
        output); remaining keyword arguments go to the base class.
        """
        self.bytes_written = 0
        self._seen_inputs = []
        self.cache = cache
        self.reporter = reporter
        self.bytes_expected = bytes_expected
        super(ArvPutCollectionWriter, self).__init__(**kwargs)

    @classmethod
    def from_cache(cls, cache, reporter=None, bytes_expected=None,
                   num_retries=0, replication=0):
        """Build a writer from the state saved in cache, if it is usable.

        When the cached state cannot be decoded or is rejected as stale,
        a fresh writer using the same cache is returned instead.
        """
        # On Python 2, binascii.Error is not a ValueError, so it has to be
        # named explicitly to treat corrupt base64 as an unusable cache.
        import binascii
        try:
            state = cache.load()
            state['_data_buffer'] = [base64.decodestring(state['_data_buffer'])]
            writer = cls.from_state(state, cache, reporter, bytes_expected,
                                    num_retries=num_retries,
                                    replication=replication)
        except (TypeError, ValueError, KeyError, binascii.Error,
                arvados.errors.StaleWriterStateError):
            return cls(cache, reporter, bytes_expected,
                       num_retries=num_retries,
                       replication=replication)
        else:
            return writer

    def cache_state(self):
        """Save the current writer state to the cache, if there is one."""
        if self.cache is None:
            return
        state = self.dump_state()
        # Transform attributes for serialization.
        for attr, value in state.items():
            if attr == '_data_buffer':
                state[attr] = base64.encodestring(''.join(value))
            elif hasattr(value, 'popleft'):
                # Deque-like values are stored as plain lists.
                state[attr] = list(value)
        self.cache.save(state)

    def report_progress(self):
        """Pass the current byte counts to the reporter, if there is one."""
        if self.reporter is not None:
            self.reporter(self.bytes_written, self.bytes_expected)

    def flush_data(self):
        """Flush buffered data, then update progress and maybe checkpoint.

        The state is cached only when the number of whole
        KEEP_BLOCK_SIZE blocks written has increased.
        """
        start_buffer_len = self._data_buffer_len
        start_block_count = self.bytes_written // arvados.config.KEEP_BLOCK_SIZE
        super(ArvPutCollectionWriter, self).flush_data()
        if self._data_buffer_len < start_buffer_len:  # We actually PUT data.
            self.bytes_written += (start_buffer_len - self._data_buffer_len)
            self.report_progress()
            if (self.bytes_written // arvados.config.KEEP_BLOCK_SIZE) > start_block_count:
                self.cache_state()

    def _record_new_input(self, input_type, source_name, dest_name):
        """Remember an input; return False if it had been recorded before."""
        # The key needs to be a list because that's what we'll get back
        # from JSON deserialization.
        key = [input_type, source_name, dest_name]
        if key in self._seen_inputs:
            return False
        self._seen_inputs.append(key)
        return True

    def write_file(self, source, filename=None):
        """Queue a file for writing unless the same input was already seen."""
        if self._record_new_input('file', source, filename):
            super(ArvPutCollectionWriter, self).write_file(source, filename)

    def write_directory_tree(self,
                             path, stream_name='.', max_manifest_depth=-1):
        """Queue a directory tree unless the same input was already seen."""
        if self._record_new_input('directory', path, stream_name):
            super(ArvPutCollectionWriter, self).write_directory_tree(
                path, stream_name, max_manifest_depth)
335
336
def expected_bytes_for(pathlist):
    """Return the total size in bytes of everything named in pathlist.

    Directories are walked recursively and regular files are stat'ed, so
    that progress can be shown as a percentage.  Returns None as soon as
    a path is found that is neither a directory nor a regular file.
    """
    total = 0
    for path in pathlist:
        if os.path.isdir(path):
            for relative_name in arvados.util.listdir_recursive(path):
                total += os.path.getsize(os.path.join(path, relative_name))
        elif os.path.isfile(path):
            total += os.path.getsize(path)
        else:
            return None
    return total
350
# Template for machine-readable progress lines.  The program name and PID
# are filled in once here; the two remaining fields are filled per report.
_machine_format = ("{} {}: ".format(sys.argv[0], os.getpid()) +
                   "{} written {} total\n")

def machine_progress(bytes_written, bytes_expected):
    """Return a machine-readable progress line.

    An unknown total (bytes_expected is None) is reported as -1.
    """
    if bytes_expected is None:
        total = -1
    else:
        total = bytes_expected
    return _machine_format.format(bytes_written, total)
356
def human_progress(bytes_written, bytes_expected):
    """Return a human-readable progress string starting with a carriage return.

    With a known, non-zero total the string shows written and expected
    sizes in whole mebibytes plus a percentage; otherwise it shows just
    the number of bytes written.
    """
    if not bytes_expected:
        return "\r{} ".format(bytes_written)
    written_mib = bytes_written >> 20
    expected_mib = bytes_expected >> 20
    fraction = float(bytes_written) / bytes_expected
    return "\r{}M / {}M {:.1%} ".format(written_mib, expected_mib, fraction)
364
def progress_writer(progress_func, outfile=sys.stderr):
    """Return a reporter that writes progress_func's output to outfile.

    The returned callable takes (bytes_written, bytes_expected), formats
    them with progress_func and writes the resulting text to outfile.
    """
    def emit(bytes_written, bytes_expected):
        message = progress_func(bytes_written, bytes_expected)
        outfile.write(message)
    return emit
369
def exit_signal_handler(sigcode, frame):
    """Signal handler that exits the process with status -sigcode."""
    raise SystemExit(-sigcode)
372
def desired_project_uuid(api_client, project_uuid, num_retries):
    """Return the UUID of the owner the new collection should get.

    With no project_uuid the current user is looked up.  Otherwise
    project_uuid must match the user or group UUID pattern and the
    corresponding record is fetched.  Raises ValueError for a UUID that
    matches neither pattern; API errors from execute() propagate.
    """
    if project_uuid:
        if arvados.util.user_uuid_pattern.match(project_uuid):
            resource = api_client.users()
        elif arvados.util.group_uuid_pattern.match(project_uuid):
            resource = api_client.groups()
        else:
            raise ValueError("Not a valid project UUID: {}".format(project_uuid))
        request = resource.get(uuid=project_uuid)
    else:
        request = api_client.users().current()
    response = request.execute(num_retries=num_retries)
    return response['uuid']
383
def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    """Run arv-put: upload the given paths and print the result on stdout.

    arguments is the command line to parse (None lets argparse use
    sys.argv); stdout receives the final output (manifest, block
    locators, or the collection UUID / portable data hash); stderr
    receives diagnostics.

    Returns the text written to stdout.  Calls sys.exit() with a
    non-zero status on usage errors, on a resume-cache conflict, and when
    the collection cannot be created.
    """
    global api_client

    args = parse_arguments(arguments)
    status = 0
    if api_client is None:
        api_client = arvados.api('v1')

    # Determine the name to use
    if args.name:
        if args.stream or args.raw:
            print >>stderr, "Cannot use --name with --stream or --raw"
            sys.exit(1)
        collection_name = args.name
    else:
        collection_name = "Saved at {} by {}@{}".format(
            datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC"),
            pwd.getpwuid(os.getuid()).pw_name,
            socket.gethostname())

    if args.project_uuid and (args.stream or args.raw):
        print >>stderr, "Cannot use --project-uuid with --stream or --raw"
        sys.exit(1)

    # Determine the parent project
    try:
        project_uuid = desired_project_uuid(api_client, args.project_uuid,
                                            args.retries)
    except (apiclient_errors.Error, ValueError) as error:
        print >>stderr, error
        sys.exit(1)

    # write_copies diverges from args.replication here.
    # args.replication is how many copies we will instruct Arvados to
    # maintain (by passing it in collections().create()) after all
    # data is written -- and if None was given, we'll use None there.
    # Meanwhile, write_copies is how many copies of each data block we
    # write to Keep, which has to be a number.
    #
    # If we simply changed args.replication from None to a default
    # here, we'd end up erroneously passing the default replication
    # level (instead of None) to collections().create().
    write_copies = (args.replication or
                    api_client._rootDesc.get('defaultCollectionReplication', 2))

    if args.progress:
        reporter = progress_writer(human_progress)
    elif args.batch_progress:
        reporter = progress_writer(machine_progress)
    else:
        reporter = None
    bytes_expected = expected_bytes_for(args.paths)

    resume_cache = None
    if args.resume:
        try:
            resume_cache = ResumeCache(ResumeCache.make_path(args))
        except (IOError, OSError, ValueError):
            pass  # Couldn't open cache directory/file.  Continue without it.
        except ResumeCacheConflict:
            print >>stderr, "\n".join([
                "arv-put: Another process is already uploading this data.",
                "         Use --no-resume if this is really what you want."])
            sys.exit(1)

    if resume_cache is None:
        writer = ArvPutCollectionWriter(
            resume_cache, reporter, bytes_expected,
            num_retries=args.retries,
            replication=write_copies)
    else:
        writer = ArvPutCollectionWriter.from_cache(
            resume_cache, reporter, bytes_expected,
            num_retries=args.retries,
            replication=write_copies)

    # Install our signal handler for each code in CAUGHT_SIGNALS, and save
    # the originals.
    orig_signal_handlers = {sigcode: signal.signal(sigcode, exit_signal_handler)
                            for sigcode in CAUGHT_SIGNALS}

    if writer.bytes_written > 0:  # We're resuming a previous upload.
        print >>stderr, "\n".join([
                "arv-put: Resuming previous upload from last checkpoint.",
                "         Use the --no-resume option to start over."])

    writer.report_progress()
    writer.do_queued_work()  # Do work resumed from cache.
    for path in args.paths:  # Copy file data to Keep.
        if path == '-':
            writer.start_new_stream()
            writer.start_new_file(args.filename)
            r = sys.stdin.read(64*1024)
            while r:
                # Need to bypass _queued_file check in ResumableCollectionWriter.write() to get
                # CollectionWriter.write().
                super(arvados.collection.ResumableCollectionWriter, writer).write(r)
                r = sys.stdin.read(64*1024)
        elif os.path.isdir(path):
            writer.write_directory_tree(
                path, max_manifest_depth=args.max_manifest_depth)
        else:
            writer.start_new_stream()
            writer.write_file(path, args.filename or os.path.basename(path))
    writer.finish_current_stream()

    if args.progress:  # Print newline to split stderr from stdout for humans.
        print >>stderr

    # Stays None if the collection could not be created, so that the code
    # below reports failure instead of tripping over an unbound name.
    output = None
    if args.stream:
        output = writer.manifest_text()
        if args.normalize:
            output = arvados.collection.CollectionReader(output).manifest_text(normalize=True)
    elif args.raw:
        output = ','.join(writer.data_locators())
    else:
        try:
            manifest_text = writer.manifest_text()
            if args.normalize:
                manifest_text = arvados.collection.CollectionReader(manifest_text).manifest_text(normalize=True)
            replication_attr = 'replication_desired'
            if api_client._schema.schemas['Collection']['properties'].get(replication_attr, None) is None:
                # API called it 'redundancy' before #3410.
                replication_attr = 'redundancy'
            # Register the resulting collection in Arvados.
            collection = api_client.collections().create(
                body={
                    'owner_uuid': project_uuid,
                    'name': collection_name,
                    'manifest_text': manifest_text,
                    replication_attr: args.replication,
                    },
                ensure_unique_name=True
                ).execute(num_retries=args.retries)

            print >>stderr, "Collection saved as '%s'" % collection['name']

            if args.portable_data_hash and 'portable_data_hash' in collection and collection['portable_data_hash']:
                output = collection['portable_data_hash']
            else:
                output = collection['uuid']

        except apiclient_errors.Error as error:
            print >>stderr, (
                "arv-put: Error creating Collection on project: {}.".format(
                    error))
            status = 1

    # Print the locator (uuid) of the new collection.
    if output is None:
        status = status or 1
    else:
        stdout.write(output)
        if not output.endswith('\n'):
            stdout.write('\n')

    for sigcode, orig_handler in orig_signal_handlers.items():
        signal.signal(sigcode, orig_handler)

    if status != 0:
        # The resume cache is deliberately left in place on failure.
        sys.exit(status)

    if resume_cache is not None:
        resume_cache.destroy()

    return output
547
# Run the uploader when this file is executed as a script.
if __name__ == '__main__':
    main()