#!/usr/bin/env python

# TODO:
# --md5sum - display md5 of each file as read from disk

import argparse
import os
import sys

# ---------------------------------------------------------------------------
# Command-line interface.
#
# Several options are accepted under more than one spelling; every synonym
# stores into the same destination attribute as its canonical option.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(
    description='Copy data from the local filesystem to Keep.')

parser.add_argument(
    'paths',
    metavar='path',
    type=str,
    nargs='*',
    help="""
Local file or directory. Default: read from standard input.
""")

parser.add_argument(
    '--max-manifest-depth',
    type=int,
    metavar='N',
    default=-1,
    help="""
Maximum depth of directory tree to represent in the manifest
structure. A directory structure deeper than this will be represented
as a single stream in the manifest. If N=0, the manifest will contain
a single stream. Default: -1 (unlimited), i.e., exactly one manifest
stream per filesystem directory that contains files.
""")

# Output mode: at most one of stream / manifest / raw may be selected.
output_group = parser.add_mutually_exclusive_group()

output_group.add_argument(
    '--as-stream',
    action='store_true',
    dest='stream',
    help="""
Synonym for --stream.
""")

output_group.add_argument(
    '--stream',
    action='store_true',
    help="""
Store the file content and display the resulting manifest on
stdout. Do not write the manifest to Keep or save a Collection object
in Arvados.
""")

output_group.add_argument(
    '--as-manifest',
    action='store_true',
    dest='manifest',
    help="""
Synonym for --manifest.
""")

output_group.add_argument(
    '--in-manifest',
    action='store_true',
    dest='manifest',
    help="""
Synonym for --manifest.
""")

output_group.add_argument(
    '--manifest',
    action='store_true',
    help="""
Store the file data and resulting manifest in Keep, save a Collection
object in Arvados, and display the manifest locator (Collection uuid)
on stdout. This is the default behavior.
""")

output_group.add_argument(
    '--as-raw',
    action='store_true',
    dest='raw',
    help="""
Synonym for --raw.
""")

output_group.add_argument(
    '--raw',
    action='store_true',
    help="""
Store the file content and display the data block locators on stdout,
separated by commas, with a trailing newline. Do not store a
manifest.
""")

parser.add_argument(
    '--use-filename',
    type=str,
    default=None,
    dest='filename',
    help="""
Synonym for --filename.
""")

parser.add_argument(
    '--filename',
    type=str,
    default=None,
    help="""
Use the given filename in the manifest, instead of the name of the
local file. This is useful when "-" or "/dev/stdin" is given as an
input file. It can be used only if there is exactly one path given and
it is not a directory. Implies --manifest.
""")

# Progress reporting: at most one of the three styles may be selected.
progress_group = parser.add_mutually_exclusive_group()

progress_group.add_argument(
    '--progress',
    action='store_true',
    help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when
stderr is a tty.
""")

progress_group.add_argument(
    '--no-progress',
    action='store_true',
    help="""
Do not display human-readable progress on stderr, even if stderr is a
tty.
""")

progress_group.add_argument(
    '--batch-progress',
    action='store_true',
    help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")

args = parser.parse_args()

# With no paths on the command line, read from standard input.
if not args.paths:
    args.paths.append('/dev/stdin')

# --filename renames exactly one regular input; reject it for a directory
# or for more than one path.
single_file_input = (len(args.paths) == 1
                     and not os.path.isdir(args.paths[0]))
if args.filename and not single_file_input:
    parser.error("""
--filename argument cannot be used when storing a directory or
multiple files.
""")

# Turn on --progress by default if stderr is a tty.
if (not (args.batch_progress or args.no_progress)
        and os.isatty(sys.stderr.fileno())):
    args.progress = True

# NOTE(review): these imports sit after argument parsing, presumably so that
# --help and usage errors do not depend on the arvados package -- confirm
# before moving them to the top of the file.
import arvados
import re
import string


class CollectionWriterWithProgress(arvados.CollectionWriter):
    """CollectionWriter that reports upload progress on stderr.

    Behaviour is driven by attributes set on the instance by the caller:

    display_type   -- 'human' for a single self-overwriting status line,
                      'machine' for one parseable line per flush, or a
                      false value / unset for no reporting at all.
    bytes_expected -- optional total input size in bytes; when present and
                      positive, the human display includes a percentage.
    bytes_flushed  -- running byte count maintained by flush_data().
    """

    def flush_data(self, *args, **kwargs):
        """Flush buffered data via the parent class, then report progress.

        All arguments are passed through to the parent's flush_data().
        Always returns None.
        """
        if not getattr(self, 'display_type', None):
            # Reporting is disabled (never enabled, or already switched off
            # by manifest_text()). The data must still be flushed: the
            # previous early return skipped the parent call entirely, so
            # buffered data was never written in this state.
            super(CollectionWriterWithProgress, self).flush_data(
                *args, **kwargs)
            return
        if not hasattr(self, 'bytes_flushed'):
            self.bytes_flushed = 0
        # Add the buffer length before the parent call and subtract what is
        # left afterwards: the net change is the number of bytes the parent
        # removed from the buffer during this flush.
        self.bytes_flushed += self._data_buffer_len
        super(CollectionWriterWithProgress, self).flush_data(*args, **kwargs)
        self.bytes_flushed -= self._data_buffer_len
        if self.display_type == 'machine':
            # -1 stands for "total size unknown".
            sys.stderr.write('%s %d: %d written %d total\n' %
                             (sys.argv[0], os.getpid(),
                              self.bytes_flushed,
                              getattr(self, 'bytes_expected', -1)))
        elif getattr(self, 'bytes_expected', 0) > 0:
            pct = 100.0 * self.bytes_flushed / self.bytes_expected
            sys.stderr.write('\r%dM / %dM %.1f%% ' %
                             (self.bytes_flushed >> 20,
                              self.bytes_expected >> 20, pct))
        else:
            sys.stderr.write('\r%d ' % self.bytes_flushed)

    def manifest_text(self, *args, **kwargs):
        """Return the parent's manifest text and end progress reporting.

        On the first call with reporting enabled, the human-readable status
        line is terminated with a newline and display_type is cleared, so
        later flushes and calls stay silent.
        """
        manifest_text = (super(CollectionWriterWithProgress, self)
                         .manifest_text(*args, **kwargs))
        if getattr(self, 'display_type', None):
            if self.display_type == 'human':
                sys.stderr.write('\n')
            self.display_type = None
        return manifest_text


# Choose a writer that matches the requested progress style.
if args.progress:
    writer = CollectionWriterWithProgress()
    writer.display_type = 'human'
elif args.batch_progress:
    writer = CollectionWriterWithProgress()
    writer.display_type = 'machine'
else:
    writer = arvados.CollectionWriter()

# A lone "-" means standard input; unless a name was given explicitly, the
# resulting file is called "-" in the manifest.
if args.paths == ['-']:
    args.paths = ['/dev/stdin']
    if not args.filename:
        args.filename = '-'

# Walk the given directory trees and stat files, adding up file sizes,
# so we can display progress as percent
writer.bytes_expected = 0
for path in args.paths:
    if os.path.isdir(path):
        for filename in arvados.util.listdir_recursive(path):
            writer.bytes_expected += os.path.getsize(
                os.path.join(path, filename))
    elif not os.path.isfile(path):
        # Not a regular file (e.g. a pipe or device): the total size cannot
        # be known, so drop the estimate altogether.
        del writer.bytes_expected
        break
    else:
        writer.bytes_expected += os.path.getsize(path)

# Copy file data to Keep.
for path in args.paths: if os.path.isdir(path): writer.write_directory_tree(path, max_manifest_depth=args.max_manifest_depth) else: writer.start_new_stream() writer.start_new_file(args.filename or os.path.split(path)[1]) with open(path, 'rb') as f: while True: buf = f.read(2**26) if len(buf) == 0: break writer.write(buf) if args.stream: print writer.manifest_text(), elif args.raw: writer.finish_current_stream() print string.join(writer.data_locators(), ',') else: manifest_locator = writer.finish() # The manifest locator is also used as its UUID. Remove any # signature Keep may have added to the locator; it should not be # considered part of the UUID and will confuse apiserver if the # signature is passed to arvados.api().collections().get(). # apiserver will resolve permissions by checking permission links # anyway. # TODO(twp,tomclegg): re-evaluate the value of storing manifests # in Keep at all. manifest_uuid = re.sub(r'\+A[a-z0-9@_-]+', '', manifest_locator) # Register the resulting collection in Arvados. arvados.api().collections().create( body={ 'uuid': manifest_uuid, 'manifest_text': writer.manifest_text(), }, ).execute() # Print the locator (uuid) of the new collection. print manifest_uuid