#!/usr/bin/env python

# TODO:
# --md5sum - display md5 of each file as read from disk

import argparse
import os
import sys

# ---------------------------------------------------------------------------
# Command-line interface.
#
# Three groups of options are defined:
#   * positional paths plus --max-manifest-depth,
#   * a mutually exclusive output-format group (--stream / --manifest / --raw
#     and their synonyms, which share a `dest` with the canonical flag),
#   * a mutually exclusive progress-reporting group.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(
    description='Copy data from the local filesystem to Keep.')

parser.add_argument('paths', metavar='path', type=str, nargs='*',
                    help="""
Local file or directory. Default: read from standard input.
""")

parser.add_argument('--max-manifest-depth', type=int, metavar='N', default=-1,
                    help="""
Maximum depth of directory tree to represent in the manifest
structure. A directory structure deeper than this will be represented
as a single stream in the manifest. If N=0, the manifest will contain
a single stream. Default: -1 (unlimited), i.e., exactly one manifest
stream per filesystem directory that contains files.
""")

# Output format: at most one of stream / manifest / raw may be requested.
group = parser.add_mutually_exclusive_group()

group.add_argument('--as-stream', action='store_true', dest='stream',
                   help="""
Synonym for --stream.
""")

group.add_argument('--stream', action='store_true',
                   help="""
Store the file content and display the resulting manifest on
stdout. Do not write the manifest to Keep or save a Collection object
in Arvados.
""")

group.add_argument('--as-manifest', action='store_true', dest='manifest',
                   help="""
Synonym for --manifest.
""")

group.add_argument('--in-manifest', action='store_true', dest='manifest',
                   help="""
Synonym for --manifest.
""")

group.add_argument('--manifest', action='store_true',
                   help="""
Store the file data and resulting manifest in Keep, save a Collection
object in Arvados, and display the manifest locator (Collection uuid)
on stdout. This is the default behavior if more than one path argument
is given, or the path given is a directory, or a --filename argument
is given.
""")

group.add_argument('--as-raw', action='store_true', dest='raw',
                   help="""
Synonym for --raw.
""")

# The help text says "commas" because that is the separator the script
# actually emits for --raw output.
group.add_argument('--raw', action='store_true',
                   help="""
Store the file content and display the data block locators on stdout,
separated by commas, with a trailing newline. Do not store a
manifest. This is the default behavior when reading data from a single
file or standard input.
""")

parser.add_argument('--use-filename', type=str, default=None,
                    dest='filename', help="""
Synonym for --filename.
""")

parser.add_argument('--filename', type=str, default=None,
                    help="""
Use the given filename in the manifest, instead of the name of the
local file. This is useful when "-" or "/dev/stdin" is given as an
input file. It can be used only if there is exactly one path given and
it is not a directory. Implies --manifest.
""")

# Progress reporting: at most one of the three modes may be requested.
group = parser.add_mutually_exclusive_group()

group.add_argument('--progress', action='store_true',
                   help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when
stderr is a tty.
""")

group.add_argument('--no-progress', action='store_true',
                   help="""
Do not display human-readable progress on stderr, even if stderr is a
tty.
""")

group.add_argument('--batch-progress', action='store_true',
                   help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")

args = parser.parse_args()

# With no path arguments, read the data from standard input.
if not args.paths:
    args.paths = ['/dev/stdin']

if len(args.paths) != 1 or os.path.isdir(args.paths[0]):
    # --filename only makes sense for exactly one non-directory input.
    if args.filename:
        parser.error(
            """ --filename argument cannot be used when storing a directory """
            """or multiple files. """)
elif not args.filename and not args.stream and not args.manifest:
    # When reading from a single non-directory, and no --filename is
    # given, default to writing raw blocks rather than a manifest.
    args.raw = True

# Turn on --progress by default if stderr is a tty.
if (not (args.batch_progress or args.no_progress)
        and os.isatty(sys.stderr.fileno())):
    args.progress = True

import arvados
import re
import string


class CollectionWriterWithProgress(arvados.CollectionWriter):
    """CollectionWriter that reports upload progress on stderr.

    Reporting is controlled by attributes assigned by the caller after
    construction:

    * ``display_type`` -- ``'human'`` or ``'machine'``; when unset or
      falsy, no progress is reported.
    * ``bytes_expected`` -- optional total size; when present and
      positive, the human-readable display includes a percentage.

    ``bytes_flushed`` is maintained by this class as a running total.
    """

    def flush_data(self, *args, **kwargs):
        """Flush buffered data via the parent class, then report progress.

        Arguments are passed through unchanged to the parent's
        ``flush_data``. The data is always flushed; only the reporting is
        skipped when ``display_type`` is unset or falsy.
        """
        parent = super(CollectionWriterWithProgress, self)
        if not getattr(self, 'display_type', None):
            # Progress display is off (or was switched off by
            # manifest_text()); the data itself must still be flushed.
            parent.flush_data(*args, **kwargs)
            return
        if not hasattr(self, 'bytes_flushed'):
            self.bytes_flushed = 0
        # Add the buffer length before the flush and subtract whatever
        # remains afterwards, so the net change is the amount the parent
        # actually flushed.
        self.bytes_flushed += self._data_buffer_len
        parent.flush_data(*args, **kwargs)
        self.bytes_flushed -= self._data_buffer_len
        if self.display_type == 'machine':
            sys.stderr.write('%s %d: %d written %d total\n' %
                             (sys.argv[0],
                              os.getpid(),
                              self.bytes_flushed,
                              getattr(self, 'bytes_expected', -1)))
        elif getattr(self, 'bytes_expected', 0) > 0:
            pct = 100.0 * self.bytes_flushed / self.bytes_expected
            sys.stderr.write('\r%dM / %dM %.1f%% ' %
                             (self.bytes_flushed >> 20,
                              self.bytes_expected >> 20, pct))
        else:
            sys.stderr.write('\r%d ' % self.bytes_flushed)

    def manifest_text(self, *args, **kwargs):
        """Return the parent's manifest text and end the progress display.

        On the first call with progress enabled, a newline is written to
        stderr to terminate the human-readable progress line, and
        ``display_type`` is reset to ``None`` so no further progress is
        reported.
        """
        manifest_text = (super(CollectionWriterWithProgress, self)
                         .manifest_text(*args, **kwargs))
        if getattr(self, 'display_type', None):
            if self.display_type == 'human':
                sys.stderr.write('\n')
            self.display_type = None
        return manifest_text


# Pick a writer matching the requested progress mode.
if args.progress:
    writer = CollectionWriterWithProgress()
    writer.display_type = 'human'
elif args.batch_progress:
    writer = CollectionWriterWithProgress()
    writer.display_type = 'machine'
else:
    writer = arvados.CollectionWriter()

# "-" is shorthand for standard input.
args.paths = [('/dev/stdin' if p == '-' else p) for p in args.paths]

# Walk the given directory trees and stat files, adding up file sizes,
# so we can display progress as percent
writer.bytes_expected = 0
for path in args.paths:
    if os.path.isdir(path):
        for filename in arvados.util.listdir_recursive(path):
            writer.bytes_expected += os.path.getsize(
                os.path.join(path, filename))
    elif not os.path.isfile(path):
        # Not a regular file (e.g. /dev/stdin): the total size cannot be
        # determined, so drop the attribute and stop counting.
        del writer.bytes_expected
        break
    else:
        writer.bytes_expected += os.path.getsize(path)

# Copy file data to Keep.
for path in args.paths:
    if os.path.isdir(path):
        writer.write_directory_tree(
            path, max_manifest_depth=args.max_manifest_depth)
    else:
        # Single file (or stdin): one stream containing one file, named
        # by --filename when given, otherwise by the path's basename.
        writer.start_new_stream()
        writer.start_new_file(args.filename or os.path.split(path)[1])
        with open(path, 'rb') as f:
            # Read in 64 MiB (2**26 byte) chunks until EOF (empty read).
            for buf in iter(lambda: f.read(2**26), b''):
                writer.write(buf)

# sys.stdout.write is used instead of the print statement so the output
# is byte-exact and the code parses on both Python 2 and Python 3.
if args.stream:
    # Emit the manifest as-is, without appending anything.
    sys.stdout.write(writer.manifest_text())
elif args.raw:
    writer.finish_current_stream()
    # Comma-separated block locators followed by exactly one newline.
    sys.stdout.write(','.join(writer.data_locators()) + '\n')
else:
    # Finish the collection once and reuse the resulting locator both
    # for the API record and for the output below.
    collection_locator = writer.finish()
    collection_manifest = writer.manifest_text()

    # Register the resulting collection in Arvados.
    arvados.api().collections().create(
        body={
            'uuid': collection_locator,
            'manifest_text': collection_manifest,
        },
    ).execute()

    # Print the locator (uuid) of the new collection.
    sys.stdout.write('%s\n' % collection_locator)