#!/usr/bin/env python
"""arv-get: copy data from Arvados Keep to a local file or pipe."""

import argparse
import hashlib
import os
import re
import sys
import logging

logger = logging.getLogger(os.path.basename(sys.argv[0]))

# ---- Command-line interface -----------------------------------------------
parser = argparse.ArgumentParser(
    description='Copy data from Keep to a local file or pipe.')
parser.add_argument('locator', type=str,
                    help="""
Collection locator, optionally with a file path or prefix.
""")
parser.add_argument('destination', type=str, nargs='?', default='/dev/stdout',
                    help="""
Local file or directory where the data is to be written. Default:
/dev/stdout.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--progress', action='store_true',
                   help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when
stderr is a tty and stdout is not a tty.
""")
group.add_argument('--no-progress', action='store_true',
                   help="""
Do not display human-readable progress on stderr.
""")
group.add_argument('--batch-progress', action='store_true',
                   help="""
Display machine-readable progress on stderr (bytes and, if known, total
data size).
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--hash',
                   help="""
Display the hash of each file as it is read from Keep, using the given
hash algorithm. Supported algorithms include md5, sha1, sha224, sha256,
sha384, and sha512.
""")
group.add_argument('--md5sum', action='store_const',
                   dest='hash', const='md5',
                   help="""
Display the MD5 hash of each file as it is read from Keep.
""")
parser.add_argument('-n', action='store_true',
                    help="""
Do not write any data -- just read from Keep, and report md5sums if
requested.
""")
parser.add_argument('-r', action='store_true',
                    help="""
Retrieve all files in the specified collection/prefix. This is the
default behavior if the "locator" argument ends with a forward slash.
""")

args = parser.parse_args()

# A trailing slash on the locator implies recursive retrieval.
if args.locator[-1] == os.sep:
    args.r = True
# Recursive retrieval needs an existing destination directory -- unless -n
# was given, in which case nothing is written anywhere.
if (args.r and
    not args.n and
    not (args.destination and
         os.path.isdir(args.destination))):
    parser.error('Destination is not a directory.')
# Conversely, a single file cannot be written on top of a directory.
if not args.r and (os.path.isdir(args.destination) or
                   args.destination[-1] == os.path.sep):
    parser.error('Destination is a directory.')

# Turn on --progress by default if stderr is a tty and stdout isn't.
if (not (args.batch_progress or args.no_progress)
    and os.isatty(sys.stderr.fileno())
    and not os.isatty(sys.stdout.fileno())):
    args.progress = True

if args.destination == '-':
    args.destination = '/dev/stdout'

args.destination = args.destination.rstrip(os.sep)

# Imported after argument handling so that `--help` and usage errors work
# without a configured Arvados environment.
import arvados

# Split the locator into the collection id and an optional /path-or-prefix.
r = re.search(r'^(.*?)(/.*)?$', args.locator)
collection = r.group(1)
get_prefix = r.group(2)
if args.r and not get_prefix:
    get_prefix = os.sep

todo = []
todo_bytes = 0
if not get_prefix:
    # No path given: fetch the raw collection data in a single request and
    # write it straight to the destination.
    if not args.n:
        with open(args.destination, 'wb') as f:
            f.write(arvados.Keep.get(collection))
    sys.exit(0)

reader = arvados.CollectionReader(collection)

# Scan the collection. Make an array of (stream, file, local
# destination filename) tuples, and add up total size to extract.
try:
    for s in reader.all_streams():
        for f in s.all_files():
            if get_prefix and get_prefix[-1] == os.sep:
                # Prefix mode: take every file whose stream-qualified name
                # falls under the prefix, recreating the layout below it.
                # BUGFIX: the original used string.find(), a function form
                # that was removed in Python 3; str.startswith() expresses
                # the same "match at position 0" test.
                if not os.path.join(s.name(), f.name()).startswith(
                        '.' + get_prefix):
                    continue
                dest_path = os.path.join(
                    args.destination,
                    os.path.join(s.name(), f.name())[len(get_prefix)+1:])
            else:
                # Exact-path mode: take only the single matching file.
                if os.path.join(s.name(), f.name()) != '.' + get_prefix:
                    continue
                dest_path = args.destination
            todo += [(s, f, dest_path)]
            todo_bytes += f.size()
except arvados.errors.NotFoundError as e:
    logger.error(e)
    sys.exit(1)

# Read data, and (if not -n) write to local file(s) or pipe.
# ---- Download loop --------------------------------------------------------
# Stream each selected file out of Keep, optionally writing it locally and/or
# hashing it, while reporting progress in the requested style.
out_bytes = 0
for s, f, outfilename in todo:
    outfile = None
    digestor = None
    if not args.n:
        if args.r:
            # Create the destination's parent directories as needed.
            arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
        try:
            outfile = open(outfilename, 'wb')
        except Exception as e:
            # Best effort: log and keep going; the read below still updates
            # progress and hashes even with nowhere to write.
            logger.error('Open(%s) failed: %s' % (outfilename, e))
    if args.hash:
        digestor = hashlib.new(args.hash)
    try:
        for data in f.readall():
            if outfile:
                outfile.write(data)
            if digestor:
                digestor.update(data)
            out_bytes += len(data)
            if args.progress:
                sys.stderr.write('\r%d MiB / %d MiB %.1f%%' %
                                 (out_bytes >> 20,
                                  todo_bytes >> 20,
                                  (100
                                   if todo_bytes == 0
                                   else 100.0 * out_bytes / todo_bytes)))
            elif args.batch_progress:
                # BUGFIX: the original format string had three conversion
                # specifiers for four arguments, so --batch-progress raised
                # TypeError; the added final %d reports the known total.
                sys.stderr.write('%s %d read %d total %d\n' %
                                 (sys.argv[0], os.getpid(),
                                  out_bytes, todo_bytes))
        if digestor:
            sys.stderr.write("%s %s/%s\n"
                             % (digestor.hexdigest(), s.name(), f.name()))
    except KeyboardInterrupt:
        if outfile:
            # BUGFIX: close the handle before unlinking the partial file
            # (the original unlinked while the file object was still open).
            outfile.close()
            os.unlink(outfilename)
        break
    finally:
        # BUGFIX: the original leaked every outfile; flush/close each file
        # before moving on to the next one.
        if outfile and not outfile.closed:
            outfile.close()
if args.progress:
    sys.stderr.write('\n')