X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/0568c2d42703a7b839f2661968c05a23753f67c3..04efddf61ee4a0e5c65a72a538fe3f026ae94e8e:/sdk/python/arvados/commands/get.py diff --git a/sdk/python/arvados/commands/get.py b/sdk/python/arvados/commands/get.py index b23f2d07ed..888fd390f0 100755 --- a/sdk/python/arvados/commands/get.py +++ b/sdk/python/arvados/commands/get.py @@ -10,14 +10,12 @@ import logging import arvados import arvados.commands._util as arv_cmd +import arvados.util as util from arvados._version import __version__ api_client = None - -def abort(msg, code=1): - print >>sys.stderr, "arv-get:", msg - exit(code) +logger = logging.getLogger('arvados.arv-get') parser = argparse.ArgumentParser( description='Copy data from Keep to a local file or pipe.', @@ -87,9 +85,14 @@ write *anything* if any files exist that would have to be overwritten. This option causes even devices, sockets, and fifos to be skipped. """) +group.add_argument('--strip-manifest', action='store_true', default=False, + help=""" +When getting a collection manifest, strip its access tokens before writing +it. +""") -def parse_arguments(arguments, logger): - args = parser.parse_args() +def parse_arguments(arguments, stdout, stderr): + args = parser.parse_args(arguments) if args.locator[-1] == os.sep: args.r = True @@ -120,27 +123,35 @@ def parse_arguments(arguments, logger): # either going to a named file, or going (via stdout) to something # that isn't a tty. if (not (args.batch_progress or args.no_progress) - and sys.stderr.isatty() + and stderr.isatty() and (args.destination != '-' - or not sys.stdout.isatty())): + or not stdout.isatty())): args.progress = True return args def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr): global api_client - - logger = logging.getLogger('arvados.arv-get') - args = parse_arguments(arguments, logger) + + if stdout is sys.stdout and hasattr(stdout, 'buffer'): + # in Python 3, write to stdout as binary + stdout = stdout.buffer + + args = parse_arguments(arguments, stdout, stderr) if api_client is None: api_client = arvados.api('v1') r = re.search(r'^(.*?)(/.*)?$', args.locator) - collection = r.group(1) + col_loc = r.group(1) get_prefix = r.group(2) if args.r and not get_prefix: get_prefix = os.sep - reader = arvados.CollectionReader(collection, num_retries=args.retries) + try: + reader = arvados.CollectionReader(col_loc, num_retries=args.retries) + except Exception as error: + logger.error("failed to read collection: {}".format(error)) + return 1 + # User asked to download the collection's manifest if not get_prefix: if not args.n: open_flags = os.O_CREAT | os.O_WRONLY @@ -148,57 +159,67 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr): open_flags |= os.O_EXCL try: if args.destination == "-": - sys.stdout.write(reader.manifest_text()) + stdout.write(reader.manifest_text(strip=args.strip_manifest).encode()) else: out_fd = os.open(args.destination, open_flags) with os.fdopen(out_fd, 'wb') as out_file: - out_file.write(reader.manifest_text()) + out_file.write(reader.manifest_text(strip=args.strip_manifest).encode()) except (IOError, OSError) as error: - abort("can't write to '{}': {}".format(args.destination, error)) + logger.error("can't write to '{}': {}".format(args.destination, error)) + return 1 except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error: - abort("failed to download '{}': {}".format(collection, error)) - sys.exit(0) - - reader.normalize() + logger.error("failed to download '{}': {}".format(col_loc, error)) + return 1 + return 0 # Scan the collection. Make an array of (stream, file, local # destination filename) tuples, and add up total size to extract. todo = [] todo_bytes = 0 try: - for s in reader.all_streams(): - for f in s.all_files(): - if get_prefix and get_prefix[-1] == os.sep: - if 0 != string.find(os.path.join(s.name(), f.name()), - '.' + get_prefix): - continue - if args.destination == "-": - dest_path = "-" - else: - dest_path = os.path.join( - args.destination, - os.path.join(s.name(), f.name())[len(get_prefix)+1:]) - if (not (args.n or args.f or args.skip_existing) and - os.path.exists(dest_path)): - abort('Local file %s already exists.' % (dest_path,)) - else: - if os.path.join(s.name(), f.name()) != '.' + get_prefix: - continue - dest_path = args.destination + if get_prefix == os.sep: + item = reader + else: + item = reader.find('.' + get_prefix) + + if isinstance(item, arvados.collection.Subcollection) or isinstance(item, arvados.collection.CollectionReader): + # If the user asked for a file and we got a subcollection, error out. + if get_prefix[-1] != os.sep: + logger.error("requested file '{}' is in fact a subcollection. Append a trailing '/' to download it.".format('.' + get_prefix)) + return 1 + # If the user asked stdout as a destination, error out. + elif args.destination == '-': + logger.error("cannot use 'stdout' as destination when downloading multiple files.") + return 1 + # User asked for a subcollection, and that's what was found. Add up total size + # to download. + for s, f in files_in_collection(item): + dest_path = os.path.join( + args.destination, + os.path.join(s.stream_name(), f.name)[len(get_prefix)+1:]) + if (not (args.n or args.f or args.skip_existing) and + os.path.exists(dest_path)): + logger.error('Local file %s already exists.' % (dest_path,)) + return 1 todo += [(s, f, dest_path)] todo_bytes += f.size() - except arvados.errors.NotFoundError as e: - abort(e) - - # Read data, and (if not -n) write to local file(s) or pipe. + elif isinstance(item, arvados.arvfile.ArvadosFile): + todo += [(item.parent, item, args.destination)] + todo_bytes += item.size() + else: + logger.error("'{}' not found.".format('.' + get_prefix)) + return 1 + except (IOError, arvados.errors.NotFoundError) as e: + logger.error(e) + return 1 out_bytes = 0 - for s,f,outfilename in todo: + for s, f, outfilename in todo: outfile = None digestor = None if not args.n: if outfilename == "-": - outfile = sys.stdout + outfile = stdout else: if args.skip_existing and os.path.exists(outfilename): logger.debug('Local file %s exists. Skipping.', outfilename) @@ -207,40 +228,59 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr): os.path.isdir(outfilename)): # Good thing we looked again: apparently this file wasn't # here yet when we checked earlier. - abort('Local file %s already exists.' % (outfilename,)) + logger.error('Local file %s already exists.' % (outfilename,)) + return 1 if args.r: arvados.util.mkdir_dash_p(os.path.dirname(outfilename)) try: outfile = open(outfilename, 'wb') except Exception as error: - abort('Open(%s) failed: %s' % (outfilename, error)) + logger.error('Open(%s) failed: %s' % (outfilename, error)) + return 1 if args.hash: digestor = hashlib.new(args.hash) try: - for data in f.readall(): - if outfile: - outfile.write(data) - if digestor: - digestor.update(data) - out_bytes += len(data) - if args.progress: - sys.stderr.write('\r%d MiB / %d MiB %.1f%%' % + with s.open(f.name, 'rb') as file_reader: + for data in file_reader.readall(): + if outfile: + outfile.write(data) + if digestor: + digestor.update(data) + out_bytes += len(data) + if args.progress: + stderr.write('\r%d MiB / %d MiB %.1f%%' % (out_bytes >> 20, todo_bytes >> 20, (100 if todo_bytes==0 else 100.0*out_bytes/todo_bytes))) - elif args.batch_progress: - sys.stderr.write('%s %d read %d total\n' % + elif args.batch_progress: + stderr.write('%s %d read %d total\n' % (sys.argv[0], os.getpid(), out_bytes, todo_bytes)) if digestor: - sys.stderr.write("%s %s/%s\n" - % (digestor.hexdigest(), s.name(), f.name())) + stderr.write("%s %s/%s\n" + % (digestor.hexdigest(), s.stream_name(), f.name)) except KeyboardInterrupt: if outfile and (outfile.fileno() > 2) and not outfile.closed: os.unlink(outfile.name) break + finally: + if outfile != None and outfile != stdout: + outfile.close() if args.progress: - sys.stderr.write('\n') + stderr.write('\n') + return 0 + +def files_in_collection(c): + # Sort first by file type, then alphabetically by file path. + for i in sorted(list(c.keys()), + key=lambda k: ( + isinstance(c[k], arvados.collection.Subcollection), + k.upper())): + if isinstance(c[i], arvados.arvfile.ArvadosFile): + yield (c, c[i]) + elif isinstance(c[i], arvados.collection.Subcollection): + for s, f in files_in_collection(c[i]): + yield (s, f)