X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/0568c2d42703a7b839f2661968c05a23753f67c3..04efddf61ee4a0e5c65a72a538fe3f026ae94e8e:/sdk/python/arvados/commands/get.py

diff --git a/sdk/python/arvados/commands/get.py b/sdk/python/arvados/commands/get.py
index b23f2d07ed..888fd390f0 100755
--- a/sdk/python/arvados/commands/get.py
+++ b/sdk/python/arvados/commands/get.py
@@ -10,14 +10,12 @@ import logging
 
 import arvados
 import arvados.commands._util as arv_cmd
+import arvados.util as util
 
 from arvados._version import __version__
 
 api_client = None
-
-def abort(msg, code=1):
-    print >>sys.stderr, "arv-get:", msg
-    exit(code)
+logger = logging.getLogger('arvados.arv-get')
 
 parser = argparse.ArgumentParser(
     description='Copy data from Keep to a local file or pipe.',
@@ -87,9 +85,14 @@ write *anything* if any files exist that would have to be
 overwritten. This option causes even devices, sockets, and fifos to be
 skipped.
 """)
+group.add_argument('--strip-manifest', action='store_true', default=False,
+                   help="""
+When getting a collection manifest, strip its access tokens before writing
+it.
+""")
 
-def parse_arguments(arguments, logger):
-    args = parser.parse_args()
+def parse_arguments(arguments, stdout, stderr):
+    args = parser.parse_args(arguments)
 
     if args.locator[-1] == os.sep:
         args.r = True
@@ -120,27 +123,35 @@ def parse_arguments(arguments, logger):
     # either going to a named file, or going (via stdout) to something
     # that isn't a tty.
     if (not (args.batch_progress or args.no_progress)
-        and sys.stderr.isatty()
+        and stderr.isatty()
         and (args.destination != '-'
-             or not sys.stdout.isatty())):
+             or not stdout.isatty())):
         args.progress = True
     return args
 
 def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     global api_client
-    
-    logger = logging.getLogger('arvados.arv-get')
-    args = parse_arguments(arguments, logger)
+
+    if stdout is sys.stdout and hasattr(stdout, 'buffer'):
+        # in Python 3, write to stdout as binary
+        stdout = stdout.buffer
+
+    args = parse_arguments(arguments, stdout, stderr)
     if api_client is None:
         api_client = arvados.api('v1')
 
     r = re.search(r'^(.*?)(/.*)?$', args.locator)
-    collection = r.group(1)
+    col_loc = r.group(1)
     get_prefix = r.group(2)
     if args.r and not get_prefix:
         get_prefix = os.sep
-    reader = arvados.CollectionReader(collection, num_retries=args.retries)
+    try:
+        reader = arvados.CollectionReader(col_loc, num_retries=args.retries)
+    except Exception as error:
+        logger.error("failed to read collection: {}".format(error))
+        return 1
 
+    # User asked to download the collection's manifest
     if not get_prefix:
         if not args.n:
             open_flags = os.O_CREAT | os.O_WRONLY
@@ -148,57 +159,67 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
                 open_flags |= os.O_EXCL
             try:
                 if args.destination == "-":
-                    sys.stdout.write(reader.manifest_text())
+                    stdout.write(reader.manifest_text(strip=args.strip_manifest).encode())
                 else:
                     out_fd = os.open(args.destination, open_flags)
                     with os.fdopen(out_fd, 'wb') as out_file:
-                        out_file.write(reader.manifest_text())
+                        out_file.write(reader.manifest_text(strip=args.strip_manifest).encode())
             except (IOError, OSError) as error:
-                abort("can't write to '{}': {}".format(args.destination, error))
+                logger.error("can't write to '{}': {}".format(args.destination, error))
+                return 1
             except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error:
-                abort("failed to download '{}': {}".format(collection, error))
-        sys.exit(0)
-
-    reader.normalize()
+                logger.error("failed to download '{}': {}".format(col_loc, error))
+                return 1
+        return 0
 
     # Scan the collection. Make an array of (stream, file, local
     # destination filename) tuples, and add up total size to extract.
     todo = []
     todo_bytes = 0
     try:
-        for s in reader.all_streams():
-            for f in s.all_files():
-                if get_prefix and get_prefix[-1] == os.sep:
-                    if 0 != string.find(os.path.join(s.name(), f.name()),
-                                        '.' + get_prefix):
-                        continue
-                    if args.destination == "-":
-                        dest_path = "-"
-                    else:
-                        dest_path = os.path.join(
-                            args.destination,
-                            os.path.join(s.name(), f.name())[len(get_prefix)+1:])
-                        if (not (args.n or args.f or args.skip_existing) and
-                            os.path.exists(dest_path)):
-                            abort('Local file %s already exists.' % (dest_path,))
-                else:
-                    if os.path.join(s.name(), f.name()) != '.' + get_prefix:
-                        continue
-                    dest_path = args.destination
+        if get_prefix == os.sep:
+            item = reader
+        else:
+            item = reader.find('.' + get_prefix)
+
+        if isinstance(item, arvados.collection.Subcollection) or isinstance(item, arvados.collection.CollectionReader):
+            # If the user asked for a file and we got a subcollection, error out.
+            if get_prefix[-1] != os.sep:
+                logger.error("requested file '{}' is in fact a subcollection. Append a trailing '/' to download it.".format('.' + get_prefix))
+                return 1
+            # If the user asked for stdout as the destination, error out.
+            elif args.destination == '-':
+                logger.error("cannot use 'stdout' as destination when downloading multiple files.")
+                return 1
+            # User asked for a subcollection, and that's what was found. Add up total size
+            # to download.
+            for s, f in files_in_collection(item):
+                dest_path = os.path.join(
+                    args.destination,
+                    os.path.join(s.stream_name(), f.name)[len(get_prefix)+1:])
+                if (not (args.n or args.f or args.skip_existing) and
+                    os.path.exists(dest_path)):
+                    logger.error('Local file %s already exists.' % (dest_path,))
+                    return 1
                 todo += [(s, f, dest_path)]
                 todo_bytes += f.size()
-    except arvados.errors.NotFoundError as e:
-        abort(e)
-
-    # Read data, and (if not -n) write to local file(s) or pipe.
+        elif isinstance(item, arvados.arvfile.ArvadosFile):
+            todo += [(item.parent, item, args.destination)]
+            todo_bytes += item.size()
+        else:
+            logger.error("'{}' not found.".format('.' + get_prefix))
+            return 1
+    except (IOError, arvados.errors.NotFoundError) as e:
+        logger.error(e)
+        return 1
 
     out_bytes = 0
-    for s,f,outfilename in todo:
+    for s, f, outfilename in todo:
         outfile = None
         digestor = None
         if not args.n:
             if outfilename == "-":
-                outfile = sys.stdout
+                outfile = stdout
             else:
                 if args.skip_existing and os.path.exists(outfilename):
                     logger.debug('Local file %s exists. Skipping.', outfilename)
@@ -207,40 +228,59 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
                                    os.path.isdir(outfilename)):
                     # Good thing we looked again: apparently this file wasn't
                     # here yet when we checked earlier.
-                    abort('Local file %s already exists.' % (outfilename,))
+                    logger.error('Local file %s already exists.' % (outfilename,))
+                    return 1
                 if args.r:
                     arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
                 try:
                     outfile = open(outfilename, 'wb')
                 except Exception as error:
-                    abort('Open(%s) failed: %s' % (outfilename, error))
+                    logger.error('Open(%s) failed: %s' % (outfilename, error))
+                    return 1
         if args.hash:
             digestor = hashlib.new(args.hash)
         try:
-            for data in f.readall():
-                if outfile:
-                    outfile.write(data)
-                if digestor:
-                    digestor.update(data)
-                out_bytes += len(data)
-                if args.progress:
-                    sys.stderr.write('\r%d MiB / %d MiB %.1f%%' %
+            with s.open(f.name, 'rb') as file_reader:
+                for data in file_reader.readall():
+                    if outfile:
+                        outfile.write(data)
+                    if digestor:
+                        digestor.update(data)
+                    out_bytes += len(data)
+                    if args.progress:
+                        stderr.write('\r%d MiB / %d MiB %.1f%%' %
                                      (out_bytes >> 20,
                                       todo_bytes >> 20,
                                       (100
                                        if todo_bytes==0
                                        else 100.0*out_bytes/todo_bytes)))
-                elif args.batch_progress:
-                    sys.stderr.write('%s %d read %d total\n' %
+                    elif args.batch_progress:
+                        stderr.write('%s %d read %d total\n' %
                                      (sys.argv[0], os.getpid(),
                                       out_bytes, todo_bytes))
             if digestor:
-                sys.stderr.write("%s  %s/%s\n"
-                                 % (digestor.hexdigest(), s.name(), f.name()))
+                stderr.write("%s  %s/%s\n"
+                             % (digestor.hexdigest(), s.stream_name(), f.name))
         except KeyboardInterrupt:
             if outfile and (outfile.fileno() > 2) and not outfile.closed:
                 os.unlink(outfile.name)
             break
+        finally:
+            if outfile != None and outfile != stdout:
+                outfile.close()
 
     if args.progress:
-        sys.stderr.write('\n')
+        stderr.write('\n')
+    return 0
+
+def files_in_collection(c):
+    """Yield (parent, file) pairs under c: files first, then subcollections, by name."""
+    def rank(name):
+        return (isinstance(c[name], arvados.collection.Subcollection), name.upper())
+    for name in sorted(c.keys(), key=rank):
+        entry = c[name]
+        if isinstance(entry, arvados.arvfile.ArvadosFile):
+            yield (c, entry)
+        elif isinstance(entry, arvados.collection.Subcollection):
+            for parent, member in files_in_collection(entry):
+                yield (parent, member)