sdk/python/arvados/commands/get.py

   1 #!/usr/bin/env python
   2 # Copyright (C) The Arvados Authors. All rights reserved.
   3 #
   4 # SPDX-License-Identifier: Apache-2.0
   5
   6 import argparse
   7 import hashlib
   8 import os
   9 import re
  10 import string
  11 import sys
  12 import logging
  13
  14 import arvados
  15 import arvados.commands._util as arv_cmd
  16 import arvados.util as util
  17
  18 from arvados._version import __version__
  19
  20 api_client = None
  21 logger = logging.getLogger('arvados.arv-get')
  22
  23 parser = argparse.ArgumentParser(
  24     description='Copy data from Keep to a local file or pipe.',
  25     parents=[arv_cmd.retry_opt])
  26 parser.add_argument('--version', action='version',
  27                     version="%s %s" % (sys.argv[0], __version__),
  28                     help='Print version and exit.')
  29 parser.add_argument('locator', type=str,
  30                     help="""
  31 Collection locator, optionally with a file path or prefix.
  32 """)
  33 parser.add_argument('destination', type=str, nargs='?', default='-',
  34                     help="""
  35 Local file or directory where the data is to be written. Default: stdout.
  36 """)
  37 group = parser.add_mutually_exclusive_group()
  38 group.add_argument('--progress', action='store_true',
  39                    help="""
  40 Display human-readable progress on stderr (bytes and, if possible,
  41 percentage of total data size). This is the default behavior when it
  42 is not expected to interfere with the output: specifically, stderr is
  43 a tty _and_ either stdout is not a tty, or output is being written to
  44 named files rather than stdout.
  45 """)
  46 group.add_argument('--no-progress', action='store_true',
  47                    help="""
  48 Do not display human-readable progress on stderr.
  49 """)
  50 group.add_argument('--batch-progress', action='store_true',
  51                    help="""
  52 Display machine-readable progress on stderr (bytes and, if known,
  53 total data size).
  54 """)
  55 group = parser.add_mutually_exclusive_group()
  56 group.add_argument('--hash',
  57                     help="""
  58 Display the hash of each file as it is read from Keep, using the given
  59 hash algorithm. Supported algorithms include md5, sha1, sha224,
  60 sha256, sha384, and sha512.
  61 """)
  62 group.add_argument('--md5sum', action='store_const',
  63                     dest='hash', const='md5',
  64                     help="""
  65 Display the MD5 hash of each file as it is read from Keep.
  66 """)
  67 parser.add_argument('-n', action='store_true',
  68                     help="""
  69 Do not write any data -- just read from Keep, and report md5sums if
  70 requested.
  71 """)
  72 parser.add_argument('-r', action='store_true',
  73                     help="""
  74 Retrieve all files in the specified collection/prefix. This is the
  75 default behavior if the "locator" argument ends with a forward slash.
  76 """)
  77 group = parser.add_mutually_exclusive_group()
  78 group.add_argument('-f', action='store_true',
  79                    help="""
  80 Overwrite existing files while writing. The default behavior is to
  81 refuse to write *anything* if any of the output files already
  82 exist. As a special case, -f is not needed to write to stdout.
  83 """)
  84 group.add_argument('--skip-existing', action='store_true',
  85                    help="""
  86 Skip files that already exist. The default behavior is to refuse to
  87 write *anything* if any files exist that would have to be
  88 overwritten. This option causes even devices, sockets, and fifos to be
  89 skipped.
  90 """)
  91 group.add_argument('--strip-manifest', action='store_true', default=False,
  92                    help="""
  93 When getting a collection manifest, strip its access tokens before writing
  94 it.
  95 """)
  96
  97 def parse_arguments(arguments, stdout, stderr):
  98     args = parser.parse_args(arguments)
  99
 100     if args.locator[-1] == os.sep:
 101         args.r = True
 102     if (args.r and
 103         not args.n and
 104         not (args.destination and
 105              os.path.isdir(args.destination))):
 106         parser.error('Destination is not a directory.')
 107     if not args.r and (os.path.isdir(args.destination) or
 108                        args.destination[-1] == os.path.sep):
 109         args.destination = os.path.join(args.destination,
 110                                         os.path.basename(args.locator))
 111         logger.debug("Appended source file name to destination directory: %s",
 112                      args.destination)
 113
 114     if args.destination == '/dev/stdout':
 115         args.destination = "-"
 116
 117     if args.destination == '-':
 118         # Normally you have to use -f to write to a file (or device) that
 119         # already exists, but "-" and "/dev/stdout" are common enough to
 120         # merit a special exception.
 121         args.f = True
 122     else:
 123         args.destination = args.destination.rstrip(os.sep)
 124
 125     # Turn on --progress by default if stderr is a tty and output is
 126     # either going to a named file, or going (via stdout) to something
 127     # that isn't a tty.
 128     if (not (args.batch_progress or args.no_progress)
 129         and stderr.isatty()
 130         and (args.destination != '-'
 131              or not stdout.isatty())):
 132         args.progress = True
 133     return args
 134
 135 def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
 136     global api_client
 137
 138     if stdout is sys.stdout and hasattr(stdout, 'buffer'):
 139         # in Python 3, write to stdout as binary
 140         stdout = stdout.buffer
 141
 142     args = parse_arguments(arguments, stdout, stderr)
 143     if api_client is None:
 144         api_client = arvados.api('v1')
 145
 146     r = re.search(r'^(.*?)(/.*)?$', args.locator)
 147     col_loc = r.group(1)
 148     get_prefix = r.group(2)
 149     if args.r and not get_prefix:
 150         get_prefix = os.sep
 151     try:
 152         reader = arvados.CollectionReader(col_loc, num_retries=args.retries)
 153     except Exception as error:
 154         logger.error("failed to read collection: {}".format(error))
 155         return 1
 156
 157     # User asked to download the collection's manifest
 158     if not get_prefix:
 159         if not args.n:
 160             open_flags = os.O_CREAT | os.O_WRONLY
 161             if not args.f:
 162                 open_flags |= os.O_EXCL
 163             try:
 164                 if args.destination == "-":
 165                     stdout.write(reader.manifest_text(strip=args.strip_manifest).encode())
 166                 else:
 167                     out_fd = os.open(args.destination, open_flags)
 168                     with os.fdopen(out_fd, 'wb') as out_file:
 169                         out_file.write(reader.manifest_text(strip=args.strip_manifest).encode())
 170             except (IOError, OSError) as error:
 171                 logger.error("can't write to '{}': {}".format(args.destination, error))
 172                 return 1
 173             except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error:
 174                 logger.error("failed to download '{}': {}".format(col_loc, error))
 175                 return 1
 176         return 0
 177
 178     # Scan the collection. Make an array of (stream, file, local
 179     # destination filename) tuples, and add up total size to extract.
 180     todo = []
 181     todo_bytes = 0
 182     try:
 183         if get_prefix == os.sep:
 184             item = reader
 185         else:
 186             item = reader.find('.' + get_prefix)
 187
 188         if isinstance(item, arvados.collection.Subcollection) or isinstance(item, arvados.collection.CollectionReader):
 189             # If the user asked for a file and we got a subcollection, error out.
 190             if get_prefix[-1] != os.sep:
 191                 logger.error("requested file '{}' is in fact a subcollection. Append a trailing '/' to download it.".format('.' + get_prefix))
 192                 return 1
 193             # If the user asked stdout as a destination, error out.
 194             elif args.destination == '-':
 195                 logger.error("cannot use 'stdout' as destination when downloading multiple files.")
 196                 return 1
 197             # User asked for a subcollection, and that's what was found. Add up total size
 198             # to download.
 199             for s, f in files_in_collection(item):
 200                 dest_path = os.path.join(
 201                     args.destination,
 202                     os.path.join(s.stream_name(), f.name)[len(get_prefix)+1:])
 203                 if (not (args.n or args.f or args.skip_existing) and
 204                     os.path.exists(dest_path)):
 205                     logger.error('Local file %s already exists.' % (dest_path,))
 206                     return 1
 207                 todo += [(s, f, dest_path)]
 208                 todo_bytes += f.size()
 209         elif isinstance(item, arvados.arvfile.ArvadosFile):
 210             todo += [(item.parent, item, args.destination)]
 211             todo_bytes += item.size()
 212         else:
 213             logger.error("'{}' not found.".format('.' + get_prefix))
 214             return 1
 215     except (IOError, arvados.errors.NotFoundError) as e:
 216         logger.error(e)
 217         return 1
 218
 219     out_bytes = 0
 220     for s, f, outfilename in todo:
 221         outfile = None
 222         digestor = None
 223         if not args.n:
 224             if outfilename == "-":
 225                 outfile = stdout
 226             else:
 227                 if args.skip_existing and os.path.exists(outfilename):
 228                     logger.debug('Local file %s exists. Skipping.', outfilename)
 229                     continue
 230                 elif not args.f and (os.path.isfile(outfilename) or
 231                                    os.path.isdir(outfilename)):
 232                     # Good thing we looked again: apparently this file wasn't
 233                     # here yet when we checked earlier.
 234                     logger.error('Local file %s already exists.' % (outfilename,))
 235                     return 1
 236                 if args.r:
 237                     arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
 238                 try:
 239                     outfile = open(outfilename, 'wb')
 240                 except Exception as error:
 241                     logger.error('Open(%s) failed: %s' % (outfilename, error))
 242                     return 1
 243         if args.hash:
 244             digestor = hashlib.new(args.hash)
 245         try:
 246             with s.open(f.name, 'rb') as file_reader:
 247                 for data in file_reader.readall():
 248                     if outfile:
 249                         outfile.write(data)
 250                     if digestor:
 251                         digestor.update(data)
 252                     out_bytes += len(data)
 253                     if args.progress:
 254                         stderr.write('\r%d MiB / %d MiB %.1f%%' %
 255                                      (out_bytes >> 20,
 256                                       todo_bytes >> 20,
 257                                       (100
 258                                        if todo_bytes==0
 259                                        else 100.0*out_bytes/todo_bytes)))
 260                     elif args.batch_progress:
 261                         stderr.write('%s %d read %d total\n' %
 262                                      (sys.argv[0], os.getpid(),
 263                                       out_bytes, todo_bytes))
 264             if digestor:
 265                 stderr.write("%s  %s/%s\n"
 266                              % (digestor.hexdigest(), s.stream_name(), f.name))
 267         except KeyboardInterrupt:
 268             if outfile and (outfile.fileno() > 2) and not outfile.closed:
 269                 os.unlink(outfile.name)
 270             break
 271         finally:
 272             if outfile != None and outfile != stdout:
 273                 outfile.close()
 274
 275     if args.progress:
 276         stderr.write('\n')
 277     return 0
 278
 279 def files_in_collection(c):
 280     # Sort first by file type, then alphabetically by file path.
 281     for i in sorted(list(c.keys()),
 282                     key=lambda k: (
 283                         isinstance(c[k], arvados.collection.Subcollection),
 284                         k.upper())):
 285         if isinstance(c[i], arvados.arvfile.ArvadosFile):
 286             yield (c, c[i])
 287         elif isinstance(c[i], arvados.collection.Subcollection):
 288             for s, f in files_in_collection(c[i]):
 289                 yield (s, f)