import argparse
import hashlib
import logging
import os
import re
import sys

import arvados
import arvados.commands._util as arv_cmd
import arvados.util as util

from arvados._version import __version__

api_client = None
logger = logging.getLogger('arvados.arv-get')

parser = argparse.ArgumentParser(
    description='Copy data from Keep to a local file or pipe.',
    parents=[arv_cmd.retry_opt])
parser.add_argument('--version', action='version',
                    version="%s %s" % (sys.argv[0], __version__),
                    help='Print version and exit.')
parser.add_argument('locator', type=str,
                    help="""
Collection locator, optionally with a file path or prefix.
""")
parser.add_argument('destination', type=str, nargs='?', default='-',
                    help="""
Local file or directory where the data is to be written. Default: stdout.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--progress', action='store_true',
                   help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when it
is not expected to interfere with the output: specifically, stderr is
a tty _and_ either stdout is not a tty, or output is being written to
named files rather than stdout.
""")
group.add_argument('--no-progress', action='store_true',
                   help="""
Do not display human-readable progress on stderr.
""")
group.add_argument('--batch-progress', action='store_true',
                   help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--hash',
                   help="""
Display the hash of each file as it is read from Keep, using the given
hash algorithm. Supported algorithms include md5, sha1, sha224,
sha256, sha384, and sha512.
""")
group.add_argument('--md5sum', action='store_const',
                   dest='hash', const='md5',
                   help="""
Display the MD5 hash of each file as it is read from Keep.
""")
parser.add_argument('-n', action='store_true',
                    help="""
Do not write any data -- just read from Keep, and report md5sums if
requested.
""")
parser.add_argument('-r', action='store_true',
                    help="""
Retrieve all files in the specified collection/prefix. This is the
default behavior if the "locator" argument ends with a forward slash.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('-f', action='store_true',
                   help="""
Overwrite existing files while writing. The default behavior is to
refuse to write *anything* if any of the output files already
exist. As a special case, -f is not needed to write to stdout.
""")
group.add_argument('--skip-existing', action='store_true',
                   help="""
Skip files that already exist. The default behavior is to refuse to
write *anything* if any files exist that would have to be
overwritten. This option causes even devices, sockets, and fifos to be
skipped.
""")
group.add_argument('--strip-manifest', action='store_true', default=False,
                   help="""
When getting a collection manifest, strip its access tokens before writing
it.
""")
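# Illustrative invocations (the locator below is a placeholder in the
# standard collection-UUID shape, not a real object):
#
#   arv-get zzzzz-4zz18-xxxxxxxxxxxxxxx                # manifest to stdout
#   arv-get zzzzz-4zz18-xxxxxxxxxxxxxxx/foo.txt .      # one file into cwd
#   arv-get -r zzzzz-4zz18-xxxxxxxxxxxxxxx/ ./outdir/  # whole collection
#                                                      # (outdir must exist)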

def parse_arguments(arguments, stdout, stderr):
    args = parser.parse_args(arguments)

    if args.locator[-1] == os.sep:
        args.r = True
    if (args.r and
        not args.n and
        not (args.destination and
             os.path.isdir(args.destination))):
        parser.error('Destination is not a directory.')
    if not args.r and (os.path.isdir(args.destination) or
                       args.destination[-1] == os.path.sep):
        args.destination = os.path.join(args.destination,
                                        os.path.basename(args.locator))
        logger.debug("Appended source file name to destination directory: %s",
                     args.destination)

    if args.destination == '/dev/stdout':
        args.destination = "-"

    if args.destination == '-':
        # Normally you have to use -f to write to a file (or device) that
        # already exists, but "-" and "/dev/stdout" are common enough to
        # merit a special exception.
        args.f = True
    else:
        args.destination = args.destination.rstrip(os.sep)

    # Turn on --progress by default if stderr is a tty and output is
    # either going to a named file, or going (via stdout) to something
    # that isn't a tty.
    if (not (args.batch_progress or args.no_progress)
        and stderr.isatty()
        and (args.destination != '-'
             or not stdout.isatty())):
        args.progress = True
    return args
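# For example (hypothetical shell session): "arv-get <locator> out.txt"
# run from an interactive terminal leaves stderr a tty and writes to a
# named file, so parse_arguments() enables --progress automatically.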

def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    global api_client

    if stdout is sys.stdout and hasattr(stdout, 'buffer'):
        # in Python 3, write to stdout as binary
        stdout = stdout.buffer

    args = parse_arguments(arguments, stdout, stderr)
    if api_client is None:
        api_client = arvados.api('v1')
    r = re.search(r'^(.*?)(/.*)?$', args.locator)
    col_loc = r.group(1)
    get_prefix = r.group(2)
    if args.r and not get_prefix:
        get_prefix = os.sep
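    # How the split above behaves (hypothetical locators, shown for
    # illustration only):
    #   "abc+123"           -> col_loc="abc+123", get_prefix=None
    #   "abc+123/dir/file"  -> col_loc="abc+123", get_prefix="/dir/file"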
    try:
        reader = arvados.CollectionReader(col_loc, num_retries=args.retries)
    except Exception as error:
        logger.error("failed to read collection: {}".format(error))
        return 1
    # User asked to download the collection's manifest
    if not get_prefix:
        if not args.n:
            open_flags = os.O_CREAT | os.O_WRONLY
            if not args.f:
                open_flags |= os.O_EXCL
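            # Without -f, O_EXCL makes os.open() below fail with EEXIST
            # instead of overwriting a file that already exists.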
            try:
                if args.destination == "-":
                    stdout.write(reader.manifest_text(strip=args.strip_manifest).encode())
                else:
                    out_fd = os.open(args.destination, open_flags)
                    with os.fdopen(out_fd, 'wb') as out_file:
                        out_file.write(reader.manifest_text(strip=args.strip_manifest).encode())
            except (IOError, OSError) as error:
                logger.error("can't write to '{}': {}".format(args.destination, error))
                return 1
            except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error:
                logger.error("failed to download '{}': {}".format(col_loc, error))
                return 1
        return 0
    # Scan the collection. Make an array of (stream, file, local
    # destination filename) tuples, and add up total size to extract.
    todo = []
    todo_bytes = 0
    try:
        if get_prefix == os.sep:
            item = reader
        else:
            item = reader.find('.' + get_prefix)

        if isinstance(item, arvados.collection.Subcollection) or isinstance(item, arvados.collection.CollectionReader):
            # If the user asked for a file and we got a subcollection, error out.
            if get_prefix[-1] != os.sep:
                logger.error("requested file '{}' is in fact a subcollection. Append a trailing '/' to download it.".format('.' + get_prefix))
                return 1
            # If the user asked for stdout as a destination, error out.
            elif args.destination == '-':
                logger.error("cannot use 'stdout' as destination when downloading multiple files.")
                return 1
            # User asked for a subcollection, and that's what was found. Add up total size
            # to download.
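            # Note on the slice below: '.' + get_prefix is exactly the
            # first len(get_prefix)+1 characters of each joined path, so
            # stripping them leaves the path relative to the requested
            # subcollection and the local tree mirrors the remote one.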
            for s, f in files_in_collection(item):
                dest_path = os.path.join(
                    args.destination,
                    os.path.join(s.stream_name(), f.name)[len(get_prefix)+1:])
                if (not (args.n or args.f or args.skip_existing) and
                    os.path.exists(dest_path)):
                    logger.error('Local file %s already exists.' % (dest_path,))
                    return 1
                todo += [(s, f, dest_path)]
                todo_bytes += f.size()
        elif isinstance(item, arvados.arvfile.ArvadosFile):
            todo += [(item.parent, item, args.destination)]
            todo_bytes += item.size()
        else:
            logger.error("'{}' not found.".format('.' + get_prefix))
            return 1
    except (IOError, arvados.errors.NotFoundError) as e:
        logger.error(e)
        return 1
    out_bytes = 0
    for s, f, outfilename in todo:
        outfile = None
        digestor = None
        if not args.n:
            if outfilename == "-":
                outfile = stdout
            else:
                if args.skip_existing and os.path.exists(outfilename):
                    logger.debug('Local file %s exists. Skipping.', outfilename)
                    continue
                elif not args.f and (os.path.isfile(outfilename) or
                                     os.path.isdir(outfilename)):
                    # Good thing we looked again: apparently this file wasn't
                    # here yet when we checked earlier.
                    logger.error('Local file %s already exists.' % (outfilename,))
                    return 1
                if args.r:
                    arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
                try:
                    outfile = open(outfilename, 'wb')
                except Exception as error:
                    logger.error('Open(%s) failed: %s' % (outfilename, error))
                    return 1
        if args.hash:
            digestor = hashlib.new(args.hash)
        try:
            with s.open(f.name, 'rb') as file_reader:
                for data in file_reader.readall():
                    if outfile:
                        outfile.write(data)
                    if digestor:
                        digestor.update(data)
                    out_bytes += len(data)
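                    # The progress line below redraws itself in place via
                    # '\r'; ">> 20" converts byte counts to whole MiB.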
                    if args.progress:
                        stderr.write('\r%d MiB / %d MiB %.1f%%' %
                                     (out_bytes >> 20,
                                      todo_bytes >> 20,
                                      (100.0 if todo_bytes == 0
                                       else 100.0*out_bytes/todo_bytes)))
                    elif args.batch_progress:
                        stderr.write('%s %d read %d total\n' %
                                     (sys.argv[0], os.getpid(),
                                      out_bytes, todo_bytes))
            if digestor:
                stderr.write("%s %s/%s\n"
                             % (digestor.hexdigest(), s.stream_name(), f.name))
        except KeyboardInterrupt:
            if outfile and (outfile.fileno() > 2) and not outfile.closed:
                os.unlink(outfile.name)
            break
        finally:
            if outfile is not None and outfile != stdout:
                outfile.close()

    if args.progress:
        stderr.write('\n')
    return 0

def files_in_collection(c):
    # Sort first by file type, then alphabetically by file path.
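    # (In the sort key below, False orders before True, so plain files
    # come ahead of subcollections; upper() makes the name comparison
    # case-insensitive.)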
    for i in sorted(list(c.keys()),
                    key=lambda k: (
                        isinstance(c[k], arvados.collection.Subcollection),
                        k.upper())):
        if isinstance(c[i], arvados.arvfile.ArvadosFile):
            yield (c, c[i])
        elif isinstance(c[i], arvados.collection.Subcollection):
            for s, f in files_in_collection(c[i]):
                yield (s, f)
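
# A minimal usage sketch for files_in_collection() (COLLECTION_UUID is
# a placeholder; assumes the usual ARVADOS_API_* environment is set):
#
#   reader = arvados.CollectionReader(COLLECTION_UUID)
#   for stream, f in files_in_collection(reader):
#       print(os.path.join(stream.stream_name(), f.name), f.size())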