sdk/python/arvados/commands/get.py

   1 #!/usr/bin/env python
   2
   3 import argparse
   4 import hashlib
   5 import os
   6 import re
   7 import string
   8 import sys
   9 import logging
  10
  11 import arvados
  12 import arvados.commands._util as arv_cmd
  13
  14 from arvados._version import __version__
  15
  16 api_client = None
  17 logger = logging.getLogger('arvados.arv-get')
  18
  19 parser = argparse.ArgumentParser(
  20     description='Copy data from Keep to a local file or pipe.',
  21     parents=[arv_cmd.retry_opt])
  22 parser.add_argument('--version', action='version',
  23                     version="%s %s" % (sys.argv[0], __version__),
  24                     help='Print version and exit.')
  25 parser.add_argument('locator', type=str,
  26                     help="""
  27 Collection locator, optionally with a file path or prefix.
  28 """)
  29 parser.add_argument('destination', type=str, nargs='?', default='-',
  30                     help="""
  31 Local file or directory where the data is to be written. Default: stdout.
  32 """)
  33 group = parser.add_mutually_exclusive_group()
  34 group.add_argument('--progress', action='store_true',
  35                    help="""
  36 Display human-readable progress on stderr (bytes and, if possible,
  37 percentage of total data size). This is the default behavior when it
  38 is not expected to interfere with the output: specifically, stderr is
  39 a tty _and_ either stdout is not a tty, or output is being written to
  40 named files rather than stdout.
  41 """)
  42 group.add_argument('--no-progress', action='store_true',
  43                    help="""
  44 Do not display human-readable progress on stderr.
  45 """)
  46 group.add_argument('--batch-progress', action='store_true',
  47                    help="""
  48 Display machine-readable progress on stderr (bytes and, if known,
  49 total data size).
  50 """)
  51 group = parser.add_mutually_exclusive_group()
  52 group.add_argument('--hash',
  53                     help="""
  54 Display the hash of each file as it is read from Keep, using the given
  55 hash algorithm. Supported algorithms include md5, sha1, sha224,
  56 sha256, sha384, and sha512.
  57 """)
  58 group.add_argument('--md5sum', action='store_const',
  59                     dest='hash', const='md5',
  60                     help="""
  61 Display the MD5 hash of each file as it is read from Keep.
  62 """)
  63 parser.add_argument('-n', action='store_true',
  64                     help="""
  65 Do not write any data -- just read from Keep, and report md5sums if
  66 requested.
  67 """)
  68 parser.add_argument('-r', action='store_true',
  69                     help="""
  70 Retrieve all files in the specified collection/prefix. This is the
  71 default behavior if the "locator" argument ends with a forward slash.
  72 """)
  73 group = parser.add_mutually_exclusive_group()
  74 group.add_argument('-f', action='store_true',
  75                    help="""
  76 Overwrite existing files while writing. The default behavior is to
  77 refuse to write *anything* if any of the output files already
  78 exist. As a special case, -f is not needed to write to stdout.
  79 """)
  80 group.add_argument('--skip-existing', action='store_true',
  81                    help="""
  82 Skip files that already exist. The default behavior is to refuse to
  83 write *anything* if any files exist that would have to be
  84 overwritten. This option causes even devices, sockets, and fifos to be
  85 skipped.
  86 """)
  87
  88 def parse_arguments(arguments, stdout, stderr):
  89     args = parser.parse_args(arguments)
  90
  91     if args.locator[-1] == os.sep:
  92         args.r = True
  93     if (args.r and
  94         not args.n and
  95         not (args.destination and
  96              os.path.isdir(args.destination))):
  97         parser.error('Destination is not a directory.')
  98     if not args.r and (os.path.isdir(args.destination) or
  99                        args.destination[-1] == os.path.sep):
 100         args.destination = os.path.join(args.destination,
 101                                         os.path.basename(args.locator))
 102         logger.debug("Appended source file name to destination directory: %s",
 103                      args.destination)
 104
 105     if args.destination == '/dev/stdout':
 106         args.destination = "-"
 107
 108     if args.destination == '-':
 109         # Normally you have to use -f to write to a file (or device) that
 110         # already exists, but "-" and "/dev/stdout" are common enough to
 111         # merit a special exception.
 112         args.f = True
 113     else:
 114         args.destination = args.destination.rstrip(os.sep)
 115
 116     # Turn on --progress by default if stderr is a tty and output is
 117     # either going to a named file, or going (via stdout) to something
 118     # that isn't a tty.
 119     if (not (args.batch_progress or args.no_progress)
 120         and stderr.isatty()
 121         and (args.destination != '-'
 122              or not stdout.isatty())):
 123         args.progress = True
 124     return args
 125
 126 def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
 127     global api_client
 128
 129     if stdout is sys.stdout and hasattr(stdout, 'buffer'):
 130         # in Python 3, write to stdout as binary
 131         stdout = stdout.buffer
 132
 133     args = parse_arguments(arguments, stdout, stderr)
 134     if api_client is None:
 135         api_client = arvados.api('v1')
 136
 137     r = re.search(r'^(.*?)(/.*)?$', args.locator)
 138     collection = r.group(1)
 139     get_prefix = r.group(2)
 140     if args.r and not get_prefix:
 141         get_prefix = os.sep
 142     try:
 143         reader = arvados.CollectionReader(collection, num_retries=args.retries)
 144     except Exception as error:
 145         logger.error("failed to read collection: {}".format(error))
 146         return 1
 147
 148     # User asked to download the collection's manifest
 149     if not get_prefix:
 150         if not args.n:
 151             open_flags = os.O_CREAT | os.O_WRONLY
 152             if not args.f:
 153                 open_flags |= os.O_EXCL
 154             try:
 155                 if args.destination == "-":
 156                     stdout.write(reader.manifest_text().encode())
 157                 else:
 158                     out_fd = os.open(args.destination, open_flags)
 159                     with os.fdopen(out_fd, 'wb') as out_file:
 160                         out_file.write(reader.manifest_text().encode())
 161             except (IOError, OSError) as error:
 162                 logger.error("can't write to '{}': {}".format(args.destination, error))
 163                 return 1
 164             except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error:
 165                 logger.error("failed to download '{}': {}".format(collection, error))
 166                 return 1
 167         return 0
 168
 169     # Scan the collection. Make an array of (stream, file, local
 170     # destination filename) tuples, and add up total size to extract.
 171     todo = []
 172     todo_bytes = 0
 173     try:
 174         if get_prefix == os.sep:
 175             item = reader
 176         else:
 177             item = reader.find('.' + get_prefix)
 178
 179         if isinstance(item, arvados.collection.Subcollection) or isinstance(item, arvados.collection.CollectionReader):
 180             # If the user asked for a file and we got a subcollection, error out.
 181             if get_prefix[-1] != os.sep:
 182                 logger.error("requested file '{}' is in fact a subcollection. Append a trailing '/' to download it.".format('.' + get_prefix))
 183                 return 1
 184             # If the user asked stdout as a destination, error out.
 185             elif args.destination == '-':
 186                 logger.error("cannot use 'stdout' as destination when downloading multiple files.")
 187                 return 1
 188             # User asked for a subcollection, and that's what was found. Add up total size
 189             # to download.
 190             for s, f in files_in_collection(item):
 191                 dest_path = os.path.join(
 192                     args.destination,
 193                     os.path.join(s.stream_name(), f.name)[len(get_prefix)+1:])
 194                 if (not (args.n or args.f or args.skip_existing) and
 195                     os.path.exists(dest_path)):
 196                     logger.error('Local file %s already exists.' % (dest_path,))
 197                     return 1
 198                 todo += [(s, f, dest_path)]
 199                 todo_bytes += f.size()
 200         elif isinstance(item, arvados.arvfile.ArvadosFile):
 201             todo += [(item.parent, item, args.destination)]
 202             todo_bytes += item.size()
 203         else:
 204             logger.error("'{}' not found.".format('.' + get_prefix))
 205             return 1
 206     except (IOError, arvados.errors.NotFoundError) as e:
 207         logger.error(e)
 208         return 1
 209
 210     out_bytes = 0
 211     for s, f, outfilename in todo:
 212         outfile = None
 213         digestor = None
 214         if not args.n:
 215             if outfilename == "-":
 216                 outfile = stdout
 217             else:
 218                 if args.skip_existing and os.path.exists(outfilename):
 219                     logger.debug('Local file %s exists. Skipping.', outfilename)
 220                     continue
 221                 elif not args.f and (os.path.isfile(outfilename) or
 222                                    os.path.isdir(outfilename)):
 223                     # Good thing we looked again: apparently this file wasn't
 224                     # here yet when we checked earlier.
 225                     logger.error('Local file %s already exists.' % (outfilename,))
 226                     return 1
 227                 if args.r:
 228                     arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
 229                 try:
 230                     outfile = open(outfilename, 'wb')
 231                 except Exception as error:
 232                     logger.error('Open(%s) failed: %s' % (outfilename, error))
 233                     return 1
 234         if args.hash:
 235             digestor = hashlib.new(args.hash)
 236         try:
 237             with s.open(f.name, 'rb') as file_reader:
 238                 for data in file_reader.readall():
 239                     if outfile:
 240                         outfile.write(data)
 241                     if digestor:
 242                         digestor.update(data)
 243                     out_bytes += len(data)
 244                     if args.progress:
 245                         stderr.write('\r%d MiB / %d MiB %.1f%%' %
 246                                      (out_bytes >> 20,
 247                                       todo_bytes >> 20,
 248                                       (100
 249                                        if todo_bytes==0
 250                                        else 100.0*out_bytes/todo_bytes)))
 251                     elif args.batch_progress:
 252                         stderr.write('%s %d read %d total\n' %
 253                                      (sys.argv[0], os.getpid(),
 254                                       out_bytes, todo_bytes))
 255             if digestor:
 256                 stderr.write("%s  %s/%s\n"
 257                              % (digestor.hexdigest(), s.stream_name(), f.name))
 258         except KeyboardInterrupt:
 259             if outfile and (outfile.fileno() > 2) and not outfile.closed:
 260                 os.unlink(outfile.name)
 261             break
 262         finally:
 263             if outfile != None and outfile != stdout:
 264                 outfile.close()
 265
 266     if args.progress:
 267         stderr.write('\n')
 268     return 0
 269
 270 def files_in_collection(c):
 271     # Sort first by file type, then alphabetically by file path.
 272     for i in sorted(list(c.keys()),
 273                     key=lambda k: (
 274                         isinstance(c[k], arvados.collection.Subcollection),
 275                         k.upper())):
 276         if isinstance(c[i], arvados.arvfile.ArvadosFile):
 277             yield (c, c[i])
 278         elif isinstance(c[i], arvados.collection.Subcollection):
 279             for s, f in files_in_collection(c[i]):
 280                 yield (s, f)