sdk/python/bin/arv-get

   1 #!/usr/bin/env python
   2
   3 import argparse
   4 import hashlib
   5 import os
   6 import re
   7 import string
   8 import sys
   9 import logging
  10
  11 import arvados
  12
  13 logger = logging.getLogger('arvados.arv-get')
  14
  15 def abort(msg, code=1):
  16     print >>sys.stderr, "arv-get:", msg
  17     exit(code)
  18
  19 parser = argparse.ArgumentParser(
  20     description='Copy data from Keep to a local file or pipe.')
  21 parser.add_argument('locator', type=str,
  22                     help="""
  23 Collection locator, optionally with a file path or prefix.
  24 """)
  25 parser.add_argument('destination', type=str, nargs='?', default='/dev/stdout',
  26                     help="""
  27 Local file or directory where the data is to be written. Default:
  28 /dev/stdout.
  29 """)
  30 group = parser.add_mutually_exclusive_group()
  31 group.add_argument('--progress', action='store_true',
  32                    help="""
  33 Display human-readable progress on stderr (bytes and, if possible,
  34 percentage of total data size). This is the default behavior when it
  35 is not expected to interfere with the output: specifically, stderr is
  36 a tty _and_ either stdout is not a tty, or output is being written to
  37 named files rather than stdout.
  38 """)
  39 group.add_argument('--no-progress', action='store_true',
  40                    help="""
  41 Do not display human-readable progress on stderr.
  42 """)
  43 group.add_argument('--batch-progress', action='store_true',
  44                    help="""
  45 Display machine-readable progress on stderr (bytes and, if known,
  46 total data size).
  47 """)
  48 group = parser.add_mutually_exclusive_group()
  49 group.add_argument('--hash',
  50                     help="""
  51 Display the hash of each file as it is read from Keep, using the given
  52 hash algorithm. Supported algorithms include md5, sha1, sha224,
  53 sha256, sha384, and sha512.
  54 """)
  55 group.add_argument('--md5sum', action='store_const',
  56                     dest='hash', const='md5',
  57                     help="""
  58 Display the MD5 hash of each file as it is read from Keep.
  59 """)
  60 parser.add_argument('-n', action='store_true',
  61                     help="""
  62 Do not write any data -- just read from Keep, and report md5sums if
  63 requested.
  64 """)
  65 parser.add_argument('-r', action='store_true',
  66                     help="""
  67 Retrieve all files in the specified collection/prefix. This is the
  68 default behavior if the "locator" argument ends with a forward slash.
  69 """)
  70 group = parser.add_mutually_exclusive_group()
  71 group.add_argument('-f', action='store_true',
  72                    help="""
  73 Overwrite existing files while writing. The default behavior is to
  74 refuse to write *anything* if any of the output files already
  75 exist. As a special case, -f is not needed to write to /dev/stdout.
  76 """)
  77 group.add_argument('--skip-existing', action='store_true',
  78                    help="""
  79 Skip files that already exist. The default behavior is to refuse to
  80 write *anything* if any files exist that would have to be
  81 overwritten. This option causes even devices, sockets, and fifos to be
  82 skipped.
  83 """)
  84
  85 args = parser.parse_args()
  86
  87 if args.locator[-1] == os.sep:
  88     args.r = True
  89 if (args.r and
  90     not args.n and
  91     not (args.destination and
  92          os.path.isdir(args.destination))):
  93     parser.error('Destination is not a directory.')
  94 if not args.r and (os.path.isdir(args.destination) or
  95                    args.destination[-1] == os.path.sep):
  96     args.destination = os.path.join(args.destination,
  97                                     os.path.basename(args.locator))
  98     logger.debug("Appended source file name to destination directory: %s",
  99                  args.destination)
 100
 101 if args.destination == '-':
 102     args.destination = '/dev/stdout'
 103 if args.destination == '/dev/stdout':
 104     # Normally you have to use -f to write to a file (or device) that
 105     # already exists, but "-" and "/dev/stdout" are common enough to
 106     # merit a special exception.
 107     args.f = True
 108 else:
 109     args.destination = args.destination.rstrip(os.sep)
 110
 111 # Turn on --progress by default if stderr is a tty and output is
 112 # either going to a named file, or going (via stdout) to something
 113 # that isn't a tty.
 114 if (not (args.batch_progress or args.no_progress)
 115     and sys.stderr.isatty()
 116     and (args.destination != '/dev/stdout'
 117          or not sys.stdout.isatty())):
 118     args.progress = True
 119
 120
 121 r = re.search(r'^(.*?)(/.*)?$', args.locator)
 122 collection = r.group(1)
 123 get_prefix = r.group(2)
 124 if args.r and not get_prefix:
 125     get_prefix = os.sep
 126
 127 todo = []
 128 todo_bytes = 0
 129 if not get_prefix:
 130     try:
 131         if not args.n:
 132             if not args.f and os.path.exists(args.destination):
 133                 abort('Local file %s already exists.' % (args.destination,))
 134             with open(args.destination, 'wb') as f:
 135                 try:
 136                     c = arvados.api('v1').collections().get(
 137                         uuid=collection).execute()
 138                     manifest = c['manifest_text']
 139                 except Exception as e:
 140                     logger.warning(
 141                         "Collection %s not found. " +
 142                         "Trying to fetch directly from Keep (deprecated).",
 143                         collection)
 144                     manifest = arvados.Keep.get(collection)
 145                 f.write(manifest)
 146         sys.exit(0)
 147     except arvados.errors.NotFoundError as e:
 148         abort(e)
 149
 150 reader = arvados.CollectionReader(collection)
 151
 152 # Scan the collection. Make an array of (stream, file, local
 153 # destination filename) tuples, and add up total size to extract.
 154
 155 try:
 156     for s in reader.all_streams():
 157         for f in s.all_files():
 158             if get_prefix and get_prefix[-1] == os.sep:
 159                 if 0 != string.find(os.path.join(s.name(), f.name()),
 160                                     '.' + get_prefix):
 161                     continue
 162                 dest_path = os.path.join(
 163                     args.destination,
 164                     os.path.join(s.name(), f.name())[len(get_prefix)+1:])
 165                 if (not (args.n or args.f or args.skip_existing) and
 166                     os.path.exists(dest_path)):
 167                     abort('Local file %s already exists.' % (dest_path,))
 168             else:
 169                 if os.path.join(s.name(), f.name()) != '.' + get_prefix:
 170                     continue
 171                 dest_path = args.destination
 172             todo += [(s, f, dest_path)]
 173             todo_bytes += f.size()
 174 except arvados.errors.NotFoundError as e:
 175     abort(e)
 176
 177 # Read data, and (if not -n) write to local file(s) or pipe.
 178
 179 out_bytes = 0
 180 for s,f,outfilename in todo:
 181     outfile = None
 182     digestor = None
 183     if not args.n:
 184         if args.skip_existing and os.path.exists(outfilename):
 185             logger.debug('Local file %s exists. Skipping.', outfilename)
 186             continue
 187         elif not args.f and (os.path.isfile(outfilename) or
 188                            os.path.isdir(outfilename)):
 189             # Good thing we looked again: apparently this file wasn't
 190             # here yet when we checked earlier.
 191             abort('Local file %s already exists.' % (outfilename,))
 192         if args.r:
 193             arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
 194         try:
 195             outfile = open(outfilename, 'wb')
 196         except Exception as e:
 197             abort('Open(%s) failed: %s' % (outfilename, e))
 198     if args.hash:
 199         digestor = hashlib.new(args.hash)
 200     try:
 201         for data in f.readall():
 202             if outfile:
 203                 outfile.write(data)
 204             if digestor:
 205                 digestor.update(data)
 206             out_bytes += len(data)
 207             if args.progress:
 208                 sys.stderr.write('\r%d MiB / %d MiB %.1f%%' %
 209                                  (out_bytes >> 20,
 210                                   todo_bytes >> 20,
 211                                   (100
 212                                    if todo_bytes==0
 213                                    else 100.0*out_bytes/todo_bytes)))
 214             elif args.batch_progress:
 215                 sys.stderr.write('%s %d read %d total\n' %
 216                                  (sys.argv[0], os.getpid(),
 217                                   out_bytes, todo_bytes))
 218         if digestor:
 219             sys.stderr.write("%s  %s/%s\n"
 220                              % (digestor.hexdigest(), s.name(), f.name()))
 221     except KeyboardInterrupt:
 222         if outfile and outfile != '/dev/stdout':
 223             os.unlink(outfilename)
 224         break
 225
 226 if args.progress:
 227     sys.stderr.write('\n')