12 import arvados.commands._util as arv_cmd
14 from arvados._version import __version__
def abort(msg, code=1):
    # Print an error message (prefixed with the tool name) to stderr.
    # NOTE(review): this excerpt is elided; the function presumably also
    # calls sys.exit(code) on a line not visible here -- confirm against
    # the full file.  Python 2 print-chevron syntax.
    print >>sys.stderr, "arv-get:", msg
# Command-line interface for arv-get: copy data from Keep to a local
# file or pipe.
#
# NOTE(review): this excerpt is elided.  The bare prose lines below are
# the interiors of help="""...""" strings whose opening/closing quote
# lines are not visible here -- confirm against the full file before
# treating this as runnable.
parser = argparse.ArgumentParser(
    description='Copy data from Keep to a local file or pipe.',
    parents=[arv_cmd.retry_opt])
parser.add_argument('--version', action='version',
                    version="%s %s" % (sys.argv[0], __version__),
                    help='Print version and exit.')
# Positional arguments: source locator and optional destination
# (default "-" writes to stdout).
parser.add_argument('locator', type=str,
Collection locator, optionally with a file path or prefix.
parser.add_argument('destination', type=str, nargs='?', default='-',
Local file or directory where the data is to be written. Default: stdout.
# Progress reporting: --progress / --no-progress / --batch-progress are
# mutually exclusive.
group = parser.add_mutually_exclusive_group()
group.add_argument('--progress', action='store_true',
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when it
is not expected to interfere with the output: specifically, stderr is
a tty _and_ either stdout is not a tty, or output is being written to
named files rather than stdout.
group.add_argument('--no-progress', action='store_true',
Do not display human-readable progress on stderr.
group.add_argument('--batch-progress', action='store_true',
Display machine-readable progress on stderr (bytes and, if known,
# Hash reporting: --hash ALGORITHM and its --md5sum shorthand are
# mutually exclusive (both store into args.hash).
group = parser.add_mutually_exclusive_group()
group.add_argument('--hash',
Display the hash of each file as it is read from Keep, using the given
hash algorithm. Supported algorithms include md5, sha1, sha224,
sha256, sha384, and sha512.
group.add_argument('--md5sum', action='store_const',
                   dest='hash', const='md5',
Display the MD5 hash of each file as it is read from Keep.
# -n: dry run (read and hash only, write nothing).
parser.add_argument('-n', action='store_true',
Do not write any data -- just read from Keep, and report md5sums if
# -r: recursive retrieval of a collection/prefix.
parser.add_argument('-r', action='store_true',
Retrieve all files in the specified collection/prefix. This is the
default behavior if the "locator" argument ends with a forward slash.
# Existing-file policy: -f (overwrite) and --skip-existing are
# mutually exclusive.
group = parser.add_mutually_exclusive_group()
group.add_argument('-f', action='store_true',
Overwrite existing files while writing. The default behavior is to
refuse to write *anything* if any of the output files already
exist. As a special case, -f is not needed to write to stdout.
group.add_argument('--skip-existing', action='store_true',
Skip files that already exist. The default behavior is to refuse to
write *anything* if any files exist that would have to be
overwritten. This option causes even devices, sockets, and fifos to be
def parse_arguments(arguments, logger):
    """Parse and normalize arv-get command-line arguments.

    Normalizations visible in this excerpt: append the source file name
    to a directory destination, map '/dev/stdout' to '-', and decide the
    default progress-reporting mode.

    NOTE(review): this excerpt is elided -- several `if` headers, an
    `else:` branch, and (presumably) the trailing `return args` are not
    visible.  Orphaned continuation lines below belong to conditions
    whose opening lines are missing; confirm against the full file.
    """
    # NOTE(review): `arguments` is not forwarded here; the full file
    # presumably calls parser.parse_args(arguments) -- confirm whether
    # this is an elision artifact.
    args = parser.parse_args()
    # A trailing path separator on the locator implies recursive mode.
    if args.locator[-1] == os.sep:
            # Continuation of an elided condition (recursive download
            # requires a directory destination).
            not (args.destination and
                 os.path.isdir(args.destination))):
            parser.error('Destination is not a directory.')
    # Non-recursive download into a directory: write to
    # <destination>/<basename of locator>.
    if not args.r and (os.path.isdir(args.destination) or
                       args.destination[-1] == os.path.sep):
        args.destination = os.path.join(args.destination,
                                        os.path.basename(args.locator))
        # NOTE(review): the logging call's final argument line is elided.
        logger.debug("Appended source file name to destination directory: %s",
    # Treat '/dev/stdout' exactly like the conventional '-'.
    if args.destination == '/dev/stdout':
        args.destination = "-"
    if args.destination == '-':
        # Normally you have to use -f to write to a file (or device) that
        # already exists, but "-" and "/dev/stdout" are common enough to
        # merit a special exception.
        # NOTE(review): this line presumably sits in the elided `else:`
        # branch of the '-' check above -- confirm.
        args.destination = args.destination.rstrip(os.sep)
    # Turn on --progress by default if stderr is a tty and output is
    # either going to a named file, or going (via stdout) to something
    if (not (args.batch_progress or args.no_progress)
        and sys.stderr.isatty()
        and (args.destination != '-'
             or not sys.stdout.isatty())):
def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    """arv-get entry point: download a collection (or files within it)
    from Keep to stdout or local files.

    NOTE(review): this excerpt is elided.  Several `try:` headers,
    `else:` branches, accumulator initializations (todo, todo_bytes,
    out_bytes), and exit paths are not visible; orphaned `except`
    clauses and continuation lines below belong to those missing
    constructs.  Confirm against the full file.
    """
    logger = logging.getLogger('arvados.arv-get')
    args = parse_arguments(arguments, logger)
    # NOTE(review): api_client is presumably a module-level global
    # declared outside this excerpt (e.g. `global api_client`) --
    # confirm; it is lazily initialized here on first use.
    if api_client is None:
        api_client = arvados.api('v1')
    # Split the locator into the collection identifier (group 1) and an
    # optional "/path" suffix naming a file or prefix inside it
    # (group 2, may be None).
    r = re.search(r'^(.*?)(/.*)?$', args.locator)
    collection = r.group(1)
    get_prefix = r.group(2)
    # -r with a bare collection locator: elided body presumably sets
    # get_prefix to os.sep so the whole collection is fetched.
    if args.r and not get_prefix:
    reader = arvados.CollectionReader(collection, num_retries=args.retries)
    # Manifest-only path (no file path given): write the manifest text
    # to stdout or to the destination file.  O_EXCL (added unless -f)
    # makes os.open fail rather than clobber an existing file.
    open_flags = os.O_CREAT | os.O_WRONLY
    open_flags |= os.O_EXCL
    if args.destination == "-":
        sys.stdout.write(reader.manifest_text())
    out_fd = os.open(args.destination, open_flags)
    with os.fdopen(out_fd, 'wb') as out_file:
        out_file.write(reader.manifest_text())
    # Orphaned except clauses: their `try:` headers are elided.
    except (IOError, OSError) as error:
        abort("can't write to '{}': {}".format(args.destination, error))
    except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error:
        abort("failed to download '{}': {}".format(collection, error))
    # Scan the collection. Make an array of (stream, file, local
    # destination filename) tuples, and add up total size to extract.
    for s in reader.all_streams():
        for f in s.all_files():
            if get_prefix and get_prefix[-1] == os.sep:
                # Python 2 `string` module; skip files whose joined
                # stream/file path does not start with the prefix
                # (comparison operand elided).
                if 0 != string.find(os.path.join(s.name(), f.name()),
                if args.destination == "-":
                # Map the in-collection path (minus prefix) under the
                # destination directory (first join operand elided).
                dest_path = os.path.join(
                    os.path.join(s.name(), f.name())[len(get_prefix)+1:])
                # Up-front existence check; rechecked again at write time.
                if (not (args.n or args.f or args.skip_existing) and
                        os.path.exists(dest_path)):
                    abort('Local file %s already exists.' % (dest_path,))
            # Exact-file (non-prefix) match path.
            if os.path.join(s.name(), f.name()) != '.' + get_prefix:
            dest_path = args.destination
            todo += [(s, f, dest_path)]
            todo_bytes += f.size()
    except arvados.errors.NotFoundError as e:
    # Read data, and (if not -n) write to local file(s) or pipe.
    for s,f,outfilename in todo:
        if outfilename == "-":
        if args.skip_existing and os.path.exists(outfilename):
            logger.debug('Local file %s exists. Skipping.', outfilename)
        elif not args.f and (os.path.isfile(outfilename) or
                             os.path.isdir(outfilename)):
            # Good thing we looked again: apparently this file wasn't
            # here yet when we checked earlier.
            abort('Local file %s already exists.' % (outfilename,))
        # mkdir -p equivalent for the destination's parent directory.
        arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
        outfile = open(outfilename, 'wb')
        except Exception as error:
            abort('Open(%s) failed: %s' % (outfilename, error))
        # Incremental hash of the file contents as requested by --hash /
        # --md5sum.
        digestor = hashlib.new(args.hash)
        for data in f.readall():
            digestor.update(data)
            out_bytes += len(data)
            # Human-readable progress (format arguments partly elided).
            sys.stderr.write('\r%d MiB / %d MiB %.1f%%' %
                             else 100.0*out_bytes/todo_bytes)))
        elif args.batch_progress:
            # Machine-readable progress: "<prog> <pid> read <n> total".
            sys.stderr.write('%s %d read %d total\n' %
                             (sys.argv[0], os.getpid(),
                              out_bytes, todo_bytes))
        # Report the per-file digest in "hash stream/file" form.
        sys.stderr.write("%s %s/%s\n"
                         % (digestor.hexdigest(), s.name(), f.name()))
    except KeyboardInterrupt:
        # On Ctrl-C, remove the partially written file -- but never
        # unlink stdin/stdout/stderr (fileno <= 2).
        if outfile and (outfile.fileno() > 2) and not outfile.closed:
            os.unlink(outfile.name)
    # Finish the progress line with a newline.
    sys.stderr.write('\n')