# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
import argparse
import hashlib
import logging
import os
import re
import sys

import arvados
import arvados.commands._util as arv_cmd
import arvados.util as util

from arvados._version import __version__

api_client = None
logger = logging.getLogger('arvados.arv-get')

parser = argparse.ArgumentParser(
    description='Copy data from Keep to a local file or pipe.',
    parents=[arv_cmd.retry_opt])
parser.add_argument('--version', action='version',
                    version="%s %s" % (sys.argv[0], __version__),
                    help='Print version and exit.')
parser.add_argument('locator', type=str,
                    help="""
Collection locator, optionally with a file path or prefix.
""")
parser.add_argument('destination', type=str, nargs='?', default='-',
                    help="""
Local file or directory where the data is to be written. Default: stdout.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--progress', action='store_true',
                   help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when it
is not expected to interfere with the output: specifically, stderr is
a tty _and_ either stdout is not a tty, or output is being written to
named files rather than stdout.
""")
group.add_argument('--no-progress', action='store_true',
                   help="""
Do not display human-readable progress on stderr.
""")
group.add_argument('--batch-progress', action='store_true',
                   help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--hash',
                   help="""
Display the hash of each file as it is read from Keep, using the given
hash algorithm. Supported algorithms include md5, sha1, sha224,
sha256, sha384, and sha512.
""")
group.add_argument('--md5sum', action='store_const',
                   dest='hash', const='md5',
                   help="""
Display the MD5 hash of each file as it is read from Keep.
""")
parser.add_argument('-n', action='store_true',
                    help="""
Do not write any data -- just read from Keep, and report md5sums if
requested.
""")
parser.add_argument('-r', action='store_true',
                    help="""
Retrieve all files in the specified collection/prefix. This is the
default behavior if the "locator" argument ends with a forward slash.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('-f', action='store_true',
                   help="""
Overwrite existing files while writing. The default behavior is to
refuse to write *anything* if any of the output files already
exist. As a special case, -f is not needed to write to stdout.
""")
group.add_argument('-v', action='count', default=0,
                   help="""
Once for verbose mode, twice for debug mode.
""")
group.add_argument('--skip-existing', action='store_true',
                   help="""
Skip files that already exist. The default behavior is to refuse to
write *anything* if any files exist that would have to be
overwritten. This option causes even devices, sockets, and fifos to be
skipped.
""")
group.add_argument('--strip-manifest', action='store_true', default=False,
                   help="""
When getting a collection manifest, strip its access tokens before writing
it.
""")
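
# parse_arguments() normalizes the parsed options: a trailing slash on the
# locator implies -r, a directory destination gets the source file name
# appended, '-' and '/dev/stdout' both mean stdout (and imply -f), and
# --progress is enabled by default when stderr is a tty and either stdout
# is not a tty or output goes to named files.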

def parse_arguments(arguments, stdout, stderr):
    args = parser.parse_args(arguments)

    if args.locator[-1] == os.sep:
        args.r = True
    if (args.r and
        not args.n and
        not (args.destination and
             os.path.isdir(args.destination))):
        parser.error('Destination is not a directory.')
    if not args.r and (os.path.isdir(args.destination) or
                       args.destination[-1] == os.path.sep):
        args.destination = os.path.join(args.destination,
                                        os.path.basename(args.locator))
        logger.debug("Appended source file name to destination directory: %s",
                     args.destination)

    if args.destination == '/dev/stdout':
        args.destination = "-"

    if args.destination == '-':
        # Normally you have to use -f to write to a file (or device) that
        # already exists, but "-" and "/dev/stdout" are common enough to
        # merit a special exception.
        args.f = True
    else:
        args.destination = args.destination.rstrip(os.sep)

    # Turn on --progress by default if stderr is a tty and output is
    # either going to a named file, or going (via stdout) to something
    # that isn't a tty.
    if (not (args.batch_progress or args.no_progress)
        and stderr.isatty()
        and (args.destination != '-'
             or not stdout.isatty())):
        args.progress = True
    return args
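
# main() splits the locator into a collection identifier and an optional
# path prefix. With no prefix it writes the collection manifest (or a single
# data block) to the destination; with a prefix it scans the collection,
# builds a list of files to extract, and copies each one, optionally
# reporting progress and per-file hashes.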

def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    global api_client

    if stdout is sys.stdout and hasattr(stdout, 'buffer'):
        # in Python 3, write to stdout as binary
        stdout = stdout.buffer

    args = parse_arguments(arguments, stdout, stderr)
    logger.setLevel(logging.WARNING - 10 * args.v)

    request_id = arvados.util.new_request_id()
    logger.info('X-Request-Id: '+request_id)

    if api_client is None:
        api_client = arvados.api('v1', request_id=request_id)

    r = re.search(r'^(.*?)(/.*)?$', args.locator)
    col_loc = r.group(1)
    get_prefix = r.group(2)
    if args.r and not get_prefix:
        get_prefix = os.sep

    # User asked to download the collection's manifest
    if not get_prefix:
        if not args.n:
            open_flags = os.O_CREAT | os.O_WRONLY
            if not args.f:
                open_flags |= os.O_EXCL
            try:
                if args.destination == "-":
                    write_block_or_manifest(
                        dest=stdout, src=col_loc,
                        api_client=api_client, args=args)
                else:
                    out_fd = os.open(args.destination, open_flags)
                    with os.fdopen(out_fd, 'wb') as out_file:
                        write_block_or_manifest(
                            dest=out_file, src=col_loc,
                            api_client=api_client, args=args)
            except (IOError, OSError) as error:
                logger.error("can't write to '{}': {}".format(args.destination, error))
                return 1
            except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error:
                logger.error("failed to download '{}': {}".format(col_loc, error))
                return 1
            except arvados.errors.ArgumentError as error:
                if 'Argument to CollectionReader' in str(error):
                    logger.error("error reading collection: {}".format(error))
                    return 1
                else:
                    raise
        return 0

    try:
        reader = arvados.CollectionReader(
            col_loc, api_client=api_client, num_retries=args.retries)
    except Exception as error:
        logger.error("failed to read collection: {}".format(error))
        return 1

    # Scan the collection. Make an array of (stream, file, local
    # destination filename) tuples, and add up total size to extract.
    todo = []
    todo_bytes = 0
    try:
        if get_prefix == os.sep:
            item = reader
        else:
            item = reader.find('.' + get_prefix)

        if isinstance(item, arvados.collection.Subcollection) or isinstance(item, arvados.collection.CollectionReader):
            # If the user asked for a file and we got a subcollection, error out.
            if get_prefix[-1] != os.sep:
                logger.error("requested file '{}' is in fact a subcollection. Append a trailing '/' to download it.".format('.' + get_prefix))
                return 1
            # If the user asked for stdout as the destination, error out.
            elif args.destination == '-':
                logger.error("cannot use 'stdout' as destination when downloading multiple files.")
                return 1
            # User asked for a subcollection, and that's what was found. Add up total size
            # to download.
            for s, f in files_in_collection(item):
                dest_path = os.path.join(
                    args.destination,
                    os.path.join(s.stream_name(), f.name)[len(get_prefix)+1:])
                if (not (args.n or args.f or args.skip_existing) and
                    os.path.exists(dest_path)):
                    logger.error('Local file %s already exists.' % (dest_path,))
                    return 1
                todo += [(s, f, dest_path)]
                todo_bytes += f.size()
        elif isinstance(item, arvados.arvfile.ArvadosFile):
            todo += [(item.parent, item, args.destination)]
            todo_bytes += item.size()
        else:
            logger.error("'{}' not found.".format('.' + get_prefix))
            return 1
    except (IOError, arvados.errors.NotFoundError) as e:
        logger.error(e)
        return 1

    out_bytes = 0
    for s, f, outfilename in todo:
        outfile = None
        digestor = None
        if not args.n:
            if outfilename == "-":
                outfile = stdout
            else:
                if args.skip_existing and os.path.exists(outfilename):
                    logger.debug('Local file %s exists. Skipping.', outfilename)
                    continue
                elif not args.f and (os.path.isfile(outfilename) or
                                     os.path.isdir(outfilename)):
                    # Good thing we looked again: apparently this file wasn't
                    # here yet when we checked earlier.
                    logger.error('Local file %s already exists.' % (outfilename,))
                    return 1
                if args.r:
                    arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
                try:
                    outfile = open(outfilename, 'wb')
                except Exception as error:
                    logger.error('Open(%s) failed: %s' % (outfilename, error))
                    return 1
        if args.hash:
            digestor = hashlib.new(args.hash)
        try:
            with s.open(f.name, 'rb') as file_reader:
                for data in file_reader.readall():
                    if outfile:
                        outfile.write(data)
                    if digestor:
                        digestor.update(data)
                    out_bytes += len(data)
                    if args.progress:
                        stderr.write('\r%d MiB / %d MiB %.1f%%' %
                                     (out_bytes >> 20,
                                      todo_bytes >> 20,
                                      (100
                                       if todo_bytes == 0
                                       else 100.0*out_bytes/todo_bytes)))
                    elif args.batch_progress:
                        stderr.write('%s %d read %d total\n' %
                                     (sys.argv[0], os.getpid(),
                                      out_bytes, todo_bytes))
            if digestor:
                stderr.write("%s %s/%s\n"
                             % (digestor.hexdigest(), s.stream_name(), f.name))
        except KeyboardInterrupt:
            if outfile and (outfile.fileno() > 2) and not outfile.closed:
                os.unlink(outfile.name)
            break
        finally:
            if outfile is not None and outfile != stdout:
                outfile.close()

    if args.progress:
        stderr.write('\n')
    return 0
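
# files_in_collection() is a generator that walks a collection recursively,
# yielding (parent collection, ArvadosFile) pairs, with plain files listed
# before subcollections at each level.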

def files_in_collection(c):
    # Sort first by file type, then alphabetically by file path.
    for i in sorted(list(c.keys()),
                    key=lambda k: (
                        isinstance(c[k], arvados.collection.Subcollection),
                        k.upper())):
        if isinstance(c[i], arvados.arvfile.ArvadosFile):
            yield (c, c[i])
        elif isinstance(c[i], arvados.collection.Subcollection):
            for s, f in files_in_collection(c[i]):
                yield (s, f)
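
# write_block_or_manifest() writes either the raw contents of a single Keep
# block (fetched with KeepClient) or the manifest text of a collection,
# optionally stripped of its access tokens, to the given destination.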

def write_block_or_manifest(dest, src, api_client, args):
    if '+A' in src:
        # block locator (signed block locators carry a '+A...' permission hint)
        kc = arvados.keep.KeepClient(api_client=api_client)
        dest.write(kc.get(src, num_retries=args.retries))
    else:
        # collection UUID or portable data hash
        reader = arvados.CollectionReader(
            src, api_client=api_client, num_retries=args.retries)
        dest.write(reader.manifest_text(strip=args.strip_manifest).encode())
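
# For illustration only (not part of this module): arv-get normally reaches
# main() through the installed console script, but a direct invocation could
# look like:
#
#     if __name__ == '__main__':
#         sys.exit(main())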