2 # Copyright (C) The Arvados Authors. All rights reserved.
4 # SPDX-License-Identifier: Apache-2.0
15 import arvados.commands._util as arv_cmd
16 import arvados.util as util
18 from arvados._version import __version__
# Module-level API client, created lazily in main() so importing this module
# has no side effects; tests may inject their own client by assigning it.
api_client = None

logger = logging.getLogger('arvados.arv-get')

# Command-line interface for `arv-get`.  Options fall into three mutually
# exclusive groups: progress reporting, hashing, and overwrite policy.
parser = argparse.ArgumentParser(
    description='Copy data from Keep to a local file or pipe.',
    parents=[arv_cmd.retry_opt])
parser.add_argument('--version', action='version',
                    version="%s %s" % (sys.argv[0], __version__),
                    help='Print version and exit.')
parser.add_argument('locator', type=str,
                    help="""
Collection locator, optionally with a file path or prefix.
""")
parser.add_argument('destination', type=str, nargs='?', default='-',
                    help="""
Local file or directory where the data is to be written. Default: stdout.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--progress', action='store_true',
                   help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when it
is not expected to interfere with the output: specifically, stderr is
a tty _and_ either stdout is not a tty, or output is being written to
named files rather than stdout.
""")
group.add_argument('--no-progress', action='store_true',
                   help="""
Do not display human-readable progress on stderr.
""")
group.add_argument('--batch-progress', action='store_true',
                   help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--hash',
                   help="""
Display the hash of each file as it is read from Keep, using the given
hash algorithm. Supported algorithms include md5, sha1, sha224,
sha256, sha384, and sha512.
""")
group.add_argument('--md5sum', action='store_const',
                   dest='hash', const='md5',
                   help="""
Display the MD5 hash of each file as it is read from Keep.
""")
parser.add_argument('-n', action='store_true',
                    help="""
Do not write any data -- just read from Keep, and report md5sums if
requested.
""")
parser.add_argument('-r', action='store_true',
                    help="""
Retrieve all files in the specified collection/prefix. This is the
default behavior if the "locator" argument ends with a forward slash.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('-f', action='store_true',
                   help="""
Overwrite existing files while writing. The default behavior is to
refuse to write *anything* if any of the output files already
exist. As a special case, -f is not needed to write to stdout.
""")
group.add_argument('--skip-existing', action='store_true',
                   help="""
Skip files that already exist. The default behavior is to refuse to
write *anything* if any files exist that would have to be
overwritten. This option causes even devices, sockets, and fifos to be
skipped.
""")
group.add_argument('--strip-manifest', action='store_true', default=False,
                   help="""
When getting a collection manifest, strip its access tokens before writing
it.
""")
def parse_arguments(arguments, stdout, stderr):
    """Parse and normalize the arv-get command line.

    Returns an argparse Namespace with derived settings applied:
    a trailing-slash locator implies -r, a directory destination gets the
    source file name appended, stdout destinations imply -f, and --progress
    defaults on when it will not pollute the data stream.

    stdout/stderr are passed in (rather than read from sys) so callers and
    tests can substitute their own streams; only isatty() is consulted here.
    """
    args = parser.parse_args(arguments)

    # A trailing slash on the locator means "download everything under this
    # prefix", i.e. implicit -r.  endswith() is used instead of indexing
    # [-1] so an empty string cannot raise IndexError.
    if args.locator.endswith(os.sep):
        args.r = True
    # A recursive download that actually writes data needs a directory to
    # write into.
    if (args.r and
        not args.n and
        not (args.destination and
             os.path.isdir(args.destination))):
        parser.error('Destination is not a directory.')
    # Single-file download into a directory: write <dir>/<basename of locator>.
    if not args.r and (os.path.isdir(args.destination) or
                       args.destination.endswith(os.path.sep)):
        args.destination = os.path.join(args.destination,
                                        os.path.basename(args.locator))
        logger.debug("Appended source file name to destination directory: %s",
                     args.destination)

    if args.destination == '/dev/stdout':
        args.destination = "-"

    if args.destination == '-':
        # Normally you have to use -f to write to a file (or device) that
        # already exists, but "-" and "/dev/stdout" are common enough to
        # merit a special exception.
        args.f = True
    else:
        args.destination = args.destination.rstrip(os.sep)

    # Turn on --progress by default if stderr is a tty and output is
    # either going to a named file, or going (via stdout) to something
    # that isn't a tty.
    if (not (args.batch_progress or args.no_progress)
        and stderr.isatty()
        and (args.destination != '-'
             or not stdout.isatty())):
        args.progress = True
    return args
def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    """Entry point for arv-get.

    Downloads a manifest, a single file, or a whole collection/prefix from
    Keep, depending on the parsed arguments.  Returns 0 on success, 1 on any
    error (suitable for use as a process exit code).
    """
    global api_client

    if stdout is sys.stdout and hasattr(stdout, 'buffer'):
        # in Python 3, write to stdout as binary
        stdout = stdout.buffer

    args = parse_arguments(arguments, stdout, stderr)
    if api_client is None:
        api_client = arvados.api('v1')

    # Split the locator into the collection part and an optional
    # /file-or-prefix part.
    r = re.search(r'^(.*?)(/.*)?$', args.locator)
    col_loc = r.group(1)
    get_prefix = r.group(2)
    if args.r and not get_prefix:
        get_prefix = os.sep

    # User asked to download the collection's manifest
    if not get_prefix:
        if not args.n:
            open_flags = os.O_CREAT | os.O_WRONLY
            if not args.f:
                # Without -f, refuse to clobber an existing file.
                open_flags |= os.O_EXCL
            try:
                if args.destination == "-":
                    write_block_or_manifest(dest=stdout, src=col_loc,
                                            api_client=api_client, args=args)
                else:
                    out_fd = os.open(args.destination, open_flags)
                    with os.fdopen(out_fd, 'wb') as out_file:
                        write_block_or_manifest(dest=out_file,
                                                src=col_loc, api_client=api_client,
                                                args=args)
            except (IOError, OSError) as error:
                logger.error("can't write to '{}': {}".format(args.destination, error))
                return 1
            except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error:
                logger.error("failed to download '{}': {}".format(col_loc, error))
                return 1
            except arvados.errors.ArgumentError as error:
                if 'Argument to CollectionReader' in str(error):
                    logger.error("error reading collection: {}".format(error))
                    return 1
                else:
                    raise
        return 0

    try:
        reader = arvados.CollectionReader(col_loc, num_retries=args.retries)
    except Exception as error:
        logger.error("failed to read collection: {}".format(error))
        return 1

    # Scan the collection. Make an array of (stream, file, local
    # destination filename) tuples, and add up total size to extract.
    todo = []
    todo_bytes = 0
    try:
        if get_prefix == os.sep:
            item = reader
        else:
            item = reader.find('.' + get_prefix)

        if isinstance(item, arvados.collection.Subcollection) or isinstance(item, arvados.collection.CollectionReader):
            # If the user asked for a file and we got a subcollection, error out.
            if get_prefix[-1] != os.sep:
                logger.error("requested file '{}' is in fact a subcollection. Append a trailing '/' to download it.".format('.' + get_prefix))
                return 1
            # If the user asked stdout as a destination, error out.
            elif args.destination == '-':
                logger.error("cannot use 'stdout' as destination when downloading multiple files.")
                return 1
            # User asked for a subcollection, and that's what was found. Add up total size
            # to download.
            for s, f in files_in_collection(item):
                dest_path = os.path.join(
                    args.destination,
                    os.path.join(s.stream_name(), f.name)[len(get_prefix)+1:])
                if (not (args.n or args.f or args.skip_existing) and
                    os.path.exists(dest_path)):
                    logger.error('Local file %s already exists.' % (dest_path,))
                    return 1
                todo += [(s, f, dest_path)]
                todo_bytes += f.size()
        elif isinstance(item, arvados.arvfile.ArvadosFile):
            todo += [(item.parent, item, args.destination)]
            todo_bytes += item.size()
        else:
            logger.error("'{}' not found.".format('.' + get_prefix))
            return 1
    except (IOError, arvados.errors.NotFoundError) as e:
        logger.error(e)
        return 1

    out_bytes = 0
    for s, f, outfilename in todo:
        outfile = None
        digestor = None
        if not args.n:
            if outfilename == "-":
                outfile = stdout
            else:
                if args.skip_existing and os.path.exists(outfilename):
                    logger.debug('Local file %s exists. Skipping.', outfilename)
                    continue
                elif not args.f and (os.path.isfile(outfilename) or
                                     os.path.isdir(outfilename)):
                    # Good thing we looked again: apparently this file wasn't
                    # here yet when we checked earlier.
                    logger.error('Local file %s already exists.' % (outfilename,))
                    return 1
                if args.r:
                    arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
                try:
                    outfile = open(outfilename, 'wb')
                except Exception as error:
                    logger.error('Open(%s) failed: %s' % (outfilename, error))
                    return 1
        if args.hash:
            digestor = hashlib.new(args.hash)
        try:
            with s.open(f.name, 'rb') as file_reader:
                for data in file_reader.readall():
                    if outfile:
                        outfile.write(data)
                    if digestor:
                        digestor.update(data)
                    out_bytes += len(data)
                    if args.progress:
                        stderr.write('\r%d MiB / %d MiB %.1f%%' %
                                     (out_bytes >> 20,
                                      todo_bytes >> 20,
                                      (100
                                       if todo_bytes==0
                                       else 100.0*out_bytes/todo_bytes)))
                    elif args.batch_progress:
                        stderr.write('%s %d read %d total\n' %
                                     (sys.argv[0], os.getpid(),
                                      out_bytes, todo_bytes))
            if digestor:
                stderr.write("%s %s/%s\n"
                             % (digestor.hexdigest(), s.stream_name(), f.name))
        except KeyboardInterrupt:
            # Remove a partially-written regular file (fileno > 2 excludes
            # std streams) before bailing out of the download loop.
            if outfile and (outfile.fileno() > 2) and not outfile.closed:
                os.unlink(outfile.name)
            break
        finally:
            if outfile != None and outfile != stdout:
                outfile.close()

    if args.progress:
        stderr.write('\n')
    return 0
def files_in_collection(c):
    """Recursively yield (parent collection, ArvadosFile) pairs under c.

    At each level, plain files are yielded before subcollections (the sort
    key orders on "is a Subcollection" first), and entries are ordered
    case-insensitively by name.
    """
    # Sort first by file type, then alphabetically by file path.
    for i in sorted(list(c.keys()),
                    key=lambda k: (
                        isinstance(c[k], arvados.collection.Subcollection),
                        k.upper())):
        if isinstance(c[i], arvados.arvfile.ArvadosFile):
            yield (c, c[i])
        elif isinstance(c[i], arvados.collection.Subcollection):
            for s, f in files_in_collection(c[i]):
                yield (s, f)
def write_block_or_manifest(dest, src, api_client, args):
    """Write the data identified by src to the binary stream dest.

    If src looks like a Keep block locator, fetch and write the raw block;
    otherwise treat src as a collection UUID or portable data hash and write
    its manifest text (stripped of access tokens when --strip-manifest was
    given).  args supplies retries and strip_manifest.
    """
    if util.keep_locator_pattern.match(src):
        # block locator
        kc = arvados.keep.KeepClient(api_client=api_client)
        dest.write(kc.get(src, num_retries=args.retries))
    else:
        # collection UUID or portable data hash
        reader = arvados.CollectionReader(src, num_retries=args.retries)
        dest.write(reader.manifest_text(strip=args.strip_manifest).encode())