2 # Copyright (C) The Arvados Authors. All rights reserved.
4 # SPDX-License-Identifier: Apache-2.0
15 import arvados.commands._util as arv_cmd
16 import arvados.util as util
18 from arvados._version import __version__
21 logger = logging.getLogger('arvados.arv-get')
# NOTE(review): this chunk is an elided extract — the opening `help="""` line
# of each option below (and the closing `""")`) is missing from view, so the
# bare prose lines are the interiors of triple-quoted help strings. Do not
# treat them as stray statements.
# Module-level command-line parser for arv-get: copy data from Keep to a
# local file/directory or to stdout. Inherits the shared --retries option
# from arv_cmd.retry_opt.
23 parser = argparse.ArgumentParser(
24 description='Copy data from Keep to a local file or pipe.',
25 parents=[arv_cmd.retry_opt])
26 parser.add_argument('--version', action='version',
27 version="%s %s" % (sys.argv[0], __version__),
28 help='Print version and exit.')
# Positional arguments: what to fetch, and where to put it (default stdout).
29 parser.add_argument('locator', type=str,
31 Collection locator, optionally with a file path or prefix.
33 parser.add_argument('destination', type=str, nargs='?', default='-',
35 Local file or directory where the data is to be written. Default: stdout.
# Progress reporting: --progress / --no-progress / --batch-progress are
# mutually exclusive; the effective default is computed in parse_arguments().
37 group = parser.add_mutually_exclusive_group()
38 group.add_argument('--progress', action='store_true',
40 Display human-readable progress on stderr (bytes and, if possible,
41 percentage of total data size). This is the default behavior when it
42 is not expected to interfere with the output: specifically, stderr is
43 a tty _and_ either stdout is not a tty, or output is being written to
44 named files rather than stdout.
46 group.add_argument('--no-progress', action='store_true',
48 Do not display human-readable progress on stderr.
50 group.add_argument('--batch-progress', action='store_true',
52 Display machine-readable progress on stderr (bytes and, if known,
# Hash reporting: --hash takes an algorithm name; --md5sum is shorthand
# that stores 'md5' into the same args.hash destination.
55 group = parser.add_mutually_exclusive_group()
56 group.add_argument('--hash',
58 Display the hash of each file as it is read from Keep, using the given
59 hash algorithm. Supported algorithms include md5, sha1, sha224,
60 sha256, sha384, and sha512.
62 group.add_argument('--md5sum', action='store_const',
63 dest='hash', const='md5',
65 Display the MD5 hash of each file as it is read from Keep.
# -n: dry run (read and hash only); -r: recursive retrieval.
67 parser.add_argument('-n', action='store_true',
69 Do not write any data -- just read from Keep, and report md5sums if
72 parser.add_argument('-r', action='store_true',
74 Retrieve all files in the specified collection/prefix. This is the
75 default behavior if the "locator" argument ends with a forward slash.
# Existing-file policy: -f (overwrite) vs --skip-existing are mutually
# exclusive; --strip-manifest shares the group (NOTE(review): grouping with
# the overwrite options looks intentional but the rationale is not visible
# in this extract).
77 group = parser.add_mutually_exclusive_group()
78 group.add_argument('-f', action='store_true',
80 Overwrite existing files while writing. The default behavior is to
81 refuse to write *anything* if any of the output files already
82 exist. As a special case, -f is not needed to write to stdout.
84 group.add_argument('--skip-existing', action='store_true',
86 Skip files that already exist. The default behavior is to refuse to
87 write *anything* if any files exist that would have to be
88 overwritten. This option causes even devices, sockets, and fifos to be
91 group.add_argument('--strip-manifest', action='store_true', default=False,
93 When getting a collection manifest, strip its access tokens before writing
# Parse and normalize command-line arguments.
# NOTE(review): several original lines are elided from this extract (e.g.
# the condition opened before line 104, the statements guarded by the
# branches at lines 100/117, and the final assignment/return after line
# 131), so comments below only describe what the visible lines establish.
97 def parse_arguments(arguments, stdout, stderr):
98 args = parser.parse_args(arguments)
# A locator ending in os.sep implies directory semantics; the elided lines
# 101-103 presumably set args.r and begin the directory check — TODO confirm.
100 if args.locator[-1] == os.sep:
104 not (args.destination and
105 os.path.isdir(args.destination))):
106 parser.error('Destination is not a directory.')
# Non-recursive get into an existing directory (or a destination written
# with a trailing separator): append the source's basename so the file
# lands inside the directory rather than replacing it.
107 if not args.r and (os.path.isdir(args.destination) or
108 args.destination[-1] == os.path.sep):
109 args.destination = os.path.join(args.destination,
110 os.path.basename(args.locator))
111 logger.debug("Appended source file name to destination directory: %s",
# Canonicalize /dev/stdout to the internal "-" sentinel for stdout.
114 if args.destination == '/dev/stdout':
115 args.destination = "-"
117 if args.destination == '-':
118 # Normally you have to use -f to write to a file (or device) that
119 # already exists, but "-" and "/dev/stdout" are common enough to
120 # merit a special exception.
# (elided line 121-122: presumably sets args.f for the stdout case — TODO confirm)
123 args.destination = args.destination.rstrip(os.sep)
125 # Turn on --progress by default if stderr is a tty and output is
126 # either going to a named file, or going (via stdout) to something
128 if (not (args.batch_progress or args.no_progress)
130 and (args.destination != '-'
131 or not stdout.isatty())):
# Entry point: fetch a collection (or a file/prefix within it) from Keep and
# write it to the destination, optionally hashing and reporting progress.
# NOTE(review): this extract elides many original lines (the `try:` headers
# matching the visible `except` clauses, the `todo`/`todo_bytes`/`out_bytes`
# initializations, return/abort statements, and the actual data `write`
# call inside the download loop). Comments describe only visible behavior.
135 def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
138 if stdout is sys.stdout and hasattr(stdout, 'buffer'):
139 # in Python 3, write to stdout as binary
140 stdout = stdout.buffer
142 args = parse_arguments(arguments, stdout, stderr)
# NOTE(review): api_client is referenced before any visible assignment —
# presumably a parameter or earlier assignment elided from this extract.
143 if api_client is None:
144 api_client = arvados.api('v1')
# Split the locator into the collection identifier (group 1) and an
# optional /path/prefix within it (group 2); the pattern always matches.
146 r = re.search(r'^(.*?)(/.*)?$', args.locator)
148 get_prefix = r.group(2)
149 if args.r and not get_prefix:
# Open the collection; any failure reading it is reported and (per the
# elided lines after 154) presumably aborts — TODO confirm.
152 reader = arvados.CollectionReader(col_loc, num_retries=args.retries)
153 except Exception as error:
154 logger.error("failed to read collection: {}".format(error))
157 # User asked to download the collection's manifest
# O_EXCL is OR'd in (when the elided condition at line 161 holds —
# presumably "not args.f") so an existing file makes os.open fail
# instead of being clobbered.
160 open_flags = os.O_CREAT | os.O_WRONLY
162 open_flags |= os.O_EXCL
164 if args.destination == "-":
165 stdout.write(reader.manifest_text(strip=args.strip_manifest).encode())
167 out_fd = os.open(args.destination, open_flags)
168 with os.fdopen(out_fd, 'wb') as out_file:
169 out_file.write(reader.manifest_text(strip=args.strip_manifest).encode())
170 except (IOError, OSError) as error:
171 logger.error("can't write to '{}': {}".format(args.destination, error))
173 except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error:
174 logger.error("failed to download '{}': {}".format(col_loc, error))
178 # Scan the collection. Make an array of (stream, file, local
179 # destination filename) tuples, and add up total size to extract.
# A bare "/" prefix means the whole collection; otherwise look the path
# up relative to the collection root ('.' + prefix).
183 if get_prefix == os.sep:
186 item = reader.find('.' + get_prefix)
188 if isinstance(item, arvados.collection.Subcollection) or isinstance(item, arvados.collection.CollectionReader):
189 # If the user asked for a file and we got a subcollection, error out.
190 if get_prefix[-1] != os.sep:
191 logger.error("requested file '{}' is in fact a subcollection. Append a trailing '/' to download it.".format('.' + get_prefix))
193 # If the user asked stdout as a destination, error out.
194 elif args.destination == '-':
195 logger.error("cannot use 'stdout' as destination when downloading multiple files.")
197 # User asked for a subcollection, and that's what was found. Add up total size
199 for s, f in files_in_collection(item):
# Destination path = destination dir + the file's path inside the
# collection with the requested prefix stripped off.
200 dest_path = os.path.join(
202 os.path.join(s.stream_name(), f.name)[len(get_prefix)+1:])
# Pre-flight collision check (unless dry-run, -f, or --skip-existing).
203 if (not (args.n or args.f or args.skip_existing) and
204 os.path.exists(dest_path)):
205 logger.error('Local file %s already exists.' % (dest_path,))
207 todo += [(s, f, dest_path)]
208 todo_bytes += f.size()
209 elif isinstance(item, arvados.arvfile.ArvadosFile):
# Single file requested: one work item, written to args.destination.
210 todo += [(item.parent, item, args.destination)]
211 todo_bytes += item.size()
213 logger.error("'{}' not found.".format('.' + get_prefix))
215 except (IOError, arvados.errors.NotFoundError) as e:
# Download loop: copy each planned (stream, file, destination) item.
220 for s, f, outfilename in todo:
224 if outfilename == "-":
# Re-check existence at write time: another process (or an earlier
# iteration) may have created the file since the pre-flight scan.
227 if args.skip_existing and os.path.exists(outfilename):
228 logger.debug('Local file %s exists. Skipping.', outfilename)
230 elif not args.f and (os.path.isfile(outfilename) or
231 os.path.isdir(outfilename)):
232 # Good thing we looked again: apparently this file wasn't
233 # here yet when we checked earlier.
234 logger.error('Local file %s already exists.' % (outfilename,))
237 arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
239 outfile = open(outfilename, 'wb')
240 except Exception as error:
241 logger.error('Open(%s) failed: %s' % (outfilename, error))
# Optionally hash each chunk while copying; progress is reported per
# chunk in whichever style parse_arguments() selected.
244 digestor = hashlib.new(args.hash)
246 with s.open(f.name, 'rb') as file_reader:
247 for data in file_reader.readall():
251 digestor.update(data)
252 out_bytes += len(data)
253 if args.progress:
254 stderr.write('\r%d MiB / %d MiB %.1f%%' %
259 else 100.0*out_bytes/todo_bytes)))
260 elif args.batch_progress:
261 stderr.write('%s %d read %d total\n' %
262 (sys.argv[0], os.getpid(),
263 out_bytes, todo_bytes))
265 stderr.write("%s %s/%s\n"
266 % (digestor.hexdigest(), s.stream_name(), f.name))
267 except KeyboardInterrupt:
# On interrupt, remove the partially-written file — but never unlink
# when writing to stdout/stderr-level descriptors (fileno <= 2).
268 if outfile and (outfile.fileno() > 2) and not outfile.closed:
269 os.unlink(outfile.name)
272 if outfile != None and outfile != stdout:
279 def files_in_collection(c):
280 # Sort first by file type, then alphabetically by file path.
281 for i in sorted(list(c.keys()),
283 isinstance(c[k], arvados.collection.Subcollection),
285 if isinstance(c[i], arvados.arvfile.ArvadosFile):
287 elif isinstance(c[i], arvados.collection.Subcollection):
288 for s, f in files_in_collection(c[i]):