import argparse
import hashlib
import logging
import os
import re
import sys

import arvados
import arvados.commands._util as arv_cmd
import arvados.util as util

from arvados._version import __version__

api_client = None
logger = logging.getLogger('arvados.arv-get')

parser = argparse.ArgumentParser(
    description='Copy data from Keep to a local file or pipe.',
    parents=[arv_cmd.retry_opt])
parser.add_argument('--version', action='version',
                    version="%s %s" % (sys.argv[0], __version__),
                    help='Print version and exit.')
parser.add_argument('locator', type=str,
                    help="""
Collection locator, optionally with a file path or prefix.
""")
parser.add_argument('destination', type=str, nargs='?', default='-',
                    help="""
Local file or directory where the data is to be written. Default: stdout.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--progress', action='store_true',
                   help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when it
is not expected to interfere with the output: specifically, stderr is
a tty _and_ either stdout is not a tty, or output is being written to
named files rather than stdout.
""")
group.add_argument('--no-progress', action='store_true',
                   help="""
Do not display human-readable progress on stderr.
""")
group.add_argument('--batch-progress', action='store_true',
                   help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--hash',
                   help="""
Display the hash of each file as it is read from Keep, using the given
hash algorithm. Supported algorithms include md5, sha1, sha224,
sha256, sha384, and sha512.
""")
group.add_argument('--md5sum', action='store_const',
                   dest='hash', const='md5',
                   help="""
Display the MD5 hash of each file as it is read from Keep.
""")
parser.add_argument('-n', action='store_true',
                    help="""
Do not write any data -- just read from Keep, and report md5sums if
requested.
""")
parser.add_argument('-r', action='store_true',
                    help="""
Retrieve all files in the specified collection/prefix. This is the
default behavior if the "locator" argument ends with a forward slash.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('-f', action='store_true',
                   help="""
Overwrite existing files while writing. The default behavior is to
refuse to write *anything* if any of the output files already
exist. As a special case, -f is not needed to write to stdout.
""")
group.add_argument('--skip-existing', action='store_true',
                   help="""
Skip files that already exist. The default behavior is to refuse to
write *anything* if any files exist that would have to be
overwritten. This option causes even devices, sockets, and fifos to be
skipped.
""")
group.add_argument('--strip-manifest', action='store_true', default=False,
                   help="""
When getting a collection manifest, strip its access tokens before writing
it.
""")
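# Illustrative invocations (the locator below is a placeholder in the
# standard collection-UUID shape, not a real object):
#
#   arv-get zzzzz-4zz18-xxxxxxxxxxxxxxx                # manifest to stdout
#   arv-get zzzzz-4zz18-xxxxxxxxxxxxxxx/foo.txt .      # one file into cwd
#   arv-get -r zzzzz-4zz18-xxxxxxxxxxxxxxx/ ./outdir/  # whole collection
#                                                      # (outdir must exist)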

def parse_arguments(arguments, stdout, stderr):
    args = parser.parse_args(arguments)

    if args.locator[-1] == os.sep:
        args.r = True
    if (args.r and
        not args.n and
        not (args.destination and
             os.path.isdir(args.destination))):
        parser.error('Destination is not a directory.')
    if not args.r and (os.path.isdir(args.destination) or
                       args.destination[-1] == os.path.sep):
        args.destination = os.path.join(args.destination,
                                        os.path.basename(args.locator))
        logger.debug("Appended source file name to destination directory: %s",
                     args.destination)

    if args.destination == '/dev/stdout':
        args.destination = "-"

    if args.destination == '-':
        # Normally you have to use -f to write to a file (or device) that
        # already exists, but "-" and "/dev/stdout" are common enough to
        # merit a special exception.
        args.f = True
    else:
        args.destination = args.destination.rstrip(os.sep)

    # Turn on --progress by default if stderr is a tty and output is
    # either going to a named file, or going (via stdout) to something
    # that isn't a tty.
    if (not (args.batch_progress or args.no_progress)
        and stderr.isatty()
        and (args.destination != '-'
             or not stdout.isatty())):
        args.progress = True
    return args
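# For example (hypothetical shell session): "arv-get <locator> out.txt"
# run from an interactive terminal leaves stderr a tty and writes to a
# named file, so parse_arguments() enables --progress automatically.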

def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    global api_client

    if stdout is sys.stdout and hasattr(stdout, 'buffer'):
        # in Python 3, write to stdout as binary
        stdout = stdout.buffer

    args = parse_arguments(arguments, stdout, stderr)
    if api_client is None:
        api_client = arvados.api('v1')
    r = re.search(r'^(.*?)(/.*)?$', args.locator)
    col_loc = r.group(1)
    get_prefix = r.group(2)
    if args.r and not get_prefix:
        get_prefix = os.sep
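    # How the split above behaves (hypothetical locators, shown for
    # illustration only):
    #   "abc+123"           -> col_loc="abc+123", get_prefix=None
    #   "abc+123/dir/file"  -> col_loc="abc+123", get_prefix="/dir/file"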
    try:
        reader = arvados.CollectionReader(col_loc, num_retries=args.retries)
    except Exception as error:
        logger.error("failed to read collection: {}".format(error))
        return 1
    # User asked to download the collection's manifest
    if not get_prefix:
        if not args.n:
            open_flags = os.O_CREAT | os.O_WRONLY
            if not args.f:
                open_flags |= os.O_EXCL
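            # Without -f, O_EXCL makes os.open() below fail with EEXIST
            # instead of overwriting a file that already exists.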
            try:
                if args.destination == "-":
                    stdout.write(reader.manifest_text(strip=args.strip_manifest).encode())
                else:
                    out_fd = os.open(args.destination, open_flags)
                    with os.fdopen(out_fd, 'wb') as out_file:
                        out_file.write(reader.manifest_text(strip=args.strip_manifest).encode())
            except (IOError, OSError) as error:
                logger.error("can't write to '{}': {}".format(args.destination, error))
                return 1
            except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error:
                logger.error("failed to download '{}': {}".format(col_loc, error))
                return 1
        return 0
    # Scan the collection. Make an array of (stream, file, local
    # destination filename) tuples, and add up total size to extract.
    todo = []
    todo_bytes = 0
    try:
        if get_prefix == os.sep:
            item = reader
        else:
            item = reader.find('.' + get_prefix)

        if isinstance(item, arvados.collection.Subcollection) or isinstance(item, arvados.collection.CollectionReader):
            # If the user asked for a file and we got a subcollection, error out.
            if get_prefix[-1] != os.sep:
                logger.error("requested file '{}' is in fact a subcollection. Append a trailing '/' to download it.".format('.' + get_prefix))
                return 1
            # If the user asked for stdout as a destination, error out.
            elif args.destination == '-':
                logger.error("cannot use 'stdout' as destination when downloading multiple files.")
                return 1
            # User asked for a subcollection, and that's what was found. Add up total size
            # to download.
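            # Note on the slice below: '.' + get_prefix is exactly the
            # first len(get_prefix)+1 characters of each joined path, so
            # stripping them leaves the path relative to the requested
            # subcollection and the local tree mirrors the remote one.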
            for s, f in files_in_collection(item):
                dest_path = os.path.join(
                    args.destination,
                    os.path.join(s.stream_name(), f.name)[len(get_prefix)+1:])
                if (not (args.n or args.f or args.skip_existing) and
                    os.path.exists(dest_path)):
                    logger.error('Local file %s already exists.' % (dest_path,))
                    return 1
                todo += [(s, f, dest_path)]
                todo_bytes += f.size()
        elif isinstance(item, arvados.arvfile.ArvadosFile):
            todo += [(item.parent, item, args.destination)]
            todo_bytes += item.size()
        else:
            logger.error("'{}' not found.".format('.' + get_prefix))
            return 1
    except (IOError, arvados.errors.NotFoundError) as e:
        logger.error(e)
        return 1
    out_bytes = 0
    for s, f, outfilename in todo:
        outfile = None
        digestor = None
        if not args.n:
            if outfilename == "-":
                outfile = stdout
            else:
                if args.skip_existing and os.path.exists(outfilename):
                    logger.debug('Local file %s exists. Skipping.', outfilename)
                    continue
                elif not args.f and (os.path.isfile(outfilename) or
                                     os.path.isdir(outfilename)):
                    # Good thing we looked again: apparently this file wasn't
                    # here yet when we checked earlier.
                    logger.error('Local file %s already exists.' % (outfilename,))
                    return 1
                if args.r:
                    arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
                try:
                    outfile = open(outfilename, 'wb')
                except Exception as error:
                    logger.error('Open(%s) failed: %s' % (outfilename, error))
                    return 1
        if args.hash:
            digestor = hashlib.new(args.hash)
        try:
            with s.open(f.name, 'rb') as file_reader:
                for data in file_reader.readall():
                    if outfile:
                        outfile.write(data)
                    if digestor:
                        digestor.update(data)
                    out_bytes += len(data)
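                    # The progress line below redraws itself in place via
                    # '\r'; ">> 20" converts byte counts to whole MiB.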
                    if args.progress:
                        stderr.write('\r%d MiB / %d MiB %.1f%%' %
                                     (out_bytes >> 20,
                                      todo_bytes >> 20,
                                      (100.0 if todo_bytes == 0
                                       else 100.0*out_bytes/todo_bytes)))
                    elif args.batch_progress:
                        stderr.write('%s %d read %d total\n' %
                                     (sys.argv[0], os.getpid(),
                                      out_bytes, todo_bytes))
            if digestor:
                stderr.write("%s %s/%s\n"
                             % (digestor.hexdigest(), s.stream_name(), f.name))
        except KeyboardInterrupt:
            if outfile and (outfile.fileno() > 2) and not outfile.closed:
                os.unlink(outfile.name)
            break
        finally:
            if outfile is not None and outfile != stdout:
                outfile.close()

    if args.progress:
        stderr.write('\n')
    return 0

def files_in_collection(c):
    # Sort first by file type, then alphabetically by file path.
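    # (In the sort key below, False orders before True, so plain files
    # come ahead of subcollections; upper() makes the name comparison
    # case-insensitive.)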
    for i in sorted(list(c.keys()),
                    key=lambda k: (
                        isinstance(c[k], arvados.collection.Subcollection),
                        k.upper())):
        if isinstance(c[i], arvados.arvfile.ArvadosFile):
            yield (c, c[i])
        elif isinstance(c[i], arvados.collection.Subcollection):
            for s, f in files_in_collection(c[i]):
                yield (s, f)
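
# A minimal usage sketch for files_in_collection() (COLLECTION_UUID is
# a placeholder; assumes the usual ARVADOS_API_* environment is set):
#
#   reader = arvados.CollectionReader(COLLECTION_UUID)
#   for stream, f in files_in_collection(reader):
#       print(os.path.join(stream.stream_name(), f.name), f.size())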