12 import arvados.commands._util as arv_cmd
14 from arvados._version import __version__
# Module-level logger for this command; handlers/levels are configured by the
# surrounding arvados command-line machinery.
logger = logging.getLogger('arvados.arv-get')

parser = argparse.ArgumentParser(
    description='Copy data from Keep to a local file or pipe.',
    parents=[arv_cmd.retry_opt])
parser.add_argument('--version', action='version',
                    version="%s %s" % (sys.argv[0], __version__),
                    help='Print version and exit.')
parser.add_argument('locator', type=str,
                    help="""
Collection locator, optionally with a file path or prefix.
""")
parser.add_argument('destination', type=str, nargs='?', default='-',
                    help="""
Local file or directory where the data is to be written. Default: stdout.
""")

# The three progress-reporting modes are mutually exclusive.
group = parser.add_mutually_exclusive_group()
group.add_argument('--progress', action='store_true',
                   help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when it
is not expected to interfere with the output: specifically, stderr is
a tty _and_ either stdout is not a tty, or output is being written to
named files rather than stdout.
""")
group.add_argument('--no-progress', action='store_true',
                   help="""
Do not display human-readable progress on stderr.
""")
group.add_argument('--batch-progress', action='store_true',
                   help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")

# --hash takes an algorithm name; --md5sum is shorthand that stores 'md5'
# into the same dest, so the two options are mutually exclusive.
group = parser.add_mutually_exclusive_group()
group.add_argument('--hash',
                   help="""
Display the hash of each file as it is read from Keep, using the given
hash algorithm. Supported algorithms include md5, sha1, sha224,
sha256, sha384, and sha512.
""")
group.add_argument('--md5sum', action='store_const',
                   dest='hash', const='md5',
                   help="""
Display the MD5 hash of each file as it is read from Keep.
""")
parser.add_argument('-n', action='store_true',
                    help="""
Do not write any data -- just read from Keep, and report md5sums if
requested.
""")
parser.add_argument('-r', action='store_true',
                    help="""
Retrieve all files in the specified collection/prefix. This is the
default behavior if the "locator" argument ends with a forward slash.
""")

# Overwrite policy: -f (clobber) and --skip-existing are mutually exclusive.
group = parser.add_mutually_exclusive_group()
group.add_argument('-f', action='store_true',
                   help="""
Overwrite existing files while writing. The default behavior is to
refuse to write *anything* if any of the output files already
exist. As a special case, -f is not needed to write to stdout.
""")
group.add_argument('--skip-existing', action='store_true',
                   help="""
Skip files that already exist. The default behavior is to refuse to
write *anything* if any files exist that would have to be
overwritten. This option causes even devices, sockets, and fifos to be
skipped.
""")
def parse_arguments(arguments, stdout, stderr):
    """Parse and post-process arv-get command-line arguments.

    Beyond plain argparse parsing, this normalizes the namespace:
    a trailing slash on the locator implies -r; a directory destination
    gets the source file name appended; '/dev/stdout' is folded into '-';
    writing to stdout implies -f; and --progress is switched on by default
    when it will not interfere with the output.

    Returns the processed argparse Namespace; calls parser.error() (which
    exits) if -r is given without a directory destination.
    """
    args = parser.parse_args(arguments)

    # A trailing separator on the locator means "fetch everything under
    # this prefix", i.e. recursive mode.
    if args.locator[-1] == os.sep:
        args.r = True
    if (args.r and
        not args.n and
        not (args.destination and
             os.path.isdir(args.destination))):
        parser.error('Destination is not a directory.')
    if not args.r and (os.path.isdir(args.destination) or
                       args.destination[-1] == os.path.sep):
        # Single-file fetch into a directory: write <dir>/<basename>.
        args.destination = os.path.join(args.destination,
                                        os.path.basename(args.locator))
        logger.debug("Appended source file name to destination directory: %s",
                     args.destination)

    if args.destination == '/dev/stdout':
        args.destination = "-"

    if args.destination == '-':
        # Normally you have to use -f to write to a file (or device) that
        # already exists, but "-" and "/dev/stdout" are common enough to
        # merit a special exception.
        args.f = True
    else:
        args.destination = args.destination.rstrip(os.sep)

    # Turn on --progress by default if stderr is a tty and output is
    # either going to a named file, or going (via stdout) to something
    # that isn't a tty.
    if (not (args.batch_progress or args.no_progress)
        and stderr.isatty()
        and (args.destination != '-'
             or not stdout.isatty())):
        args.progress = True
    return args
def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    """Entry point for arv-get: copy data from Keep to local file(s) or a pipe.

    Returns 0 on success and 1 on any error, suitable for use as the
    process exit status.
    """
    global api_client

    args = parse_arguments(arguments, stdout, stderr)
    if api_client is None:
        api_client = arvados.api('v1')

    # Split "locator[/path]" into the collection locator and the optional
    # file path or prefix inside that collection.
    r = re.search(r'^(.*?)(/.*)?$', args.locator)
    collection = r.group(1)
    get_prefix = r.group(2)
    if args.r and not get_prefix:
        get_prefix = os.sep
    try:
        reader = arvados.CollectionReader(collection, num_retries=args.retries)
    except Exception as error:
        logger.error("failed to read collection: {}".format(error))
        return 1

    # User asked to download the collection's manifest
    if not get_prefix:
        if not args.n:
            open_flags = os.O_CREAT | os.O_WRONLY
            if not args.f:
                # Without -f, refuse to clobber an existing destination file.
                open_flags |= os.O_EXCL
            try:
                if args.destination == "-":
                    stdout.write(reader.manifest_text())
                else:
                    # NOTE(review): manifest_text() presumably returns str;
                    # writing it to a 'wb' stream assumes a Python-2-style
                    # SDK — confirm against the SDK version in use.
                    out_fd = os.open(args.destination, open_flags)
                    with os.fdopen(out_fd, 'wb') as out_file:
                        out_file.write(reader.manifest_text())
            except (IOError, OSError) as error:
                logger.error("can't write to '{}': {}".format(args.destination, error))
                return 1
            except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error:
                logger.error("failed to download '{}': {}".format(collection, error))
                return 1
        return 0

    # Scan the collection. Make an array of (stream, file, local
    # destination filename) tuples, and add up total size to extract.
    todo = []
    todo_bytes = 0
    try:
        if get_prefix == os.sep:
            item = reader
        else:
            item = reader.find('.' + get_prefix)

        if isinstance(item, arvados.collection.Subcollection) or isinstance(item, arvados.collection.CollectionReader):
            # If the user asked for a file and we got a subcollection, error out.
            if get_prefix[-1] != os.sep:
                logger.error("requested file '{}' is in fact a subcollection. Append a trailing '/' to download it.".format('.' + get_prefix))
                return 1
            # If the user asked stdout as a destination, error out.
            elif args.destination == '-':
                logger.error("cannot use 'stdout' as destination when downloading multiple files.")
                return 1
            # User asked for a subcollection, and that's what was found. Add up total size
            # to download.
            for s, f in files_in_collection(item):
                dest_path = os.path.join(
                    args.destination,
                    os.path.join(s.stream_name(), f.name)[len(get_prefix)+1:])
                if (not (args.n or args.f or args.skip_existing) and
                    os.path.exists(dest_path)):
                    logger.error('Local file %s already exists.' % (dest_path,))
                    return 1
                todo += [(s, f, dest_path)]
                todo_bytes += f.size()
        elif isinstance(item, arvados.arvfile.ArvadosFile):
            todo += [(item.parent, item, args.destination)]
            todo_bytes += item.size()
        else:
            logger.error("'{}' not found.".format('.' + get_prefix))
            return 1
    except (IOError, arvados.errors.NotFoundError) as e:
        logger.error(e)
        return 1

    out_bytes = 0
    for s, f, outfilename in todo:
        outfile = None
        digestor = None
        if not args.n:
            if outfilename == "-":
                outfile = stdout
            else:
                if args.skip_existing and os.path.exists(outfilename):
                    logger.debug('Local file %s exists. Skipping.', outfilename)
                    continue
                elif not args.f and (os.path.isfile(outfilename) or
                                     os.path.isdir(outfilename)):
                    # Good thing we looked again: apparently this file wasn't
                    # here yet when we checked earlier.
                    logger.error('Local file %s already exists.' % (outfilename,))
                    return 1
                if args.r:
                    # Recursive mode: create intermediate directories as needed.
                    arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
                try:
                    outfile = open(outfilename, 'wb')
                except Exception as error:
                    logger.error('Open(%s) failed: %s' % (outfilename, error))
                    return 1
        if args.hash:
            digestor = hashlib.new(args.hash)
        try:
            with s.open(f.name, 'r') as file_reader:
                for data in file_reader.readall():
                    if outfile:
                        outfile.write(data)
                    if digestor:
                        digestor.update(data)
                    out_bytes += len(data)
                    if args.progress:
                        # Human-readable progress: overwrite the same line.
                        stderr.write('\r%d MiB / %d MiB %.1f%%' %
                                     (out_bytes >> 20,
                                      todo_bytes >> 20,
                                      (0.0 if todo_bytes == 0
                                       else 100.0*out_bytes/todo_bytes)))
                    elif args.batch_progress:
                        stderr.write('%s %d read %d total\n' %
                                     (sys.argv[0], os.getpid(),
                                      out_bytes, todo_bytes))
            if digestor:
                stderr.write("%s %s/%s\n"
                             % (digestor.hexdigest(), s.stream_name(), f.name))
        except KeyboardInterrupt:
            # Remove a partially-written named file (fileno > 2 excludes
            # stdin/stdout/stderr) before bailing out of the loop.
            if outfile and (outfile.fileno() > 2) and not outfile.closed:
                os.unlink(outfile.name)
            break
        finally:
            if outfile != None and outfile != stdout:
                outfile.close()

    if args.progress:
        # Terminate the \r-style progress line.
        stderr.write('\n')
    return 0
266 def files_in_collection(c):
267 # Sort first by file type, then alphabetically by file path.
268 for i in sorted(c.keys(),
270 isinstance(c[k], arvados.collection.Subcollection),
272 if isinstance(c[i], arvados.arvfile.ArvadosFile):
274 elif isinstance(c[i], arvados.collection.Subcollection):
275 for s, f in files_in_collection(c[i]):