12 import arvados.commands._util as arv_cmd
14 from arvados._version import __version__
def abort(msg, code=1):
    """Print an arv-get error message to stderr and exit.

    :msg: text (or exception) describing the fatal error.
    :code: process exit status; defaults to 1 (generic failure).
    """
    # sys.stderr.write works identically under Python 2 and 3, unlike
    # the "print >>sys.stderr" statement this replaces.
    sys.stderr.write("arv-get: {}\n".format(msg))
    sys.exit(code)
# Command-line interface for arv-get. The help strings below are shown
# verbatim by argparse; mutually-exclusive groups keep the progress,
# hashing, and overwrite options from being combined inconsistently.
parser = argparse.ArgumentParser(
    description='Copy data from Keep to a local file or pipe.',
    parents=[arv_cmd.retry_opt])
parser.add_argument('--version', action='version',
                    version="%s %s" % (sys.argv[0], __version__),
                    help='Print version and exit.')
parser.add_argument('locator', type=str,
                    help="""
Collection locator, optionally with a file path or prefix.
""")
parser.add_argument('destination', type=str, nargs='?', default='-',
                    help="""
Local file or directory where the data is to be written. Default: stdout.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--progress', action='store_true',
                   help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when it
is not expected to interfere with the output: specifically, stderr is
a tty _and_ either stdout is not a tty, or output is being written to
named files rather than stdout.
""")
group.add_argument('--no-progress', action='store_true',
                   help="""
Do not display human-readable progress on stderr.
""")
group.add_argument('--batch-progress', action='store_true',
                   help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--hash',
                   help="""
Display the hash of each file as it is read from Keep, using the given
hash algorithm. Supported algorithms include md5, sha1, sha224,
sha256, sha384, and sha512.
""")
group.add_argument('--md5sum', action='store_const',
                   dest='hash', const='md5',
                   help="""
Display the MD5 hash of each file as it is read from Keep.
""")
parser.add_argument('-n', action='store_true',
                    help="""
Do not write any data -- just read from Keep, and report md5sums if
requested.
""")
parser.add_argument('-r', action='store_true',
                    help="""
Retrieve all files in the specified collection/prefix. This is the
default behavior if the "locator" argument ends with a forward slash.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('-f', action='store_true',
                   help="""
Overwrite existing files while writing. The default behavior is to
refuse to write *anything* if any of the output files already
exist. As a special case, -f is not needed to write to stdout.
""")
group.add_argument('--skip-existing', action='store_true',
                   help="""
Skip files that already exist. The default behavior is to refuse to
write *anything* if any files exist that would have to be
overwritten. This option causes even devices, sockets, and fifos to be
skipped.
""")
def parse_arguments(arguments, logger):
    """Parse and normalize arv-get command-line arguments.

    :arguments: argument list to parse (None means sys.argv), passed
        through to parser.parse_args.
    :logger: logger used for debug messages about destination rewriting.

    Returns the argparse Namespace, with args.r, args.f, args.progress,
    and args.destination adjusted per the rules documented inline.
    May call parser.error() (which exits) on invalid combinations.
    """
    # BUG FIX: the original called parser.parse_args() with no
    # arguments, silently ignoring the `arguments` parameter.
    args = parser.parse_args(arguments)

    # A trailing slash on the locator implies recursive retrieval.
    if args.locator[-1] == os.sep:
        args.r = True
    if (args.r and
        not args.n and
        not (args.destination and
             os.path.isdir(args.destination))):
        parser.error('Destination is not a directory.')
    if not args.r and (os.path.isdir(args.destination) or
                       args.destination[-1] == os.path.sep):
        args.destination = os.path.join(args.destination,
                                        os.path.basename(args.locator))
        logger.debug("Appended source file name to destination directory: %s",
                     args.destination)

    if args.destination == '/dev/stdout':
        args.destination = "-"

    if args.destination == '-':
        # Normally you have to use -f to write to a file (or device) that
        # already exists, but "-" and "/dev/stdout" are common enough to
        # merit a special exception.
        args.f = True
    else:
        args.destination = args.destination.rstrip(os.sep)

    # Turn on --progress by default if stderr is a tty and output is
    # either going to a named file, or going (via stdout) to something
    # that isn't a tty.
    if (not (args.batch_progress or args.no_progress)
        and sys.stderr.isatty()
        and (args.destination != '-'
             or not sys.stdout.isatty())):
        args.progress = True
    return args
def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
    """arv-get entry point: copy Keep collection data to local storage.

    :arguments: command-line argument list (None means sys.argv).
    :stdout:/:stderr: accepted for interface compatibility.
        NOTE(review): the body below writes to sys.stdout/sys.stderr
        directly, as the original did — confirm before threading the
        parameters through.

    With no path inside the locator, writes the collection's manifest
    text; otherwise extracts the selected file(s), optionally hashing
    and reporting progress.
    """
    global api_client

    logger = logging.getLogger('arvados.arv-get')
    args = parse_arguments(arguments, logger)
    if api_client is None:
        api_client = arvados.api('v1')

    # Split the locator into collection identifier and optional
    # in-collection path/prefix.
    r = re.search(r'^(.*?)(/.*)?$', args.locator)
    collection = r.group(1)
    get_prefix = r.group(2)
    if args.r and not get_prefix:
        get_prefix = os.sep
    reader = arvados.CollectionReader(collection, num_retries=args.retries)

    if not get_prefix:
        # Bare collection locator: emit the manifest text itself.
        if not args.n:
            open_flags = os.O_CREAT | os.O_WRONLY
            if not args.f:
                # Without -f, O_EXCL makes the open fail rather than
                # clobber an existing file.
                open_flags |= os.O_EXCL
            try:
                if args.destination == "-":
                    sys.stdout.write(reader.manifest_text())
                else:
                    out_fd = os.open(args.destination, open_flags)
                    with os.fdopen(out_fd, 'wb') as out_file:
                        out_file.write(reader.manifest_text())
            except (IOError, OSError) as error:
                abort("can't write to '{}': {}".format(args.destination, error))
            except (arvados.errors.ApiError, arvados.errors.KeepReadError) as error:
                abort("failed to download '{}': {}".format(collection, error))
        sys.exit(0)

    # Scan the collection. Make an array of (stream, file, local
    # destination filename) tuples, and add up total size to extract.
    todo = []
    todo_bytes = 0
    try:
        for s, f in files_in_collection(reader):
            if get_prefix and get_prefix[-1] == os.sep:
                # Prefix (directory) mode: keep every file whose path
                # falls under the prefix. (Replaces the Python-2-only
                # string.find() call with an equivalent startswith().)
                if not os.path.join(s.stream_name(), f.name).startswith(
                        '.' + get_prefix):
                    continue
                if args.destination == "-":
                    dest_path = "-"
                else:
                    dest_path = os.path.join(
                        args.destination,
                        os.path.join(s.stream_name(), f.name)[len(get_prefix)+1:])
                    if (not (args.n or args.f or args.skip_existing) and
                        os.path.exists(dest_path)):
                        abort('Local file %s already exists.' % (dest_path,))
            else:
                # Exact-file mode: only the single named file matches.
                if os.path.join(s.stream_name(), f.name) != '.' + get_prefix:
                    continue
                dest_path = args.destination
            todo += [(s, f, dest_path)]
            todo_bytes += f.size()
    except arvados.errors.NotFoundError as e:
        abort(e)

    out_bytes = 0
    for s, f, outfilename in todo:
        outfile = None
        digestor = None
        if not args.n:
            if outfilename == "-":
                outfile = sys.stdout
            else:
                if args.skip_existing and os.path.exists(outfilename):
                    logger.debug('Local file %s exists. Skipping.', outfilename)
                    continue
                elif not args.f and (os.path.isfile(outfilename) or
                                     os.path.isdir(outfilename)):
                    # Good thing we looked again: apparently this file wasn't
                    # here yet when we checked earlier.
                    abort('Local file %s already exists.' % (outfilename,))
                if args.r:
                    arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
                try:
                    outfile = open(outfilename, 'wb')
                except Exception as error:
                    abort('Open(%s) failed: %s' % (outfilename, error))
        if args.hash:
            digestor = hashlib.new(args.hash)
        try:
            with s.open(f.name, 'r') as file_reader:
                for data in file_reader.readall():
                    if outfile:
                        outfile.write(data)
                    if digestor:
                        digestor.update(data)
                    out_bytes += len(data)
                    if args.progress:
                        sys.stderr.write('\r%d MiB / %d MiB %.1f%%' %
                                         (out_bytes >> 20,
                                          todo_bytes >> 20,
                                          (100
                                           if todo_bytes == 0
                                           else 100.0*out_bytes/todo_bytes)))
                    elif args.batch_progress:
                        sys.stderr.write('%s %d read %d total\n' %
                                         (sys.argv[0], os.getpid(),
                                          out_bytes, todo_bytes))
            if digestor:
                sys.stderr.write("%s %s/%s\n"
                                 % (digestor.hexdigest(), s.stream_name(), f.name))
        except KeyboardInterrupt:
            # Remove a partially-written file, but never touch the
            # standard streams (fd 0-2).
            if outfile and (outfile.fileno() > 2) and not outfile.closed:
                os.unlink(outfile.name)
            break
        finally:
            # Close each named output file (the original leaked the
            # handle); stdout stays open for subsequent files.
            if outfile is not None and outfile is not sys.stdout:
                outfile.close()

    if args.progress:
        sys.stderr.write('\n')
def files_in_collection(c):
    """Recursively yield (collection, ArvadosFile) pairs from *c*.

    The yielded collection object is the file's immediate parent, so
    callers can use its stream_name() and open() (as main() does).
    """
    # Sort first by file type, then alphabetically by file path:
    # isinstance(..., Subcollection) is False (files) before True
    # (subdirectories), and names compare case-insensitively.
    for i in sorted(c.keys(),
                    key=lambda k: (
                        isinstance(c[k], arvados.collection.Subcollection),
                        k.upper())):
        if isinstance(c[i], arvados.arvfile.ArvadosFile):
            yield (c, c[i])
        elif isinstance(c[i], arvados.collection.Subcollection):
            for s, f in files_in_collection(c[i]):
                yield (s, f)