logger = logging.getLogger('arvados.arv-get')

def abort(msg, code=1):
    """Print an error message to stderr and stop the program.

    Args:
        msg: human-readable description of the fatal error.
        code: process exit status (default 1).

    Raises:
        SystemExit: always, with the given status code.
    """
    # Python 2 print-to-file syntax, matching the rest of this script.
    print >>sys.stderr, "arv-get:", msg
    # Fix: the visible body only printed the message -- the `code`
    # parameter was unused and callers of abort() would fall through
    # and keep running.  Terminate via SystemExit so the status code
    # actually reaches the shell.
    sys.exit(code)
# ---------------------------------------------------------------------------
# Command-line interface definition.
# NOTE(review): this region was recovered from a lossy extraction.  The
# leading "NN " tokens are original line numbers fused into the text, and
# the unquoted prose lines appear to be fragments of triple-quoted help=
# strings whose surrounding quotes were lost -- restore from pristine
# source before attempting to run this file.
19 parser = argparse.ArgumentParser(
20 description='Copy data from Keep to a local file or pipe.')
# Positionals: the Keep/collection locator to read, and an optional local
# destination path (defaults to stdout).
21 parser.add_argument('locator', type=str,
23 Collection locator, optionally with a file path or prefix.
25 parser.add_argument('destination', type=str, nargs='?', default='/dev/stdout',
27 Local file or directory where the data is to be written. Default:
# Progress-reporting flags are mutually exclusive with each other.
30 group = parser.add_mutually_exclusive_group()
31 group.add_argument('--progress', action='store_true',
33 Display human-readable progress on stderr (bytes and, if possible,
34 percentage of total data size). This is the default behavior when it
35 is not expected to interfere with the output: specifically, stderr is
36 a tty _and_ either stdout is not a tty, or output is being written to
37 named files rather than stdout.
39 group.add_argument('--no-progress', action='store_true',
41 Do not display human-readable progress on stderr.
43 group.add_argument('--batch-progress', action='store_true',
45 Display machine-readable progress on stderr (bytes and, if known,
# Hash options: --hash takes a hashlib algorithm name; --md5sum is
# shorthand that stores the constant 'md5' into the same dest='hash'.
48 group = parser.add_mutually_exclusive_group()
49 group.add_argument('--hash',
51 Display the hash of each file as it is read from Keep, using the given
52 hash algorithm. Supported algorithms include md5, sha1, sha224,
53 sha256, sha384, and sha512.
55 group.add_argument('--md5sum', action='store_const',
56 dest='hash', const='md5',
58 Display the MD5 hash of each file as it is read from Keep.
# -n: dry run (read and hash only); -r: recursive/prefix retrieval.
60 parser.add_argument('-n', action='store_true',
62 Do not write any data -- just read from Keep, and report md5sums if
65 parser.add_argument('-r', action='store_true',
67 Retrieve all files in the specified collection/prefix. This is the
68 default behavior if the "locator" argument ends with a forward slash.
# Overwrite policy: -f (force) and --skip-existing are mutually exclusive.
70 group = parser.add_mutually_exclusive_group()
71 group.add_argument('-f', action='store_true',
73 Overwrite existing files while writing. The default behavior is to
74 refuse to write *anything* if any of the output files already
75 exist. As a special case, -f is not needed to write to /dev/stdout.
77 group.add_argument('--skip-existing', action='store_true',
79 Skip files that already exist. The default behavior is to refuse to
80 write *anything* if any files exist that would have to be
81 overwritten. This option causes even devices, sockets, and fifos to be
85 args = parser.parse_args()
# ---------------------------------------------------------------------------
# Post-process the parsed arguments.
# NOTE(review): lossy extraction -- several lines are missing here (gaps in
# the fused "NN " numbering), so some conditions below are visibly
# truncated mid-expression.
# A trailing os.sep on the locator apparently implies directory mode; the
# destination must then be an existing directory.
87 if args.locator[-1] == os.sep:
91 not (args.destination and
92 os.path.isdir(args.destination))):
93 parser.error('Destination is not a directory.')
# Single-file mode with a directory destination: append the source file's
# basename so we write destdir/<name> instead of erroring on the directory.
94 if not args.r and (os.path.isdir(args.destination) or
95 args.destination[-1] == os.path.sep):
96 args.destination = os.path.join(args.destination,
97 os.path.basename(args.locator))
98 logger.debug("Appended source file name to destination directory: %s",
# "-" is accepted as an alias for stdout.
101 if args.destination == '-':
102 args.destination = '/dev/stdout'
103 if args.destination == '/dev/stdout':
104 # Normally you have to use -f to write to a file (or device) that
105 # already exists, but "-" and "/dev/stdout" are common enough to
106 # merit a special exception.
109 args.destination = args.destination.rstrip(os.sep)
111 # Turn on --progress by default if stderr is a tty and output is
112 # either going to a named file, or going (via stdout) to something
114 if (not (args.batch_progress or args.no_progress)
115 and sys.stderr.isatty()
116 and (args.destination != '/dev/stdout'
117 or not sys.stdout.isatty())):
# Split the locator into the collection identifier (group 1) and an
# optional "/path" suffix (group 2, None when no slash is present).
121 r = re.search(r'^(.*?)(/.*)?$', args.locator)
122 collection = r.group(1)
123 get_prefix = r.group(2)
124 if args.r and not get_prefix:
# ---------------------------------------------------------------------------
# Build the work list of files to extract.
# NOTE(review): lossy extraction -- the try: openers and parts of the
# exception handlers are missing in this region.
# Refuse to overwrite an existing destination unless -f was given.
132 if not args.f and os.path.exists(args.destination):
133 abort('Local file %s already exists.' % (args.destination,))
134 with open(args.destination, 'wb') as f:
136 c = arvados.api('v1').collections().get(
137 uuid=collection).execute()
138 manifest = c['manifest_text']
139 except Exception as e:
# API lookup failed: fall back to fetching the manifest block directly
# from Keep (explicitly deprecated, per the log message).
141 "Collection %s not found. " +
142 "Trying to fetch directly from Keep (deprecated).",
144 manifest = arvados.Keep.get(collection)
147 except arvados.errors.NotFoundError as e:
150 reader = arvados.CollectionReader(collection)
152 # Scan the collection. Make an array of (stream, file, local
153 # destination filename) tuples, and add up total size to extract.
156 for s in reader.all_streams():
157 for f in s.all_files():
# Prefix ending in os.sep: keep every file whose "stream/file" path
# starts with the prefix (Python 2 string.find(...) == 0 is the
# pre-str.startswith idiom).
158 if get_prefix and get_prefix[-1] == os.sep:
159 if 0 != string.find(os.path.join(s.name(), f.name()),
# Destination = destination dir + the path remainder after the prefix.
162 dest_path = os.path.join(
164 os.path.join(s.name(), f.name())[len(get_prefix)+1:])
165 if (not (args.n or args.f or args.skip_existing) and
166 os.path.exists(dest_path)):
167 abort('Local file %s already exists.' % (dest_path,))
# Exact (non-directory) prefix: skip everything except the single
# matching file, which is written to args.destination itself.
169 if os.path.join(s.name(), f.name()) != '.' + get_prefix:
171 dest_path = args.destination
172 todo += [(s, f, dest_path)]
173 todo_bytes += f.size()
174 except arvados.errors.NotFoundError as e:
# ---------------------------------------------------------------------------
177 # Read data, and (if not -n) write to local file(s) or pipe.
# NOTE(review): lossy extraction -- the try: openers and some flow-control
# lines of this loop are missing in this region.
180 for s,f,outfilename in todo:
# Re-check existence at write time: the scan-time check above may be
# stale by the time we get here.
184 if args.skip_existing and os.path.exists(outfilename):
185 logger.debug('Local file %s exists. Skipping.', outfilename)
187 elif not args.f and (os.path.isfile(outfilename) or
188 os.path.isdir(outfilename)):
189 # Good thing we looked again: apparently this file wasn't
190 # here yet when we checked earlier.
191 abort('Local file %s already exists.' % (outfilename,))
193 arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
195 outfile = open(outfilename, 'wb')
196 except Exception as e:
197 abort('Open(%s) failed: %s' % (outfilename, e))
# Optional running digest of the file's bytes (--hash / --md5sum).
199 digestor = hashlib.new(args.hash)
201 for data in f.readall():
205 digestor.update(data)
206 out_bytes += len(data)
# Human-readable progress: the leading \r rewrites the same tty line.
208 sys.stderr.write('\r%d MiB / %d MiB %.1f%%' %
213 else 100.0*out_bytes/todo_bytes)))
214 elif args.batch_progress:
# Machine-readable progress, one line per chunk.
215 sys.stderr.write('%s %d read %d total\n' %
216 (sys.argv[0], os.getpid(),
217 out_bytes, todo_bytes))
219 sys.stderr.write("%s %s/%s\n"
220 % (digestor.hexdigest(), s.name(), f.name()))
221 except KeyboardInterrupt:
# On Ctrl-C, remove the partially-written output file.
# NOTE(review): comparing the file object `outfile` against the string
# '/dev/stdout' looks suspect -- `outfilename` was probably intended;
# verify against pristine source.
222 if outfile and outfile != '/dev/stdout':
223 os.unlink(outfilename)
227 sys.stderr.write('\n')