#!/usr/bin/env python

import argparse
import hashlib
import logging
import os
import re
import string
import sys

import arvados

logger = logging.getLogger(os.path.basename(sys.argv[0]))
parser = argparse.ArgumentParser(
    description='Copy data from Keep to a local file or pipe.')
parser.add_argument('locator', type=str,
                    help="""
Collection locator, optionally with a file path or prefix.
""")
parser.add_argument('destination', type=str, nargs='?', default='/dev/stdout',
                    help="""
Local file or directory where the data is to be written. Default:
/dev/stdout.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--progress', action='store_true',
                   help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when
stderr is a tty and stdout is not a tty.
""")
group.add_argument('--no-progress', action='store_true',
                   help="""
Do not display human-readable progress on stderr.
""")
group.add_argument('--batch-progress', action='store_true',
                   help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('--hash',
                   help="""
Display the hash of each file as it is read from Keep, using the given
hash algorithm. Supported algorithms include md5, sha1, sha224,
sha256, sha384, and sha512.
""")
group.add_argument('--md5sum', action='store_const',
                   dest='hash', const='md5',
                   help="""
Display the MD5 hash of each file as it is read from Keep.
""")
parser.add_argument('-n', action='store_true',
                    help="""
Do not write any data -- just read from Keep, and report md5sums if
requested.
""")
parser.add_argument('-r', action='store_true',
                    help="""
Retrieve all files in the specified collection/prefix. This is the
default behavior if the "locator" argument ends with a forward slash.
""")
group = parser.add_mutually_exclusive_group()
group.add_argument('-f', action='store_true',
                   help="""
Overwrite existing files while writing. The default behavior is to
refuse to write *anything* if any of the output files already
exist. As a special case, -f is not needed to write to /dev/stdout.
""")
group.add_argument('--skip-existing', action='store_true',
                   help="""
Skip files that already exist. The default behavior is to refuse to
write *anything* if any files exist that would have to be
overwritten. This option causes even devices, sockets, and fifos to be
skipped.
""")
args = parser.parse_args()
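# A trailing slash on the locator implies -r; a recursive download
# (unless -n is given) must go to an existing directory.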
if args.locator[-1] == os.sep:
    args.r = True
if (args.r and
    not args.n and
    not (args.destination and
         os.path.isdir(args.destination))):
    parser.error('Destination is not a directory.')
if not args.r and (os.path.isdir(args.destination) or
                   args.destination[-1] == os.path.sep):
    args.destination = os.path.join(args.destination,
                                    os.path.basename(args.locator))
    logger.debug("Appended source file name to destination directory: %s" %
                 args.destination)
# Turn on --progress by default if stderr is a tty and stdout isn't.
if (not (args.batch_progress or args.no_progress)
    and os.isatty(sys.stderr.fileno())
    and not os.isatty(sys.stdout.fileno())):
    args.progress = True
if args.destination == '-':
    args.destination = '/dev/stdout'
if args.destination == '/dev/stdout':
    # Normally you have to use -f to write to a file (or device) that
    # already exists, but "-" and "/dev/stdout" are common enough to
    # merit a special exception.
    args.f = True
else:
    args.destination = args.destination.rstrip(os.sep)
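# Split the locator into a collection identifier and an optional file
# path or prefix within that collection.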
r = re.search(r'^(.*?)(/.*)?$', args.locator)
collection = r.group(1)
get_prefix = r.group(2)
if args.r and not get_prefix:
    get_prefix = os.sep

todo = []
todo_bytes = 0
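# With no file path or prefix (and no -r), just fetch the collection's
# manifest text, write it to the destination, and exit.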
if not get_prefix:
    try:
        if not args.n:
            if not args.f and os.path.exists(args.destination):
                logger.error('Local file %s already exists' % args.destination)
                sys.exit(1)
            with open(args.destination, 'wb') as f:
                try:
                    c = arvados.api('v1').collections().get(
                        uuid=collection).execute()
                    manifest = c['manifest_text']
                except Exception as e:
                    logger.warning(
                        "API lookup failed for collection %s (%s: %s)" %
                        (collection, type(e), str(e)))
                    manifest = arvados.Keep.get(collection)
                f.write(manifest)
        sys.exit(0)
    except arvados.errors.NotFoundError as e:
        logger.error(e)
        sys.exit(1)
reader = arvados.CollectionReader(collection)
# Scan the collection. Make an array of (stream, file, local
# destination filename) tuples, and add up total size to extract.
try:
    for s in reader.all_streams():
        for f in s.all_files():
            if get_prefix and get_prefix[-1] == os.sep:
                if 0 != string.find(os.path.join(s.name(), f.name()),
                                    '.' + get_prefix):
                    continue
                dest_path = os.path.join(
                    args.destination,
                    os.path.join(s.name(), f.name())[len(get_prefix)+1:])
                if (not (args.n or args.f or args.skip_existing) and
                        os.path.exists(dest_path)):
                    logger.error('Local file %s already exists' % dest_path)
                    sys.exit(1)
            else:
                if os.path.join(s.name(), f.name()) != '.' + get_prefix:
                    continue
                dest_path = args.destination
            todo += [(s, f, dest_path)]
            todo_bytes += f.size()
except arvados.errors.NotFoundError as e:
    logger.error(e)
    sys.exit(1)
# Read data, and (if not -n) write to local file(s) or pipe.
out_bytes = 0
for s, f, outfilename in todo:
    outfile = None
    digestor = None
    if not args.n:
        if args.skip_existing and os.path.exists(outfilename):
            logger.debug('Local file %s exists. Skipping.' % outfilename)
            continue
        elif not args.f and (os.path.isfile(outfilename) or
                             os.path.isdir(outfilename)):
            # Good thing we looked again: apparently this file wasn't
            # here yet when we checked earlier.
            logger.error('Local file %s already exists' % outfilename)
            sys.exit(1)
        if args.r:
            arvados.util.mkdir_dash_p(os.path.dirname(outfilename))
        try:
            outfile = open(outfilename, 'wb')
        except Exception as e:
            logger.error('Open(%s) failed: %s' % (outfilename, e))
            sys.exit(1)
    if args.hash:
        digestor = hashlib.new(args.hash)
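    # Stream the file contents from Keep, writing to the local file or
    # pipe (unless -n), feeding the digest, and reporting progress as
    # requested.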
    try:
        for data in f.readall():
            if outfile:
                outfile.write(data)
            if digestor:
                digestor.update(data)
            out_bytes += len(data)
            if args.progress:
                sys.stderr.write('\r%d MiB / %d MiB %.1f%%' %
                                 (out_bytes >> 20,
                                  todo_bytes >> 20,
                                  (100
                                   if todo_bytes == 0
                                   else 100.0*out_bytes/todo_bytes)))
            elif args.batch_progress:
                sys.stderr.write('%s %d read %d total\n' %
                                 (sys.argv[0], os.getpid(),
                                  out_bytes, todo_bytes))
        if digestor:
            sys.stderr.write("%s %s/%s\n"
                             % (digestor.hexdigest(), s.name(), f.name()))
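    # If interrupted, discard the partially written output file (but
    # never unlink /dev/stdout) and stop processing.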
    except KeyboardInterrupt:
        if outfile and outfilename != '/dev/stdout':
            os.unlink(outfilename)
        break

if args.progress:
    sys.stderr.write('\n')