X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/c7f445954df35959174761ba7b1f44ecf377c87a..66c13b6055a363cb08197b8c5d040ed9a511c8ca:/sdk/python/arvados/commands/put.py diff --git a/sdk/python/arvados/commands/put.py b/sdk/python/arvados/commands/put.py index e4e1b6dad2..4a926c701c 100644 --- a/sdk/python/arvados/commands/put.py +++ b/sdk/python/arvados/commands/put.py @@ -6,18 +6,23 @@ import argparse import arvados import base64 +import datetime import errno import fcntl import hashlib import json import os +import pwd import signal +import socket import sys import tempfile +from apiclient import errors as apiclient_errors import arvados.commands._util as arv_cmd CAUGHT_SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM] +api_client = None upload_opts = argparse.ArgumentParser(add_help=False) @@ -91,7 +96,23 @@ input file. It can be used only if there is exactly one path given and it is not a directory. Implies --manifest. """) +upload_opts.add_argument('--portable-data-hash', action='store_true', + help=""" +Print the portable data hash instead of the Arvados UUID for the collection +created by the upload. +""") + run_opts = argparse.ArgumentParser(add_help=False) + +run_opts.add_argument('--project-uuid', metavar='UUID', help=""" +Store the collection in the specified project, instead of your Home +project. +""") + +run_opts.add_argument('--name', help=""" +Save the collection with the specified name. +""") + _group = run_opts.add_mutually_exclusive_group() _group.add_argument('--progress', action='store_true', help=""" @@ -124,7 +145,7 @@ Do not continue interrupted uploads from cached state. arg_parser = argparse.ArgumentParser( description='Copy data from the local filesystem to Keep.', - parents=[upload_opts, run_opts]) + parents=[upload_opts, run_opts, arv_cmd.retry_opt]) def parse_arguments(arguments): args = arg_parser.parse_args(arguments) @@ -158,10 +179,6 @@ class ResumeCacheConflict(Exception): class ResumeCache(object): CACHE_DIR = '.cache/arvados/arv-put' - @classmethod - def setup_user_cache(cls): - return arv_cmd.make_home_conf_dir(cls.CACHE_DIR, 0o700) - def __init__(self, file_spec): self.cache_file = open(file_spec, 'a+') self._lock_file(self.cache_file) @@ -177,7 +194,9 @@ class ResumeCache(object): md5.update(str(max(args.max_manifest_depth, -1))) elif args.filename: md5.update(args.filename) - return os.path.join(cls.CACHE_DIR, md5.hexdigest()) + return os.path.join( + arv_cmd.make_home_conf_dir(cls.CACHE_DIR, 0o700, 'raise'), + md5.hexdigest()) def _lock_file(self, fileobj): try: @@ -226,23 +245,27 @@ class ArvPutCollectionWriter(arvados.ResumableCollectionWriter): STATE_PROPS = (arvados.ResumableCollectionWriter.STATE_PROPS + ['bytes_written', '_seen_inputs']) - def __init__(self, cache=None, reporter=None, bytes_expected=None): + def __init__(self, cache=None, reporter=None, bytes_expected=None, + api_client=None, num_retries=0): self.bytes_written = 0 self._seen_inputs = [] self.cache = cache self.reporter = reporter self.bytes_expected = bytes_expected - super(ArvPutCollectionWriter, self).__init__() + super(ArvPutCollectionWriter, self).__init__( + api_client, num_retries=num_retries) @classmethod - def from_cache(cls, cache, reporter=None, bytes_expected=None): + def from_cache(cls, cache, reporter=None, bytes_expected=None, + num_retries=0): try: state = cache.load() state['_data_buffer'] = [base64.decodestring(state['_data_buffer'])] - writer = cls.from_state(state, cache, reporter, bytes_expected) + writer = cls.from_state(state, cache, reporter, bytes_expected, + num_retries=num_retries) except (TypeError, ValueError, arvados.errors.StaleWriterStateError) as error: - return cls(cache, reporter, bytes_expected) + return cls(cache, reporter, bytes_expected, num_retries=num_retries) else: return writer @@ -328,8 +351,48 @@ def progress_writer(progress_func, outfile=sys.stderr): def exit_signal_handler(sigcode, frame): sys.exit(-sigcode) -def main(arguments=None): +def desired_project_uuid(api_client, project_uuid, num_retries): + if not project_uuid: + query = api_client.users().current() + elif arvados.util.user_uuid_pattern.match(project_uuid): + query = api_client.users().get(uuid=project_uuid) + elif arvados.util.group_uuid_pattern.match(project_uuid): + query = api_client.groups().get(uuid=project_uuid) + else: + raise ValueError("Not a valid project UUID: {}".format(project_uuid)) + return query.execute(num_retries=num_retries)['uuid'] + +def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr): + global api_client + args = parse_arguments(arguments) + status = 0 + if api_client is None: + api_client = arvados.api('v1') + + # Determine the name to use + if args.name: + if args.stream or args.raw: + print >>stderr, "Cannot use --name with --stream or --raw" + sys.exit(1) + collection_name = args.name + else: + collection_name = "Saved at {} by {}@{}".format( + datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC"), + pwd.getpwuid(os.getuid()).pw_name, + socket.gethostname()) + + if args.project_uuid and (args.stream or args.raw): + print >>stderr, "Cannot use --project-uuid with --stream or --raw" + sys.exit(1) + + # Determine the parent project + try: + project_uuid = desired_project_uuid(api_client, args.project_uuid, + args.retries) + except (apiclient_errors.Error, ValueError) as error: + print >>stderr, error + sys.exit(1) if args.progress: reporter = progress_writer(human_progress) @@ -340,22 +403,23 @@ def main(arguments=None): bytes_expected = expected_bytes_for(args.paths) resume_cache = None - try: - if ResumeCache.setup_user_cache() is not None: + if args.resume: + try: resume_cache = ResumeCache(ResumeCache.make_path(args)) - except (IOError, OSError): - pass # Couldn't open cache directory/file. Continue without it. - except ResumeCacheConflict: - print "arv-put: Another process is already uploading this data." - sys.exit(1) + except (IOError, OSError, ValueError): + pass # Couldn't open cache directory/file. Continue without it. + except ResumeCacheConflict: + print >>stderr, "\n".join([ + "arv-put: Another process is already uploading this data.", + " Use --no-resume if this is really what you want."]) + sys.exit(1) if resume_cache is None: - writer = ArvPutCollectionWriter(resume_cache, reporter, bytes_expected) + writer = ArvPutCollectionWriter(resume_cache, reporter, bytes_expected, + num_retries=args.retries) else: - if not args.resume: - resume_cache.restart() writer = ArvPutCollectionWriter.from_cache( - resume_cache, reporter, bytes_expected) + resume_cache, reporter, bytes_expected, num_retries=args.retries) # Install our signal handler for each code in CAUGHT_SIGNALS, and save # the originals. @@ -363,11 +427,11 @@ def main(arguments=None): for sigcode in CAUGHT_SIGNALS} if writer.bytes_written > 0: # We're resuming a previous upload. - print >>sys.stderr, "\n".join([ + print >>stderr, "\n".join([ "arv-put: Resuming previous upload from last checkpoint.", " Use the --no-resume option to start over."]) - writer.report_progress() + writer.report_progress() writer.do_queued_work() # Do work resumed from cache. for path in args.paths: # Copy file data to Keep. if os.path.isdir(path): @@ -379,29 +443,52 @@ def main(arguments=None): writer.finish_current_stream() if args.progress: # Print newline to split stderr from stdout for humans. - print >>sys.stderr + print >>stderr if args.stream: - print writer.manifest_text(), + output = writer.manifest_text() elif args.raw: - print ','.join(writer.data_locators()) + output = ','.join(writer.data_locators()) else: - # Register the resulting collection in Arvados. - collection = arvados.api().collections().create( - body={ - 'uuid': writer.finish(), - 'manifest_text': writer.manifest_text(), - }, - ).execute() - - # Print the locator (uuid) of the new collection. - print collection['uuid'] + try: + # Register the resulting collection in Arvados. + collection = api_client.collections().create( + body={ + 'owner_uuid': project_uuid, + 'name': collection_name, + 'manifest_text': writer.manifest_text() + }, + ensure_unique_name=True + ).execute(num_retries=args.retries) + + print >>stderr, "Collection saved as '%s'" % collection['name'] + + if args.portable_data_hash and 'portable_data_hash' in collection and collection['portable_data_hash']: + output = collection['portable_data_hash'] + else: + output = collection['uuid'] + + except apiclient_errors.Error as error: + print >>stderr, ( + "arv-put: Error creating Collection on project: {}.".format( + error)) + status = 1 + + # Print the locator (uuid) of the new collection. + stdout.write(output) + if not output.endswith('\n'): + stdout.write('\n') for sigcode, orig_handler in orig_signal_handlers.items(): signal.signal(sigcode, orig_handler) + if status != 0: + sys.exit(status) + if resume_cache is not None: resume_cache.destroy() + return output + if __name__ == '__main__': main()