# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from __future__ import division
from future.utils import listitems, listvalues
from builtins import str
from builtins import object

import argparse
import arvados
import arvados.collection
import ciso8601
import copy
import datetime
import errno
import fcntl
import fnmatch
import hashlib
import json
import logging
import os
import pwd
import re
import socket
import sys
import tempfile
import threading
import time
import traceback

from apiclient import errors as apiclient_errors
from arvados._version import __version__
from arvados.util import keep_locator_pattern

import arvados.commands._util as arv_cmd
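
# The CLI is assembled from two partial parsers: upload_opts (how data is
# written to Keep) and run_opts (overall command behavior). Both are combined,
# together with the shared --retries option, into arg_parser below.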
upload_opts = argparse.ArgumentParser(add_help=False)

upload_opts.add_argument('--version', action='version',
                         version="%s %s" % (sys.argv[0], __version__),
                         help='Print version and exit.')
upload_opts.add_argument('paths', metavar='path', type=str, nargs='*',
                         help="""
Local file or directory. If path is a directory reference with a trailing
slash, then just upload the directory's contents; otherwise upload the
directory itself. Default: read from standard input.
""")

_group = upload_opts.add_mutually_exclusive_group()

_group.add_argument('--max-manifest-depth', type=int, metavar='N',
                    default=-1, help=argparse.SUPPRESS)

_group.add_argument('--normalize', action='store_true',
                    help="""
Normalize the manifest by re-ordering files and streams after writing
data.
""")

_group.add_argument('--dry-run', action='store_true', default=False,
                    help="""
Don't actually upload files, but only check if any file should be
uploaded. Exit with code=2 when files are pending for upload.
""")

_group = upload_opts.add_mutually_exclusive_group()

_group.add_argument('--as-stream', action='store_true', dest='stream',
                    help="""
Synonym for --stream.
""")

_group.add_argument('--stream', action='store_true',
                    help="""
Store the file content and display the resulting manifest on
stdout. Do not save a Collection object in Arvados.
""")

_group.add_argument('--as-manifest', action='store_true', dest='manifest',
                    help="""
Synonym for --manifest.
""")

_group.add_argument('--in-manifest', action='store_true', dest='manifest',
                    help="""
Synonym for --manifest.
""")

_group.add_argument('--manifest', action='store_true',
                    help="""
Store the file data and resulting manifest in Keep, save a Collection
object in Arvados, and display the manifest locator (Collection uuid)
on stdout. This is the default behavior.
""")

_group.add_argument('--as-raw', action='store_true', dest='raw',
                    help="""
Synonym for --raw.
""")

_group.add_argument('--raw', action='store_true',
                    help="""
Store the file content and display the data block locators on stdout,
separated by commas, with a trailing newline. Do not store a
manifest.
""")

upload_opts.add_argument('--update-collection', type=str, default=None,
                         dest='update_collection', metavar="UUID", help="""
Update an existing collection identified by the given Arvados collection
UUID. All new local files will be uploaded.
""")

upload_opts.add_argument('--use-filename', type=str, default=None,
                         dest='filename', help="""
Synonym for --filename.
""")

upload_opts.add_argument('--filename', type=str, default=None,
                         help="""
Use the given filename in the manifest, instead of the name of the
local file. This is useful when "-" or "/dev/stdin" is given as an
input file. It can be used only if there is exactly one path given and
it is not a directory. Implies --manifest.
""")

upload_opts.add_argument('--portable-data-hash', action='store_true',
                         help="""
Print the portable data hash instead of the Arvados UUID for the collection
created by the upload.
""")

upload_opts.add_argument('--replication', type=int, metavar='N', default=None,
                         help="""
Set the replication level for the new collection: how many different
physical storage devices (e.g., disks) should have a copy of each data
block. Default is to use the server-provided default (if any) or 2.
""")

upload_opts.add_argument('--storage-classes', help="""
Specify a comma-separated list of storage classes to be used when saving data to Keep.
""")

upload_opts.add_argument('--threads', type=int, metavar='N', default=None,
                         help="""
Set the number of upload threads to be used. Note that using many
threads will increase the RAM requirements. Default is to use 2 threads.
On high latency installations, using a greater number will improve
overall throughput.
""")

upload_opts.add_argument('--exclude', metavar='PATTERN', default=[],
                         action='append', help="""
Exclude files and directories whose names match the given glob pattern. When
using a path-like pattern like 'subdir/*.txt', all text files inside 'subdir'
directory, relative to the provided input dirs, will be excluded.
When using a filename pattern like '*.txt', any text file will be excluded
no matter where it is placed.
For the special case of needing to exclude only files or dirs directly below
the given input directory, you can use a pattern like './exclude_this.gif'.
You can specify multiple patterns by using this argument more than once.
""")

_group = upload_opts.add_mutually_exclusive_group()
_group.add_argument('--follow-links', action='store_true', default=True,
                    dest='follow_links', help="""
Follow file and directory symlinks (default).
""")
_group.add_argument('--no-follow-links', action='store_false', dest='follow_links',
                    help="""
Ignore file and directory symlinks. Even paths given explicitly on the
command line will be skipped if they are symlinks.
""")

run_opts = argparse.ArgumentParser(add_help=False)

run_opts.add_argument('--project-uuid', metavar='UUID', help="""
Store the collection in the specified project, instead of your Home
project.
""")

run_opts.add_argument('--name', help="""
Save the collection with the specified name.
""")

_group = run_opts.add_mutually_exclusive_group()
_group.add_argument('--progress', action='store_true',
                    help="""
Display human-readable progress on stderr (bytes and, if possible,
percentage of total data size). This is the default behavior when
stderr is a tty.
""")

_group.add_argument('--no-progress', action='store_true',
                    help="""
Do not display human-readable progress on stderr, even if stderr is a
tty.
""")

_group.add_argument('--batch-progress', action='store_true',
                    help="""
Display machine-readable progress on stderr (bytes and, if known,
total data size).
""")

run_opts.add_argument('--silent', action='store_true',
                      help="""
Do not print any debug messages to console. (Any error messages will
be displayed.)
""")

run_opts.add_argument('--batch', action='store_true', default=False,
                      help="""
Retries with '--no-resume --no-cache' if cached state contains invalid/expired
block signatures.
""")

_group = run_opts.add_mutually_exclusive_group()
_group.add_argument('--resume', action='store_true', default=True,
                    help="""
Continue interrupted uploads from cached state (default).
""")
_group.add_argument('--no-resume', action='store_false', dest='resume',
                    help="""
Do not continue interrupted uploads from cached state.
""")

_group = run_opts.add_mutually_exclusive_group()
_group.add_argument('--cache', action='store_true', dest='use_cache', default=True,
                    help="""
Save upload state in a cache file for resuming (default).
""")
_group.add_argument('--no-cache', action='store_false', dest='use_cache',
                    help="""
Do not save upload state in a cache file for resuming.
""")

_group = upload_opts.add_mutually_exclusive_group()
_group.add_argument('--trash-at', metavar='YYYY-MM-DDTHH:MM', default=None,
                    help="""
Set the trash date of the resulting collection to an absolute date in the future.
The accepted format is defined by the ISO 8601 standard. Examples: 20090103, 2009-01-03, 20090103T181505, 2009-01-03T18:15:05.\n
Timezone information can be added. If not provided, the date/time is assumed to be in the local system's timezone.
""")
_group.add_argument('--trash-after', type=int, metavar='DAYS', default=None,
                    help="""
Set the trash date of the resulting collection to the given number of days
after the upload process finishes.
""")

arg_parser = argparse.ArgumentParser(
    description='Copy data from the local filesystem to Keep.',
    parents=[upload_opts, run_opts, arv_cmd.retry_opt])
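
# A typical invocation, for illustration (the project UUID below is a
# made-up example):
#   arv-put --project-uuid zzzzz-j7d0g-0123456789abcde --name 'my data' mydir/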

def parse_arguments(arguments):
    args = arg_parser.parse_args(arguments)

    if len(args.paths) == 0:
        args.paths = ['-']

    args.paths = ["-" if x == "/dev/stdin" else x for x in args.paths]

    if args.filename and (len(args.paths) != 1 or os.path.isdir(args.paths[0])):
        arg_parser.error("""
    --filename argument cannot be used when storing a directory or
    multiple files.
    """)

    # Turn on --progress by default if stderr is a tty.
    if (not (args.batch_progress or args.no_progress or args.silent)
        and os.isatty(sys.stderr.fileno())):
        args.progress = True

    # Turn off --resume (default) if --no-cache is used.
    if not args.use_cache:
        args.resume = False

    if args.paths == ['-']:
        if args.update_collection:
            arg_parser.error("""
    --update-collection cannot be used when reading from stdin.
    """)
        args.resume = False
        args.use_cache = False
        if not args.filename:
            args.filename = 'stdin'

    # Remove possible duplicated patterns
    if len(args.exclude) > 0:
        args.exclude = list(set(args.exclude))

    return args


class PathDoesNotExistError(Exception):
    pass


class CollectionUpdateError(Exception):
    pass


class ResumeCacheConflict(Exception):
    pass


class ResumeCacheInvalidError(Exception):
    pass


class ArvPutArgumentConflict(Exception):
    pass


class ArvPutUploadIsPending(Exception):
    pass


class ArvPutUploadNotPending(Exception):
    pass


class FileUploadList(list):
    def __init__(self, dry_run=False):
        list.__init__(self)
        self.dry_run = dry_run
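
    # In dry-run mode, queuing even a single file means there is something
    # pending to upload, so append() signals it right away by raising
    # ArvPutUploadIsPending.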
    def append(self, other):
        if self.dry_run:
            raise ArvPutUploadIsPending()
        super(FileUploadList, self).append(other)


# Appends the X-Request-Id to the log message when log level is ERROR or DEBUG
class ArvPutLogFormatter(logging.Formatter):
    std_fmtr = logging.Formatter(arvados.log_format, arvados.log_date_format)
    err_fmtr = None
    request_id_informed = False

    def __init__(self, request_id):
        self.err_fmtr = logging.Formatter(
            arvados.log_format+' (X-Request-Id: {})'.format(request_id),
            arvados.log_date_format)
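
    # The request id is appended only to the first DEBUG/ERROR record, so it
    # gets logged once without cluttering every subsequent line.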
    def format(self, record):
        if (not self.request_id_informed) and (record.levelno in (logging.DEBUG, logging.ERROR)):
            self.request_id_informed = True
            return self.err_fmtr.format(record)
        return self.std_fmtr.format(record)


class ResumeCache(object):
    CACHE_DIR = '.cache/arvados/arv-put'

    def __init__(self, file_spec):
        self.cache_file = open(file_spec, 'a+')
        self._lock_file(self.cache_file)
        self.filename = self.cache_file.name
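
    # The cache path is an MD5 digest of the API host, the sorted real paths
    # and (for single-file uploads) the --filename override, so different
    # invocations get different cache files.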
    @classmethod
    def make_path(cls, args):
        md5 = hashlib.md5()
        md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost').encode())
        realpaths = sorted(os.path.realpath(path) for path in args.paths)
        md5.update(b'\0'.join([p.encode() for p in realpaths]))
        if any(os.path.isdir(path) for path in realpaths):
            md5.update(b'-1')
        elif args.filename:
            md5.update(args.filename.encode())
        return os.path.join(
            arv_cmd.make_home_conf_dir(cls.CACHE_DIR, 0o700, 'raise'),
            md5.hexdigest())

    def _lock_file(self, fileobj):
        try:
            fcntl.flock(fileobj, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            raise ResumeCacheConflict(u"{} locked".format(fileobj.name))

    def load(self):
        self.cache_file.seek(0)
        return json.load(self.cache_file)

    def check_cache(self, api_client=None, num_retries=0):
        try:
            state = self.load()
            locator = None
            if "_finished_streams" in state and len(state["_finished_streams"]) > 0:
                locator = state["_finished_streams"][0][1][0]
            elif "_current_stream_locators" in state and len(state["_current_stream_locators"]) > 0:
                locator = state["_current_stream_locators"][0]
            if locator is not None:
                kc = arvados.keep.KeepClient(api_client=api_client)
                kc.head(locator, num_retries=num_retries)
        except Exception as e:
            self.restart()
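
    # save() writes the new state to a locked temp file in the same directory
    # and then rename()s it over the old cache, so a concurrent reader never
    # sees a partially-written state file.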
    def save(self, data):
        try:
            new_cache_fd, new_cache_name = tempfile.mkstemp(
                dir=os.path.dirname(self.filename))
            self._lock_file(new_cache_fd)
            new_cache = os.fdopen(new_cache_fd, 'r+')
            json.dump(data, new_cache)
            os.rename(new_cache_name, self.filename)
        except (IOError, OSError, ResumeCacheConflict):
            try:
                os.unlink(new_cache_name)
            except NameError:  # mkstemp failed.
                pass
        else:
            self.cache_file.close()
            self.cache_file = new_cache
    def close(self):
        self.cache_file.close()

    def destroy(self):
        try:
            os.unlink(self.filename)
        except OSError as error:
            if error.errno != errno.ENOENT:  # ENOENT means it was already gone; that's what we wanted anyway.
                raise
        self.close()

    def restart(self):
        self.destroy()
        self.__init__(self.filename)


class ArvPutUploadJob(object):
    CACHE_DIR = '.cache/arvados/arv-put'
    EMPTY_STATE = {
        'manifest' : None, # Last saved manifest checkpoint
        'files' : {} # Previous run file list: {path : {size, mtime}}
    }

    def __init__(self, paths, resume=True, use_cache=True, reporter=None,
                 name=None, owner_uuid=None, api_client=None, batch_mode=False,
                 ensure_unique_name=False, num_retries=None,
                 put_threads=None, replication_desired=None, filename=None,
                 update_time=60.0, update_collection=None, storage_classes=None,
                 logger=logging.getLogger('arvados.arv_put'), dry_run=False,
                 follow_links=True, exclude_paths=[], exclude_names=None,
                 trash_at=None):
        self.paths = paths
        self.resume = resume
        self.use_cache = use_cache
        self.batch_mode = batch_mode
        self.update = False
        self.reporter = reporter
        # This will be set to 0 before counting starts, if no special files
        # are going to be read.
        self.bytes_expected = None
        self.bytes_written = 0
        self.bytes_skipped = 0
        self.name = name
        self.owner_uuid = owner_uuid
        self.ensure_unique_name = ensure_unique_name
        self.num_retries = num_retries
        self.replication_desired = replication_desired
        self.put_threads = put_threads
        self.filename = filename
        self.storage_classes = storage_classes
        self._api_client = api_client
        self._state_lock = threading.Lock()
        self._state = None  # Previous run state (file list & manifest)
        self._current_files = []  # Current run file list
        self._cache_file = None
        self._collection_lock = threading.Lock()
        self._remote_collection = None  # Collection being updated (if asked)
        self._local_collection = None  # Collection from previous run manifest
        self._file_paths = set()  # Files to be updated in remote collection
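
        # A daemon thread periodically checkpoints upload state (manifest and
        # file list) so an interrupted run can be resumed from the cache.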
        self._stop_checkpointer = threading.Event()
        self._checkpointer = threading.Thread(target=self._update_task)
        self._checkpointer.daemon = True
        self._update_task_time = update_time  # How many seconds to wait between update runs
        self._files_to_upload = FileUploadList(dry_run=dry_run)
        self._upload_started = False
        self.logger = logger
        self.dry_run = dry_run
        self._checkpoint_before_quit = True
        self.follow_links = follow_links
        self.exclude_paths = exclude_paths
        self.exclude_names = exclude_names
        self._trash_at = trash_at

        if self._trash_at is not None:
            if type(self._trash_at) not in [datetime.datetime, datetime.timedelta]:
                raise TypeError('trash_at should be None, timezone-naive datetime or timedelta')
            if type(self._trash_at) == datetime.datetime and self._trash_at.tzinfo is not None:
                raise TypeError('provided trash_at datetime should be timezone-naive')

        if not self.use_cache and self.resume:
            raise ArvPutArgumentConflict('resume cannot be True when use_cache is False')

        # Check for obvious dry-run responses
        if self.dry_run and (not self.use_cache or not self.resume):
            raise ArvPutUploadIsPending()

        # Load cached data if any and if needed
        self._setup_state(update_collection)

        # Build the upload file list, excluding requested files and counting the
        # bytes expected to be uploaded.
        self._build_upload_list()

    def _build_upload_list(self):
        """
        Scan the requested paths to count file sizes, excluding requested files
        and dirs and building the upload file list.
        """
        # If there aren't special files to be read, reset total bytes count to
        # zero so regular file sizes can be counted ahead of the upload.
        if not any([p for p in self.paths
                    if not (os.path.isfile(p) or os.path.isdir(p))]):
            self.bytes_expected = 0

        for path in self.paths:
            # Test for stdin first, in case some file named '-' exists.
            if path == '-':
                if self.dry_run:
                    raise ArvPutUploadIsPending()
                self._write_stdin(self.filename or 'stdin')
            elif not os.path.exists(path):
                raise PathDoesNotExistError(u"file or directory '{}' does not exist.".format(path))
            elif (not self.follow_links) and os.path.islink(path):
                self.logger.warning("Skipping symlink '{}'".format(path))
                continue
            elif os.path.isdir(path):
                # Use absolute paths on cache index so CWD doesn't interfere
                # with the caching logic.
                orig_path = path
                path = os.path.abspath(path)
                if orig_path[-1:] == os.sep:
                    # When passing a directory reference with a trailing slash,
                    # its contents should be uploaded directly to the
                    # collection's root.
                    prefixdir = path
                else:
                    # When passing a directory reference with no trailing slash,
                    # upload the directory to the collection's root.
                    prefixdir = os.path.dirname(path)
                prefixdir += os.sep
                for root, dirs, files in os.walk(path,
                                                 followlinks=self.follow_links):
                    root_relpath = os.path.relpath(root, path)
                    if root_relpath == '.':
                        root_relpath = ''
                    # Exclude files/dirs by full path matching pattern
                    if self.exclude_paths:
                        dirs[:] = [d for d in dirs
                                   if not any(pathname_match(
                                       os.path.join(root_relpath, d), pat)
                                              for pat in self.exclude_paths)]
                        files = [f for f in files
                                 if not any(pathname_match(
                                     os.path.join(root_relpath, f), pat)
                                            for pat in self.exclude_paths)]
                    # Exclude files/dirs by name matching pattern
                    if self.exclude_names is not None:
                        dirs[:] = [d for d in dirs
                                   if not self.exclude_names.match(d)]
                        files = [f for f in files
                                 if not self.exclude_names.match(f)]
                    # Make os.walk()'s directory traversal order deterministic
                    dirs.sort()
                    files.sort()
                    for f in files:
                        filepath = os.path.join(root, f)
                        # Add its size to the total bytes count (if applicable)
                        if self.follow_links or (not os.path.islink(filepath)):
                            if self.bytes_expected is not None:
                                self.bytes_expected += os.path.getsize(filepath)
                            self._check_file(filepath,
                                             os.path.join(root[len(prefixdir):], f))
            else:
                filepath = os.path.abspath(path)
                # Add its size to the total bytes count (if applicable)
                if self.follow_links or (not os.path.islink(filepath)):
                    if self.bytes_expected is not None:
                        self.bytes_expected += os.path.getsize(filepath)
                    self._check_file(filepath,
                                     self.filename or os.path.basename(path))
        # If dry-run mode is on and we made it this far, there are no files
        # pending upload.
        if self.dry_run:
            raise ArvPutUploadNotPending()
        # Remove local_collection's files that don't exist locally anymore, so the
        # bytes_written count is correct.
        for f in self.collection_file_paths(self._local_collection,
                                            path_prefix=""):
            if f != 'stdin' and f != self.filename and f not in self._file_paths:
                self._local_collection.remove(f)

    def start(self, save_collection):
        """
        Start supporting thread & file uploading
        """
        self._checkpointer.start()
        try:
            # Update bytes_written from current local collection and
            # report initial progress.
            self._update()
            # Actual file upload
            self._upload_started = True  # Used by the update thread to start checkpointing
            self._upload_files()
        except (SystemExit, Exception) as e:
            self._checkpoint_before_quit = False
            # Log stack trace only when Ctrl-C isn't pressed (SIGINT)
            # Note: we're expecting SystemExit instead of
            # KeyboardInterrupt because we have a custom signal
            # handler in place that raises SystemExit with the caught
            # signal's exit code.
            if isinstance(e, PathDoesNotExistError):
                # We aren't interested in the traceback for this case
                pass
            elif not isinstance(e, SystemExit) or e.code != -2:
                self.logger.warning("Abnormal termination:\n{}".format(
                    traceback.format_exc()))
            raise
        finally:
            if not self.dry_run:
                # Stop the thread before doing anything else
                self._stop_checkpointer.set()
                self._checkpointer.join()
                if self._checkpoint_before_quit:
                    # Commit all pending blocks & do one last _update()
                    self._local_collection.manifest_text()
                    self._update(final=True)
                    if save_collection:
                        self.save_collection()
            if self.use_cache:
                self._cache_file.close()

    def _collection_trash_at(self):
        """
        Returns the trash date that the collection should use at save time.
        Takes into account absolute/relative trash_at values requested
        by the user.
        """
        if type(self._trash_at) == datetime.timedelta:
            # Get an absolute datetime for trash_at
            return datetime.datetime.utcnow() + self._trash_at
        return self._trash_at

    def save_collection(self):
        if self.update:
            # Check if files should be updated on the remote collection.
            for fp in self._file_paths:
                remote_file = self._remote_collection.find(fp)
                if not remote_file:
                    # File doesn't exist on remote collection, copy it.
                    self._remote_collection.copy(fp, fp, self._local_collection)
                elif remote_file != self._local_collection.find(fp):
                    # A different file exists on remote collection, overwrite it.
                    self._remote_collection.copy(fp, fp, self._local_collection, overwrite=True)
                else:
                    # The file already exists on remote collection, skip it.
                    pass
            self._remote_collection.save(num_retries=self.num_retries,
                                         trash_at=self._collection_trash_at())
        else:
            if len(self._local_collection) == 0:
                self.logger.warning("No files were uploaded, skipping collection creation.")
                return
            self._local_collection.save_new(
                name=self.name, owner_uuid=self.owner_uuid,
                ensure_unique_name=self.ensure_unique_name,
                num_retries=self.num_retries,
                trash_at=self._collection_trash_at())

    def destroy_cache(self):
        if self.use_cache:
            try:
                os.unlink(self._cache_filename)
            except OSError as error:
                # That's what we wanted anyway.
                if error.errno != errno.ENOENT:
                    raise
            self._cache_file.close()

    def _collection_size(self, collection):
        """
        Recursively get the total size of the collection
        """
        size = 0
        for item in listvalues(collection):
            if isinstance(item, arvados.collection.Collection) or isinstance(item, arvados.collection.Subcollection):
                size += self._collection_size(item)
            else:
                size += item.size()
        return size
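
    # The checkpointer thread runs _update_task: it wakes every second until
    # the upload actually starts, then every self._update_task_time seconds
    # (60 by default) until _stop_checkpointer is set.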
    def _update_task(self):
        """
        Periodically called support task. File uploading is
        asynchronous so we poll status from the collection.
        """
        while not self._stop_checkpointer.wait(1 if not self._upload_started else self._update_task_time):
            self._update()

    def _update(self, final=False):
        """
        Update cached manifest text and report progress.
        """
        if self._upload_started:
            with self._collection_lock:
                self.bytes_written = self._collection_size(self._local_collection)
                if self.use_cache:
                    if final:
                        manifest = self._local_collection.manifest_text()
                    else:
                        # Get the manifest text without committing pending blocks
                        manifest = self._local_collection.manifest_text(strip=False,
                                                                        normalize=False,
                                                                        only_committed=True)
                    # Update cache
                    with self._state_lock:
                        self._state['manifest'] = manifest
            if self.use_cache:
                try:
                    self._save_state()
                except Exception as e:
                    self.logger.error("Unexpected error trying to save cache file: {}".format(e))
            # Keep remote collection's trash_at attribute synced when using relative expire dates
            if self._remote_collection is not None and type(self._trash_at) == datetime.timedelta:
                try:
                    self._api_client.collections().update(
                        uuid=self._remote_collection.manifest_locator(),
                        body={'trash_at': self._collection_trash_at().strftime("%Y-%m-%dT%H:%M:%S.%fZ")}
                    ).execute(num_retries=self.num_retries)
                except Exception as e:
                    self.logger.error("Unexpected error trying to update remote collection's expire date: {}".format(e))
        else:
            self.bytes_written = self.bytes_skipped
        # Call the reporter, if any
        self.report_progress()

    def report_progress(self):
        if self.reporter is not None:
            self.reporter(self.bytes_written, self.bytes_expected)

    def _write_stdin(self, filename):
        output = self._local_collection.open(filename, 'wb')
        self._write(sys.stdin.buffer, output)
        output.close()

    def _check_file(self, source, filename):
        """
        Check if this file needs to be uploaded
        """
        # Ignore symlinks when requested
        if (not self.follow_links) and os.path.islink(source):
            return
        resume_offset = 0
        should_upload = False
        new_file_in_cache = False
        # Record file path for updating the remote collection before exiting
        self._file_paths.add(filename)

        with self._state_lock:
            # If no previous cached data on this file, store it for an eventual
            # resume.
            if source not in self._state['files']:
                self._state['files'][source] = {
                    'mtime': os.path.getmtime(source),
                    'size' : os.path.getsize(source)
                }
                new_file_in_cache = True
            cached_file_data = self._state['files'][source]

        # Check if file was already uploaded (at least partially)
        file_in_local_collection = self._local_collection.find(filename)

        # If not resuming, upload the full file.
        if not self.resume:
            should_upload = True
        # New file detected since last run, upload it.
        elif new_file_in_cache:
            should_upload = True
        # Local file didn't change since last run.
        elif cached_file_data['mtime'] == os.path.getmtime(source) and cached_file_data['size'] == os.path.getsize(source):
            if not file_in_local_collection:
                # File not uploaded yet, upload it completely
                should_upload = True
            elif file_in_local_collection.permission_expired():
                # Permission token expired, re-upload file. This will change whenever
                # we have an API for refreshing tokens.
                self.logger.warning(u"Uploaded file '{}' access token expired, will re-upload it from scratch".format(filename))
                should_upload = True
                self._local_collection.remove(filename)
            elif cached_file_data['size'] == file_in_local_collection.size():
                # File already there, skip it.
                self.bytes_skipped += cached_file_data['size']
            elif cached_file_data['size'] > file_in_local_collection.size():
                # File partially uploaded, resume!
                resume_offset = file_in_local_collection.size()
                self.bytes_skipped += resume_offset
                should_upload = True
            else:
                # Inconsistent cache, re-upload the file
                should_upload = True
                self._local_collection.remove(filename)
                self.logger.warning(u"Uploaded version of file '{}' is bigger than local version, will re-upload it from scratch.".format(source))
        # Local file differs from cached data, re-upload it.
        else:
            if file_in_local_collection:
                self._local_collection.remove(filename)
            should_upload = True

        if should_upload:
            try:
                self._files_to_upload.append((source, resume_offset, filename))
            except ArvPutUploadIsPending:
                # This could happen when running in dry-run mode; close the
                # cache file to avoid locking issues.
                self._cache_file.close()
                raise

    def _upload_files(self):
        for source, resume_offset, filename in self._files_to_upload:
            with open(source, 'rb') as source_fd:
                with self._state_lock:
                    self._state['files'][source]['mtime'] = os.path.getmtime(source)
                    self._state['files'][source]['size'] = os.path.getsize(source)
                if resume_offset > 0:
                    # Start upload where we left off
                    output = self._local_collection.open(filename, 'ab')
                    source_fd.seek(resume_offset)
                else:
                    # Start from scratch
                    output = self._local_collection.open(filename, 'wb')
                self._write(source_fd, output)
                output.close(flush=False)
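
    # Copy data in KEEP_BLOCK_SIZE chunks so writes line up with Keep block
    # boundaries.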
    def _write(self, source_fd, output):
        while True:
            data = source_fd.read(arvados.config.KEEP_BLOCK_SIZE)
            if not data:
                break
            output.write(data)

    def _my_collection(self):
        return self._remote_collection if self.update else self._local_collection

    def _get_cache_filepath(self):
        # Set up cache file name from input paths.
        md5 = hashlib.md5()
        md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost').encode())
        realpaths = sorted(os.path.realpath(path) for path in self.paths)
        md5.update(b'\0'.join([p.encode() for p in realpaths]))
        if self.filename:
            md5.update(self.filename.encode())
        cache_filename = md5.hexdigest()
        cache_filepath = os.path.join(
            arv_cmd.make_home_conf_dir(self.CACHE_DIR, 0o700, 'raise'),
            cache_filename)
        return cache_filepath

    def _setup_state(self, update_collection):
        """
        Create a new cache file or load a previously existing one.
        """
        # Load an already existing collection for update
        if update_collection and re.match(arvados.util.collection_uuid_pattern,
                                          update_collection):
            try:
                self._remote_collection = arvados.collection.Collection(
                    update_collection,
                    api_client=self._api_client,
                    storage_classes_desired=self.storage_classes,
                    num_retries=self.num_retries)
            except arvados.errors.ApiError as error:
                raise CollectionUpdateError("Cannot read collection {} ({})".format(update_collection, error))
            else:
                self.update = True
        elif update_collection:
            # Collection locator provided, but unknown format
            raise CollectionUpdateError("Collection locator unknown: '{}'".format(update_collection))

        if self.use_cache:
            cache_filepath = self._get_cache_filepath()
            if self.resume and os.path.exists(cache_filepath):
                self.logger.info(u"Resuming upload from cache file {}".format(cache_filepath))
                self._cache_file = open(cache_filepath, 'a+')
            else:
                # --no-resume means start with an empty cache file.
                self.logger.info(u"Creating new cache file at {}".format(cache_filepath))
                self._cache_file = open(cache_filepath, 'w+')
            self._cache_filename = self._cache_file.name
            self._lock_file(self._cache_file)
            self._cache_file.seek(0)

        with self._state_lock:
            if self.use_cache:
                try:
                    self._state = json.load(self._cache_file)
                    if not set(['manifest', 'files']).issubset(set(self._state.keys())):
                        # Cache at least partially incomplete, set up new cache
                        self._state = copy.deepcopy(self.EMPTY_STATE)
                except ValueError:
                    # Cache file empty, set up new cache
                    self._state = copy.deepcopy(self.EMPTY_STATE)
            else:
                self.logger.info("No cache usage requested for this run.")
                # No cache file, set empty state
                self._state = copy.deepcopy(self.EMPTY_STATE)

        if not self._cached_manifest_valid():
            if not self.batch_mode:
                raise ResumeCacheInvalidError()
            else:
                self.logger.info("Invalid signatures on cache file '{}' while being run in 'batch mode' -- continuing anyway.".format(self._cache_file.name))
                self.use_cache = False  # Don't overwrite preexisting cache file.
                self._state = copy.deepcopy(self.EMPTY_STATE)
        # Load the previous manifest so we can check if files were modified remotely.
        self._local_collection = arvados.collection.Collection(
            self._state['manifest'],
            replication_desired=self.replication_desired,
            storage_classes_desired=self.storage_classes,
            put_threads=self.put_threads,
            api_client=self._api_client,
            num_retries=self.num_retries)

    def _cached_manifest_valid(self):
        """
        Validate the oldest non-expired block signature to check if cached manifest
        is usable: checking if the cached manifest was not created with a different
        Arvados account.
        """
        if self._state.get('manifest', None) is None:
            # No cached manifest yet, all good.
            return True
        now = datetime.datetime.utcnow()
        oldest_exp = None
        oldest_loc = None
        block_found = False
        for m in keep_locator_pattern.finditer(self._state['manifest']):
            loc = m.group(0)
            try:
                exp = datetime.datetime.utcfromtimestamp(int(loc.split('@')[1], 16))
            except IndexError:
                # Locator without signature
                continue
            block_found = True
            if exp > now and (oldest_exp is None or exp < oldest_exp):
                oldest_exp = exp
                oldest_loc = loc
        if not block_found:
            # No block signatures found => no invalid block signatures.
            return True
        if oldest_loc is None:
            # Locator signatures found, but all have expired.
            # Reset the cache and move on.
            self.logger.info('Cache expired, starting from scratch.')
            self._state['manifest'] = ''
            return True
        kc = arvados.KeepClient(api_client=self._api_client,
                                num_retries=self.num_retries)
        try:
            kc.head(oldest_loc)
        except arvados.errors.KeepRequestError:
            # Something is wrong, cached manifest is not valid.
            return False
        return True

    def collection_file_paths(self, col, path_prefix='.'):
        """Return a list of file paths by recursively going through the entire collection `col`."""
        file_paths = []
        for name, item in listitems(col):
            if isinstance(item, arvados.arvfile.ArvadosFile):
                file_paths.append(os.path.join(path_prefix, name))
            elif isinstance(item, arvados.collection.Subcollection):
                new_prefix = os.path.join(path_prefix, name)
                file_paths += self.collection_file_paths(item, path_prefix=new_prefix)
        return file_paths

    def _lock_file(self, fileobj):
        try:
            fcntl.flock(fileobj, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            raise ResumeCacheConflict(u"{} locked".format(fileobj.name))

    def _save_state(self):
        """
        Atomically save current state into cache.
        """
        with self._state_lock:
            # We're not using copy.deepcopy() here because it's a lot slower
            # than json.dumps(), and we already need the state in JSON format
            # to be saved on disk.
            state = json.dumps(self._state)
        try:
            new_cache = tempfile.NamedTemporaryFile(
                mode='w+',
                dir=os.path.dirname(self._cache_filename), delete=False)
            self._lock_file(new_cache)
            new_cache.write(state)
            new_cache.flush()
            os.fsync(new_cache)
            os.rename(new_cache.name, self._cache_filename)
        except (IOError, OSError, ResumeCacheConflict) as error:
            self.logger.error("There was a problem while saving the cache file: {}".format(error))
            try:
                os.unlink(new_cache.name)
            except NameError:  # NamedTemporaryFile creation failed.
                pass
        else:
            self._cache_file.close()
            self._cache_file = new_cache

    def collection_name(self):
        return self._my_collection().api_response()['name'] if self._my_collection().api_response() else None

    def collection_trash_at(self):
        return self._my_collection().get_trash_at()

    def manifest_locator(self):
        return self._my_collection().manifest_locator()

    def portable_data_hash(self):
        pdh = self._my_collection().portable_data_hash()
        m = self._my_collection().stripped_manifest().encode()
        local_pdh = '{}+{}'.format(hashlib.md5(m).hexdigest(), len(m))
        if pdh != local_pdh:
            self.logger.warning("\n".join([
                "arv-put: API server provided PDH differs from local manifest.",
                "         This should not happen; showing API server version."]))
        return pdh

    def manifest_text(self, stream_name=".", strip=False, normalize=False):
        return self._my_collection().manifest_text(stream_name, strip, normalize)

    def _datablocks_on_item(self, item):
        """
        Return a list of datablock locators, recursively navigating
        through subcollections
        """
        if isinstance(item, arvados.arvfile.ArvadosFile):
            if item.size() == 0:
                # Empty file locator
                return ["d41d8cd98f00b204e9800998ecf8427e+0"]
            else:
                locators = []
                for segment in item.segments():
                    loc = segment.locator
                    locators.append(loc)
                return locators
        elif isinstance(item, arvados.collection.Collection):
            l = [self._datablocks_on_item(x) for x in listvalues(item)]
            # Fast list flattener method taken from:
            # http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
            return [loc for sublist in l for loc in sublist]
        else:
            return None

    def data_locators(self):
        with self._collection_lock:
            # Make sure all datablocks are flushed before getting the locators
            self._my_collection().manifest_text()
            datablocks = self._datablocks_on_item(self._my_collection())
        return datablocks


_machine_format = "{} {}: {{}} written {{}} total\n".format(sys.argv[0],
                                                            os.getpid())

# Simulate glob.glob() matching behavior without the need to scan the filesystem
# Note: fnmatch() doesn't work correctly when used with pathnames. For example the
# pattern 'tests/*.py' will match 'tests/run_test.py' and also 'tests/subdir/run_test.py',
# so instead we're using it on every path component.
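# For illustration:
#   pathname_match('subdir/a.txt', 'subdir/*.txt')       -> True
#   pathname_match('subdir/deep/a.txt', 'subdir/*.txt')  -> False (extra component)
#   pathname_match('a.txt', '*.txt')                     -> True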
def pathname_match(pathname, pattern):
    name = pathname.split(os.sep)
    # Fix patterns like 'some/subdir/' or 'some//subdir'
    pat = [x for x in pattern.split(os.sep) if x != '' and x != '.']
    if len(name) != len(pat):
        return False
    for i in range(len(name)):
        if not fnmatch.fnmatch(name[i], pat[i]):
            return False
    return True

def machine_progress(bytes_written, bytes_expected):
    return _machine_format.format(
        bytes_written, -1 if (bytes_expected is None) else bytes_expected)

def human_progress(bytes_written, bytes_expected):
    if bytes_expected:
        return "\r{}M / {}M {:.1%} ".format(
            bytes_written >> 20, bytes_expected >> 20,
            float(bytes_written) / bytes_expected)
    else:
        return "\r{} ".format(bytes_written)

def progress_writer(progress_func, outfile=sys.stderr):
    def write_progress(bytes_written, bytes_expected):
        outfile.write(progress_func(bytes_written, bytes_expected))
    return write_progress

def desired_project_uuid(api_client, project_uuid, num_retries):
    if not project_uuid:
        query = api_client.users().current()
    elif arvados.util.user_uuid_pattern.match(project_uuid):
        query = api_client.users().get(uuid=project_uuid)
    elif arvados.util.group_uuid_pattern.match(project_uuid):
        query = api_client.groups().get(uuid=project_uuid)
    else:
        raise ValueError("Not a valid project UUID: {}".format(project_uuid))
    return query.execute(num_retries=num_retries)['uuid']

# Module-level API client; main() fills it in on first use so callers/tests
# can inject their own.
api_client = None

def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr,
         install_sig_handlers=True):
    global api_client

    args = parse_arguments(arguments)
    logger = logging.getLogger('arvados.arv_put')
    if args.silent:
        logger.setLevel(logging.WARNING)
    else:
        logger.setLevel(logging.INFO)
    status = 0

    request_id = arvados.util.new_request_id()

    formatter = ArvPutLogFormatter(request_id)
    logging.getLogger('arvados').handlers[0].setFormatter(formatter)

    if api_client is None:
        api_client = arvados.api('v1', request_id=request_id)

    if install_sig_handlers:
        arv_cmd.install_signal_handlers()

    # Trash arguments validation
    trash_at = None
    if args.trash_at is not None:
        # ciso8601 considers YYYYMM as invalid but YYYY-MM as valid, so here we
        # make sure the user provides a complete YYYY-MM-DD date.
        if not re.match(r'^\d{4}(?P<dash>-?)\d{2}(?P=dash)\d{2}', args.trash_at):
            logger.error("--trash-at argument format invalid, use --help to see examples.")
            sys.exit(1)
        # Check if no time information was provided. In that case, assume end-of-day.
        if re.match(r'^\d{4}(?P<dash>-?)\d{2}(?P=dash)\d{2}$', args.trash_at):
            args.trash_at += 'T23:59:59'
        try:
            trash_at = ciso8601.parse_datetime(args.trash_at)
        except ValueError:
            logger.error("--trash-at argument format invalid, use --help to see examples.")
            sys.exit(1)

        if trash_at.tzinfo is not None:
            # Timezone-aware datetime provided.
            utcoffset = -trash_at.utcoffset()
        else:
            # Timezone-naive datetime provided. Assume it's local.
            if time.daylight:
                utcoffset = datetime.timedelta(seconds=time.altzone)
            else:
                utcoffset = datetime.timedelta(seconds=time.timezone)
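            # time.altzone is the local UTC offset while DST is in effect,
            # time.timezone otherwise; both are expressed in seconds west of
            # UTC, which is the sign needed to convert local time to UTC.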
        # Convert to UTC timezone naive datetime.
        trash_at = trash_at.replace(tzinfo=None) + utcoffset

        if trash_at <= datetime.datetime.utcnow():
            logger.error("--trash-at argument must be set in the future")
            sys.exit(1)

    if args.trash_after is not None:
        if args.trash_after < 1:
            logger.error("--trash-after argument must be >= 1")
            sys.exit(1)
        trash_at = datetime.timedelta(seconds=(args.trash_after * 24 * 60 * 60))

    # Determine the name to use
    if args.name:
        if args.stream or args.raw:
            logger.error("Cannot use --name with --stream or --raw")
            sys.exit(1)
        elif args.update_collection:
            logger.error("Cannot use --name with --update-collection")
            sys.exit(1)
        collection_name = args.name
    else:
        collection_name = "Saved at {} by {}@{}".format(
            datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC"),
            pwd.getpwuid(os.getuid()).pw_name,
            socket.gethostname())

    if args.project_uuid and (args.stream or args.raw):
        logger.error("Cannot use --project-uuid with --stream or --raw")
        sys.exit(1)

    # Determine the parent project
    try:
        project_uuid = desired_project_uuid(api_client, args.project_uuid,
                                            args.retries)
    except (apiclient_errors.Error, ValueError) as error:
        logger.error(error)
        sys.exit(1)

    if args.progress:
        reporter = progress_writer(human_progress)
    elif args.batch_progress:
        reporter = progress_writer(machine_progress)
    else:
        reporter = None

    # Split storage-classes argument
    storage_classes = None
    if args.storage_classes:
        storage_classes = args.storage_classes.strip().replace(' ', '').split(',')
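        # e.g. --storage-classes 'hot, cold' becomes ['hot', 'cold']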

    # Setup exclude regex from all the --exclude arguments provided
    name_patterns = []
    exclude_paths = []
    exclude_names = None
    if len(args.exclude) > 0:
        # We're supporting 2 kinds of exclusion patterns:
        # 1)   --exclude '*.jpg'    (file/dir name patterns, will only match
        #                            the name, wherever the file is on the tree)
        # 2.1) --exclude 'foo/bar'  (file/dir path patterns, will match the
        #                            entire path, and should be relative to
        #                            any input dir argument)
        # 2.2) --exclude './*.jpg'  (special case for excluding files/dirs
        #                            placed directly underneath the input dir)
        for p in args.exclude:
            # Only relative path patterns allowed
            if p.startswith(os.sep):
                logger.error("Cannot use absolute paths with --exclude")
                sys.exit(1)
            if os.path.dirname(p):
                # We don't support path patterns with '..'
                p_parts = p.split(os.sep)
                if '..' in p_parts:
                    logger.error(
                        "Cannot use path patterns that include '..'")
                    sys.exit(1)
                # Path search pattern
                exclude_paths.append(p)
            else:
                # Name-only search pattern
                name_patterns.append(p)
        # For name only matching, we can combine all patterns into a single
        # regexp, for better performance.
        exclude_names = re.compile('|'.join(
            [fnmatch.translate(p) for p in name_patterns]
        )) if len(name_patterns) > 0 else None
        # Show the user the patterns to be used, just in case they weren't
        # specified inside quotes and got changed by the shell expansion.
        logger.info("Exclude patterns: {}".format(args.exclude))

    # If this is used by a human, and there's at least one directory to be
    # uploaded, the expected bytes calculation can take a moment.
    if args.progress and any([os.path.isdir(f) for f in args.paths]):
        logger.info("Calculating upload size, this could take some time...")

    try:
        writer = ArvPutUploadJob(paths=args.paths,
                                 resume=args.resume,
                                 use_cache=args.use_cache,
                                 batch_mode=args.batch,
                                 filename=args.filename,
                                 reporter=reporter,
                                 api_client=api_client,
                                 num_retries=args.retries,
                                 replication_desired=args.replication,
                                 put_threads=args.threads,
                                 name=collection_name,
                                 owner_uuid=project_uuid,
                                 ensure_unique_name=True,
                                 update_collection=args.update_collection,
                                 storage_classes=storage_classes,
                                 logger=logger,
                                 dry_run=args.dry_run,
                                 follow_links=args.follow_links,
                                 exclude_paths=exclude_paths,
                                 exclude_names=exclude_names,
                                 trash_at=trash_at)
    except ResumeCacheConflict:
        logger.error("\n".join([
            "arv-put: Another process is already uploading this data.",
            "         Use --no-cache if this is really what you want."]))
        sys.exit(1)
    except ResumeCacheInvalidError:
        logger.error("\n".join([
            "arv-put: Resume cache contains invalid signature: it may have expired",
            "         or been created with another Arvados user's credentials.",
            "         Switch user or use one of the following options to restart upload:",
            "         --no-resume to start a new resume cache.",
            "         --no-cache to disable resume cache.",
            "         --batch to ignore the resume cache if invalid."]))
        sys.exit(1)
    except (CollectionUpdateError, PathDoesNotExistError) as error:
        logger.error("\n".join([
            "arv-put: %s" % str(error)]))
        sys.exit(1)
    except ArvPutUploadIsPending:
        # Dry run check successful, return proper exit code.
        sys.exit(2)
    except ArvPutUploadNotPending:
        # No files pending for upload
        sys.exit(0)

    if not args.dry_run and not args.update_collection and args.resume and writer.bytes_written > 0:
        logger.warning("\n".join([
            "arv-put: Resuming previous upload from last checkpoint.",
            "         Use the --no-resume option to start over."]))

    if not args.dry_run:
        writer.report_progress()
    output = None
    try:
        writer.start(save_collection=not(args.stream or args.raw))
    except (arvados.errors.ApiError, arvados.errors.KeepWriteError) as error:
        logger.error("\n".join([
            "arv-put: %s" % str(error)]))
        sys.exit(1)

    if args.progress:  # Print newline to split stderr from stdout for humans.
        logger.info("\n")

    if args.stream:
        if args.normalize:
            output = writer.manifest_text(normalize=True)
        else:
            output = writer.manifest_text()
    elif args.raw:
        output = ','.join(writer.data_locators())
    elif writer.manifest_locator() is not None:
        try:
            expiration_notice = ""
            if writer.collection_trash_at() is not None:
                # Get the local timezone-naive version, and log it with timezone information.
                if time.daylight:
                    local_trash_at = writer.collection_trash_at().replace(tzinfo=None) - datetime.timedelta(seconds=time.altzone)
                else:
                    local_trash_at = writer.collection_trash_at().replace(tzinfo=None) - datetime.timedelta(seconds=time.timezone)
                expiration_notice = ". It will expire on {} {}.".format(
                    local_trash_at.strftime("%Y-%m-%d %H:%M:%S"), time.strftime("%z"))
            if args.update_collection:
                logger.info(u"Collection updated: '{}'{}".format(
                    writer.collection_name(), expiration_notice))
            else:
                logger.info(u"Collection saved as '{}'{}".format(
                    writer.collection_name(), expiration_notice))
            if args.portable_data_hash:
                output = writer.portable_data_hash()
            else:
                output = writer.manifest_locator()
        except apiclient_errors.Error as error:
            logger.error(
                "arv-put: Error creating Collection on project: {}.".format(
                    error))
            status = 1

    # Print the locator (uuid) of the new collection.
    if output is None:
        status = status or 1
    elif not args.silent:
        stdout.write(output)
        if not output.endswith('\n'):
            stdout.write('\n')

    if install_sig_handlers:
        arv_cmd.restore_signal_handlers()

    if status != 0:
        sys.exit(status)

    # Success!
    return output


if __name__ == '__main__':
    main()