Merge branch '8784-dir-listings'

[arvados.git] / sdk / python / arvados / commands / put.py
diff --git a/sdk/python/arvados/commands/put.py b/sdk/python/arvados/commands/put.py

index 714281cc95b0475831f1761470c9cf1b5e91cce5..e5916560bfb8b1670e0dc995fed11d06807d44e6 100644 (file)
--- a/sdk/python/arvados/commands/put.py
+++ b/sdk/python/arvados/commands/put.py
@@ -1,8 +1,7 @@
-#!/usr/bin/env python
-
-# TODO:
-# --md5sum - display md5 of each file as read from disk
-
+from __future__ import division
+from future.utils import listitems, listvalues
+from builtins import str
+from builtins import object
  import argparse
  import arvados
  import arvados.collection
  import argparse
  import arvados
  import arvados.collection
@@ -23,6 +22,8 @@ import sys
  import tempfile
  import threading
  import time
  import tempfile
  import threading
  import time
+import traceback
+
  from apiclient import errors as apiclient_errors
  from arvados._version import __version__
  
  from apiclient import errors as apiclient_errors
  from arvados._version import __version__
  
@@ -38,7 +39,9 @@ upload_opts.add_argument('--version', action='version',
                           help='Print version and exit.')
  upload_opts.add_argument('paths', metavar='path', type=str, nargs='*',
                           help="""
                           help='Print version and exit.')
  upload_opts.add_argument('paths', metavar='path', type=str, nargs='*',
                           help="""
-Local file or directory. Default: read from standard input.
+Local file or directory. If path is a directory reference with a trailing
+slash, then just upload the directory's contents; otherwise upload the
+directory itself. Default: read from standard input.
  """)
  
  _group = upload_opts.add_mutually_exclusive_group()
  """)
  
  _group = upload_opts.add_mutually_exclusive_group()
@@ -133,6 +136,15 @@ physical storage devices (e.g., disks) should have a copy of each data
  block. Default is to use the server-provided default (if any) or 2.
  """)
  
  block. Default is to use the server-provided default (if any) or 2.
  """)
  
+upload_opts.add_argument('--threads', type=int, metavar='N', default=None,
+                         help="""
+Set the number of upload threads to be used. Take into account that
+using lots of threads will increase the RAM requirements. Default is
+to use 2 threads.
+On high latency installations, using a greater number will improve
+overall throughput.
+""")
+
  run_opts = argparse.ArgumentParser(add_help=False)
  
  run_opts.add_argument('--project-uuid', metavar='UUID', help="""
  run_opts = argparse.ArgumentParser(add_help=False)
  
  run_opts.add_argument('--project-uuid', metavar='UUID', help="""
@@ -174,6 +186,16 @@ _group.add_argument('--no-resume', action='store_false', dest='resume',
  Do not continue interrupted uploads from cached state.
  """)
  
  Do not continue interrupted uploads from cached state.
  """)
  
+_group = run_opts.add_mutually_exclusive_group()
+_group.add_argument('--follow-links', action='store_true', default=True,
+                    dest='follow_links', help="""
+Follow file and directory symlinks (default).
+""")
+_group.add_argument('--no-follow-links', action='store_false', dest='follow_links',
+                    help="""
+Do not follow file and directory symlinks.
+""")
+
  _group = run_opts.add_mutually_exclusive_group()
  _group.add_argument('--cache', action='store_true', dest='use_cache', default=True,
                      help="""
  _group = run_opts.add_mutually_exclusive_group()
  _group.add_argument('--cache', action='store_true', dest='use_cache', default=True,
                      help="""
@@ -194,7 +216,7 @@ def parse_arguments(arguments):
      if len(args.paths) == 0:
          args.paths = ['-']
  
      if len(args.paths) == 0:
          args.paths = ['-']
  
-    args.paths = map(lambda x: "-" if x == "/dev/stdin" else x, args.paths)
+    args.paths = ["-" if x == "/dev/stdin" else x for x in args.paths]
  
      if len(args.paths) != 1 or os.path.isdir(args.paths[0]):
          if args.filename:
  
      if len(args.paths) != 1 or os.path.isdir(args.paths[0]):
          if args.filename:
@@ -225,6 +247,10 @@ def parse_arguments(arguments):
      return args
  
  
      return args
  
  
+class PathDoesNotExistError(Exception):  # raised when a path given for upload does not exist
+    pass
+
+
  class CollectionUpdateError(Exception):
      pass
  
  class CollectionUpdateError(Exception):
      pass
  
@@ -267,13 +293,13 @@ class ResumeCache(object):
      @classmethod
      def make_path(cls, args):
          md5 = hashlib.md5()
      @classmethod
      def make_path(cls, args):
          md5 = hashlib.md5()
-        md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost'))
+        md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost').encode())
          realpaths = sorted(os.path.realpath(path) for path in args.paths)
          realpaths = sorted(os.path.realpath(path) for path in args.paths)
-        md5.update('\0'.join(realpaths))
+        md5.update(b'\0'.join([p.encode() for p in realpaths]))
          if any(os.path.isdir(path) for path in realpaths):
          if any(os.path.isdir(path) for path in realpaths):
-            md5.update("-1")
+            md5.update(b'-1')
          elif args.filename:
          elif args.filename:
-            md5.update(args.filename)
+            md5.update(args.filename.encode())
          return os.path.join(
              arv_cmd.make_home_conf_dir(cls.CACHE_DIR, 0o700, 'raise'),
              md5.hexdigest())
          return os.path.join(
              arv_cmd.make_home_conf_dir(cls.CACHE_DIR, 0o700, 'raise'),
              md5.hexdigest())
@@ -347,9 +373,11 @@ class ArvPutUploadJob(object):
  
      def __init__(self, paths, resume=True, use_cache=True, reporter=None,
                   bytes_expected=None, name=None, owner_uuid=None,
  
      def __init__(self, paths, resume=True, use_cache=True, reporter=None,
                   bytes_expected=None, name=None, owner_uuid=None,
-                 ensure_unique_name=False, num_retries=None, replication_desired=None,
-                 filename=None, update_time=20.0, update_collection=None,
-                 logger=logging.getLogger('arvados.arv_put'), dry_run=False):
+                 ensure_unique_name=False, num_retries=None,
+                 put_threads=None, replication_desired=None,
+                 filename=None, update_time=60.0, update_collection=None,
+                 logger=logging.getLogger('arvados.arv_put'), dry_run=False,
+                 follow_links=True):
          self.paths = paths
          self.resume = resume
          self.use_cache = use_cache
          self.paths = paths
          self.resume = resume
          self.use_cache = use_cache
@@ -363,6 +391,7 @@ class ArvPutUploadJob(object):
          self.ensure_unique_name = ensure_unique_name
          self.num_retries = num_retries
          self.replication_desired = replication_desired
          self.ensure_unique_name = ensure_unique_name
          self.num_retries = num_retries
          self.replication_desired = replication_desired
+        self.put_threads = put_threads
          self.filename = filename
          self._state_lock = threading.Lock()
          self._state = None # Previous run state (file list & manifest)
          self.filename = filename
          self._state_lock = threading.Lock()
          self._state = None # Previous run state (file list & manifest)
@@ -371,14 +400,17 @@ class ArvPutUploadJob(object):
          self._collection_lock = threading.Lock()
          self._remote_collection = None # Collection being updated (if asked)
          self._local_collection = None # Collection from previous run manifest
          self._collection_lock = threading.Lock()
          self._remote_collection = None # Collection being updated (if asked)
          self._local_collection = None # Collection from previous run manifest
-        self._file_paths = [] # Files to be updated in remote collection
+        self._file_paths = set() # Files to be updated in remote collection
          self._stop_checkpointer = threading.Event()
          self._checkpointer = threading.Thread(target=self._update_task)
          self._checkpointer.daemon = True
          self._update_task_time = update_time  # How many seconds wait between update runs
          self._files_to_upload = FileUploadList(dry_run=dry_run)
          self._stop_checkpointer = threading.Event()
          self._checkpointer = threading.Thread(target=self._update_task)
          self._checkpointer.daemon = True
          self._update_task_time = update_time  # How many seconds wait between update runs
          self._files_to_upload = FileUploadList(dry_run=dry_run)
+        self._upload_started = False
          self.logger = logger
          self.dry_run = dry_run
          self.logger = logger
          self.dry_run = dry_run
+        self._checkpoint_before_quit = True
+        self.follow_links = follow_links
  
          if not self.use_cache and self.resume:
              raise ArvPutArgumentConflict('resume cannot be True when use_cache is False')
  
          if not self.use_cache and self.resume:
              raise ArvPutArgumentConflict('resume cannot be True when use_cache is False')
@@ -403,13 +435,23 @@ class ArvPutUploadJob(object):
                      if self.dry_run:
                          raise ArvPutUploadIsPending()
                      self._write_stdin(self.filename or 'stdin')
                      if self.dry_run:
                          raise ArvPutUploadIsPending()
                      self._write_stdin(self.filename or 'stdin')
+                elif not os.path.exists(path):
+                     raise PathDoesNotExistError("file or directory '{}' does not exist.".format(path))
                  elif os.path.isdir(path):
                      # Use absolute paths on cache index so CWD doesn't interfere
                      # with the caching logic.
                  elif os.path.isdir(path):
                      # Use absolute paths on cache index so CWD doesn't interfere
                      # with the caching logic.
-                    prefixdir = path = os.path.abspath(path)
-                    if prefixdir != '/':
-                        prefixdir += '/'
-                    for root, dirs, files in os.walk(path):
+                    orig_path = path
+                    path = os.path.abspath(path)
+                    if orig_path[-1:] == os.sep:
+                        # When passing a directory reference with a trailing slash,
+                        # its contents should be uploaded directly to the collection's root.
+                        prefixdir = path
+                    else:
+                        # When passing a directory reference with no trailing slash,
+                        # upload the directory to the collection's root.
+                        prefixdir = os.path.dirname(path)
+                    prefixdir += os.sep
+                    for root, dirs, files in os.walk(path, followlinks=self.follow_links):
                          # Make os.walk()'s dir traversing order deterministic
                          dirs.sort()
                          files.sort()
                          # Make os.walk()'s dir traversing order deterministic
                          dirs.sort()
                          files.sort()
@@ -433,17 +475,33 @@ class ArvPutUploadJob(object):
              # report initial progress.
              self._update()
              # Actual file upload
              # report initial progress.
              self._update()
              # Actual file upload
+            self._upload_started = True # Used by the update thread to start checkpointing
              self._upload_files()
              self._upload_files()
+        except (SystemExit, Exception) as e:
+            self._checkpoint_before_quit = False
+            # Log stack trace only when Ctrl-C isn't pressed (SIGINT)
+            # Note: We're expecting SystemExit instead of
+            # KeyboardInterrupt because we have a custom signal
+            # handler in place that raises SystemExit with the caught
+            # signal's code.
+            if isinstance(e, PathDoesNotExistError):
+                # We aren't interested in the traceback for this case
+                pass
+            elif not isinstance(e, SystemExit) or e.code != -2:
+                self.logger.warning("Abnormal termination:\n{}".format(
+                    traceback.format_exc()))
+            raise
          finally:
              if not self.dry_run:
                  # Stop the thread before doing anything else
                  self._stop_checkpointer.set()
                  self._checkpointer.join()
          finally:
              if not self.dry_run:
                  # Stop the thread before doing anything else
                  self._stop_checkpointer.set()
                  self._checkpointer.join()
-                # Commit all pending blocks & one last _update()
-                self._local_collection.manifest_text()
-                self._update(final=True)
-                if save_collection:
-                    self.save_collection()
+                if self._checkpoint_before_quit:
+                    # Commit all pending blocks & one last _update()
+                    self._local_collection.manifest_text()
+                    self._update(final=True)
+                    if save_collection:
+                        self.save_collection()
              if self.use_cache:
                  self._cache_file.close()
  
              if self.use_cache:
                  self._cache_file.close()
  
@@ -483,7 +541,7 @@ class ArvPutUploadJob(object):
          Recursively get the total size of the collection
          """
          size = 0
          Recursively get the total size of the collection
          """
          size = 0
-        for item in collection.values():
+        for item in listvalues(collection):
              if isinstance(item, arvados.collection.Collection) or isinstance(item, arvados.collection.Subcollection):
                  size += self._collection_size(item)
              else:
              if isinstance(item, arvados.collection.Collection) or isinstance(item, arvados.collection.Subcollection):
                  size += self._collection_size(item)
              else:
@@ -495,24 +553,34 @@ class ArvPutUploadJob(object):
          Periodically called support task. File uploading is
          asynchronous so we poll status from the collection.
          """
          Periodically called support task. File uploading is
          asynchronous so we poll status from the collection.
          """
-        while not self._stop_checkpointer.wait(self._update_task_time):
+        while not self._stop_checkpointer.wait(1 if not self._upload_started else self._update_task_time):
              self._update()
  
      def _update(self, final=False):
          """
          Update cached manifest text and report progress.
          """
              self._update()
  
      def _update(self, final=False):
          """
          Update cached manifest text and report progress.
          """
-        with self._collection_lock:
-            self.bytes_written = self._collection_size(self._local_collection)
-            if self.use_cache:
-                # Update cache
-                with self._state_lock:
+        if self._upload_started:
+            with self._collection_lock:
+                self.bytes_written = self._collection_size(self._local_collection)
+                if self.use_cache:
                      if final:
                      if final:
-                        self._state['manifest'] = self._local_collection.manifest_text()
+                        manifest = self._local_collection.manifest_text()
                      else:
                          # Get the manifest text without comitting pending blocks
                      else:
                          # Get the manifest text without comitting pending blocks
-                        self._state['manifest'] = self._local_collection._get_manifest_text(".", strip=False, normalize=False, only_committed=True)
-                self._save_state()
+                        manifest = self._local_collection.manifest_text(strip=False,
+                                                                        normalize=False,
+                                                                        only_committed=True)
+                    # Update cache
+                    with self._state_lock:
+                        self._state['manifest'] = manifest
+            if self.use_cache:
+                try:
+                    self._save_state()
+                except Exception as e:
+                    self.logger.error("Unexpected error trying to save cache file: {}".format(e))
+        else:
+            self.bytes_written = self.bytes_skipped
          # Call the reporter, if any
          self.report_progress()
  
          # Call the reporter, if any
          self.report_progress()
  
@@ -521,17 +589,22 @@ class ArvPutUploadJob(object):
              self.reporter(self.bytes_written, self.bytes_expected)
  
      def _write_stdin(self, filename):
              self.reporter(self.bytes_written, self.bytes_expected)
  
      def _write_stdin(self, filename):
-        output = self._local_collection.open(filename, 'w')
+        output = self._local_collection.open(filename, 'wb')
          self._write(sys.stdin, output)
          output.close()
  
      def _check_file(self, source, filename):
          self._write(sys.stdin, output)
          output.close()
  
      def _check_file(self, source, filename):
-        """Check if this file needs to be uploaded"""
+        """
+        Check if this file needs to be uploaded
+        """
+        # Ignore symlinks when requested
+        if (not self.follow_links) and os.path.islink(source):
+            return
          resume_offset = 0
          should_upload = False
          new_file_in_cache = False
          # Record file path for updating the remote collection before exiting
          resume_offset = 0
          should_upload = False
          new_file_in_cache = False
          # Record file path for updating the remote collection before exiting
-        self._file_paths.append(filename)
+        self._file_paths.add(filename)
  
          with self._state_lock:
              # If no previous cached data on this file, store it for an eventual
  
          with self._state_lock:
              # If no previous cached data on this file, store it for an eventual
@@ -587,17 +660,17 @@ class ArvPutUploadJob(object):
  
      def _upload_files(self):
          for source, resume_offset, filename in self._files_to_upload:
  
      def _upload_files(self):
          for source, resume_offset, filename in self._files_to_upload:
-            with open(source, 'r') as source_fd:
+            with open(source, 'rb') as source_fd:
                  with self._state_lock:
                      self._state['files'][source]['mtime'] = os.path.getmtime(source)
                      self._state['files'][source]['size'] = os.path.getsize(source)
                  if resume_offset > 0:
                      # Start upload where we left off
                  with self._state_lock:
                      self._state['files'][source]['mtime'] = os.path.getmtime(source)
                      self._state['files'][source]['size'] = os.path.getsize(source)
                  if resume_offset > 0:
                      # Start upload where we left off
-                    output = self._local_collection.open(filename, 'a')
+                    output = self._local_collection.open(filename, 'ab')
                      source_fd.seek(resume_offset)
                  else:
                      # Start from scratch
                      source_fd.seek(resume_offset)
                  else:
                      # Start from scratch
-                    output = self._local_collection.open(filename, 'w')
+                    output = self._local_collection.open(filename, 'wb')
                  self._write(source_fd, output)
                  output.close(flush=False)
  
                  self._write(source_fd, output)
                  output.close(flush=False)
  
@@ -631,19 +704,21 @@ class ArvPutUploadJob(object):
          if self.use_cache:
              # Set up cache file name from input paths.
              md5 = hashlib.md5()
          if self.use_cache:
              # Set up cache file name from input paths.
              md5 = hashlib.md5()
-            md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost'))
+            md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost').encode())
              realpaths = sorted(os.path.realpath(path) for path in self.paths)
              realpaths = sorted(os.path.realpath(path) for path in self.paths)
-            md5.update('\0'.join(realpaths))
+            md5.update(b'\0'.join([p.encode() for p in realpaths]))
              if self.filename:
              if self.filename:
-                md5.update(self.filename)
+                md5.update(self.filename.encode())
              cache_filename = md5.hexdigest()
              cache_filepath = os.path.join(
                  arv_cmd.make_home_conf_dir(self.CACHE_DIR, 0o700, 'raise'),
                  cache_filename)
              cache_filename = md5.hexdigest()
              cache_filepath = os.path.join(
                  arv_cmd.make_home_conf_dir(self.CACHE_DIR, 0o700, 'raise'),
                  cache_filename)
-            if self.resume:
+            if self.resume and os.path.exists(cache_filepath):
+                self.logger.info("Resuming upload from cache file {}".format(cache_filepath))
                  self._cache_file = open(cache_filepath, 'a+')
              else:
                  # --no-resume means start with a empty cache file.
                  self._cache_file = open(cache_filepath, 'a+')
              else:
                  # --no-resume means start with a empty cache file.
+                self.logger.info("Creating new cache file at {}".format(cache_filepath))
                  self._cache_file = open(cache_filepath, 'w+')
              self._cache_filename = self._cache_file.name
              self._lock_file(self._cache_file)
                  self._cache_file = open(cache_filepath, 'w+')
              self._cache_filename = self._cache_file.name
              self._lock_file(self._cache_file)
@@ -660,15 +735,16 @@ class ArvPutUploadJob(object):
                      # Cache file empty, set up new cache
                      self._state = copy.deepcopy(self.EMPTY_STATE)
              else:
                      # Cache file empty, set up new cache
                      self._state = copy.deepcopy(self.EMPTY_STATE)
              else:
+                self.logger.info("No cache usage requested for this run.")
                  # No cache file, set empty state
                  self._state = copy.deepcopy(self.EMPTY_STATE)
              # Load the previous manifest so we can check if files were modified remotely.
                  # No cache file, set empty state
                  self._state = copy.deepcopy(self.EMPTY_STATE)
              # Load the previous manifest so we can check if files were modified remotely.
-            self._local_collection = arvados.collection.Collection(self._state['manifest'], replication_desired=self.replication_desired)
+            self._local_collection = arvados.collection.Collection(self._state['manifest'], replication_desired=self.replication_desired, put_threads=self.put_threads)
  
      def collection_file_paths(self, col, path_prefix='.'):
          """Return a list of file paths by recursively go through the entire collection `col`"""
          file_paths = []
  
      def collection_file_paths(self, col, path_prefix='.'):
          """Return a list of file paths by recursively go through the entire collection `col`"""
          file_paths = []
-        for name, item in col.items():
+        for name, item in listitems(col):
              if isinstance(item, arvados.arvfile.ArvadosFile):
                  file_paths.append(os.path.join(path_prefix, name))
              elif isinstance(item, arvados.collection.Subcollection):
              if isinstance(item, arvados.arvfile.ArvadosFile):
                  file_paths.append(os.path.join(path_prefix, name))
              elif isinstance(item, arvados.collection.Subcollection):
@@ -686,17 +762,20 @@ class ArvPutUploadJob(object):
          """
          Atomically save current state into cache.
          """
          """
          Atomically save current state into cache.
          """
+        with self._state_lock:
+            # We're not using copy.deepcopy() here because it's a lot slower
+            # than json.dumps(), and we're already needing JSON format to be
+            # saved on disk.
+            state = json.dumps(self._state)
          try:
          try:
-            with self._state_lock:
-                state = copy.deepcopy(self._state)
-            new_cache_fd, new_cache_name = tempfile.mkstemp(
-                dir=os.path.dirname(self._cache_filename))
-            self._lock_file(new_cache_fd)
-            new_cache = os.fdopen(new_cache_fd, 'r+')
-            json.dump(state, new_cache)
+            new_cache = tempfile.NamedTemporaryFile(
+                mode='w+',
+                dir=os.path.dirname(self._cache_filename), delete=False)
+            self._lock_file(new_cache)
+            new_cache.write(state)
              new_cache.flush()
              os.fsync(new_cache)
              new_cache.flush()
              os.fsync(new_cache)
-            os.rename(new_cache_name, self._cache_filename)
+            os.rename(new_cache.name, self._cache_filename)
          except (IOError, OSError, ResumeCacheConflict) as error:
              self.logger.error("There was a problem while saving the cache file: {}".format(error))
              try:
          except (IOError, OSError, ResumeCacheConflict) as error:
              self.logger.error("There was a problem while saving the cache file: {}".format(error))
              try:
@@ -714,7 +793,14 @@ class ArvPutUploadJob(object):
          return self._my_collection().manifest_locator()
  
      def portable_data_hash(self):
          return self._my_collection().manifest_locator()
  
      def portable_data_hash(self):
-        return self._my_collection().portable_data_hash()
+        pdh = self._my_collection().portable_data_hash()
+        m = self._my_collection().stripped_manifest().encode()
+        local_pdh = '{}+{}'.format(hashlib.md5(m).hexdigest(), len(m))
+        if pdh != local_pdh:
+            logger.warning("\n".join([
+                "arv-put: API server provided PDH differs from local manifest.",
+                "         This should not happen; showing API server version."]))
+        return pdh
  
      def manifest_text(self, stream_name=".", strip=False, normalize=False):
          return self._my_collection().manifest_text(stream_name, strip, normalize)
  
      def manifest_text(self, stream_name=".", strip=False, normalize=False):
          return self._my_collection().manifest_text(stream_name, strip, normalize)
@@ -735,7 +821,7 @@ class ArvPutUploadJob(object):
                      locators.append(loc)
                  return locators
          elif isinstance(item, arvados.collection.Collection):
                      locators.append(loc)
                  return locators
          elif isinstance(item, arvados.collection.Collection):
-            l = [self._datablocks_on_item(x) for x in item.values()]
+            l = [self._datablocks_on_item(x) for x in listvalues(item)]
              # Fast list flattener method taken from:
              # http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
              return [loc for sublist in l for loc in sublist]
              # Fast list flattener method taken from:
              # http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
              return [loc for sublist in l for loc in sublist]
@@ -750,14 +836,20 @@ class ArvPutUploadJob(object):
          return datablocks
  
  
          return datablocks
  
  
-def expected_bytes_for(pathlist):
+def expected_bytes_for(pathlist, follow_links=True):
      # Walk the given directory trees and stat files, adding up file sizes,
      # so we can display progress as percent
      bytesum = 0
      for path in pathlist:
          if os.path.isdir(path):
      # Walk the given directory trees and stat files, adding up file sizes,
      # so we can display progress as percent
      bytesum = 0
      for path in pathlist:
          if os.path.isdir(path):
-            for filename in arvados.util.listdir_recursive(path):
-                bytesum += os.path.getsize(os.path.join(path, filename))
+            for root, dirs, files in os.walk(path, followlinks=follow_links):
+                # Sum file sizes
+                for f in files:
+                    filepath = os.path.join(root, f)
+                    # Ignore symlinked files when requested
+                    if (not follow_links) and os.path.islink(filepath):
+                        continue
+                    bytesum += os.path.getsize(filepath)
          elif not os.path.isfile(path):
              return None
          else:
          elif not os.path.isfile(path):
              return None
          else:
@@ -801,6 +893,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
      global api_client
  
      logger = logging.getLogger('arvados.arv_put')
      global api_client
  
      logger = logging.getLogger('arvados.arv_put')
+    logger.setLevel(logging.INFO)
      args = parse_arguments(arguments)
      status = 0
      if api_client is None:
      args = parse_arguments(arguments)
      status = 0
      if api_client is None:
@@ -840,7 +933,11 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
      else:
          reporter = None
  
      else:
          reporter = None
  
-    bytes_expected = expected_bytes_for(args.paths)
+    # If this is used by a human, and there's at least one directory to be
+    # uploaded, the expected bytes calculation can take a moment.
+    if args.progress and any([os.path.isdir(f) for f in args.paths]):
+        logger.info("Calculating upload size, this could take some time...")
+    bytes_expected = expected_bytes_for(args.paths, follow_links=args.follow_links)
  
      try:
          writer = ArvPutUploadJob(paths = args.paths,
  
      try:
          writer = ArvPutUploadJob(paths = args.paths,
@@ -851,12 +948,14 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
                                   bytes_expected = bytes_expected,
                                   num_retries = args.retries,
                                   replication_desired = args.replication,
                                   bytes_expected = bytes_expected,
                                   num_retries = args.retries,
                                   replication_desired = args.replication,
+                                 put_threads = args.threads,
                                   name = collection_name,
                                   owner_uuid = project_uuid,
                                   ensure_unique_name = True,
                                   update_collection = args.update_collection,
                                   logger=logger,
                                   name = collection_name,
                                   owner_uuid = project_uuid,
                                   ensure_unique_name = True,
                                   update_collection = args.update_collection,
                                   logger=logger,
-                                 dry_run=args.dry_run)
+                                 dry_run=args.dry_run,
+                                 follow_links=args.follow_links)
      except ResumeCacheConflict:
          logger.error("\n".join([
              "arv-put: Another process is already uploading this data.",
      except ResumeCacheConflict:
          logger.error("\n".join([
              "arv-put: Another process is already uploading this data.",
@@ -898,6 +997,10 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
      except ArvPutUploadNotPending:
          # No files pending for upload
          sys.exit(0)
      except ArvPutUploadNotPending:
          # No files pending for upload
          sys.exit(0)
+    except PathDoesNotExistError as error:
+        logger.error("\n".join([
+            "arv-put: %s" % str(error)]))
+        sys.exit(1)
  
      if args.progress:  # Print newline to split stderr from stdout for humans.
          logger.info("\n")
  
      if args.progress:  # Print newline to split stderr from stdout for humans.
          logger.info("\n")
@@ -933,7 +1036,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
          if not output.endswith('\n'):
              stdout.write('\n')
  
          if not output.endswith('\n'):
              stdout.write('\n')
  
-    for sigcode, orig_handler in orig_signal_handlers.items():
+    for sigcode, orig_handler in listitems(orig_signal_handlers):
          signal.signal(sigcode, orig_handler)
  
      if status != 0:
          signal.signal(sigcode, orig_handler)
  
      if status != 0: