11308: Fix bytes vs. strings from recent merge.
[arvados.git] sdk/python/arvados/commands/put.py
index 8f7a7b7531b251b4aa1e3882910451580ec9d082..ed9d55cfc2df31a2b175254a441b4aa9a046cb02 100644
--- a/sdk/python/arvados/commands/put.py
+++ b/sdk/python/arvados/commands/put.py
@@ -1,8 +1,7 @@
-#!/usr/bin/env python
-
-# TODO:
-# --md5sum - display md5 of each file as read from disk
-
+from __future__ import division
+from future.utils import listitems, listvalues
+from builtins import str
+from builtins import object
 import argparse
 import arvados
 import arvados.collection
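
The future/builtins imports added at the top of this hunk keep the module importable under both Python 2 and Python 3; in particular, listitems() and listvalues() (used in later hunks) always hand back a real list from a dict on either interpreter. A small illustration, not arv-put code:

    from future.utils import listitems, listvalues

    sizes = {'a.txt': 3, 'b.txt': 5}
    print(listvalues(sizes))   # a plain list of values on Python 2 and 3
    print(listitems(sizes))    # a plain list of (key, value) pairs
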
@@ -23,7 +22,10 @@ import sys
 import tempfile
 import threading
 import time
+import traceback
+
 from apiclient import errors as apiclient_errors
+from arvados._version import __version__
 
 import arvados.commands._util as arv_cmd
 
@@ -32,6 +34,9 @@ api_client = None
 
 upload_opts = argparse.ArgumentParser(add_help=False)
 
+upload_opts.add_argument('--version', action='version',
+                         version="%s %s" % (sys.argv[0], __version__),
+                         help='Print version and exit.')
 upload_opts.add_argument('paths', metavar='path', type=str, nargs='*',
                          help="""
 Local file or directory. Default: read from standard input.
@@ -48,6 +53,12 @@ Normalize the manifest by re-ordering files and streams after writing
 data.
 """)
 
+_group.add_argument('--dry-run', action='store_true', default=False,
+                    help="""
+Don't actually upload files, but only check if any file should be
+uploaded. Exit with code=2 when files are pending for upload.
+""")
+
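
The exit codes described in the help text above make --dry-run scriptable. A hedged sketch of a caller (the path is a placeholder; assumes arv-put is on PATH):

    import subprocess

    # Per the help text: 2 = files pending for upload, 0 = nothing to do.
    returncode = subprocess.call(['arv-put', '--dry-run', '/data/project1'])
    if returncode == 2:
        print('upload needed')
    elif returncode == 0:
        print('already up to date')
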
 _group = upload_opts.add_mutually_exclusive_group()
 
 _group.add_argument('--as-stream', action='store_true', dest='stream',
@@ -123,6 +134,15 @@ physical storage devices (e.g., disks) should have a copy of each data
 block. Default is to use the server-provided default (if any) or 2.
 """)
 
+upload_opts.add_argument('--threads', type=int, metavar='N', default=None,
+                         help="""
+Set the number of upload threads to be used. Take into account that
+using lots of threads will increase the RAM requirements. Default is
+to use 2 threads.
+On high latency installations, using a greater number will improve
+overall throughput.
+""")
+
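
As the later hunks show, the parsed --threads value reaches the Keep client as the Collection constructor's put_threads argument. A rough usage sketch, assuming a configured Arvados environment and that Collection accepts put_threads as it does in the hunks below:

    import arvados.collection

    # Higher put_threads values trade RAM for throughput, per the help text.
    coll = arvados.collection.Collection(put_threads=4)
    with coll.open('example.txt', 'wb') as f:
        f.write(b'some data\n')
    coll.save_new(name='threads example')
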
 run_opts = argparse.ArgumentParser(add_help=False)
 
 run_opts.add_argument('--project-uuid', metavar='UUID', help="""
@@ -184,7 +204,7 @@ def parse_arguments(arguments):
     if len(args.paths) == 0:
         args.paths = ['-']
 
-    args.paths = map(lambda x: "-" if x == "/dev/stdin" else x, args.paths)
+    args.paths = ["-" if x == "/dev/stdin" else x for x in args.paths]
 
     if len(args.paths) != 1 or os.path.isdir(args.paths[0]):
         if args.filename:
@@ -223,6 +243,29 @@ class ResumeCacheConflict(Exception):
     pass
 
 
+class ArvPutArgumentConflict(Exception):
+    pass
+
+
+class ArvPutUploadIsPending(Exception):
+    pass
+
+
+class ArvPutUploadNotPending(Exception):
+    pass
+
+
+class FileUploadList(list):
+    def __init__(self, dry_run=False):
+        list.__init__(self)
+        self.dry_run = dry_run
+
+    def append(self, other):
+        if self.dry_run:
+            raise ArvPutUploadIsPending()
+        super(FileUploadList, self).append(other)
+
+
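
FileUploadList is what makes --dry-run cheap: the scanning phase queues files through append(), and in dry-run mode the first queued file aborts the scan with ArvPutUploadIsPending, which main() later maps to exit code 2. Usage sketch (assumes this module is importable as arvados.commands.put):

    from arvados.commands.put import FileUploadList, ArvPutUploadIsPending

    queue = FileUploadList(dry_run=True)
    try:
        queue.append(('/tmp/example.txt', 0, 'example.txt'))
    except ArvPutUploadIsPending:
        print('dry run: at least one file is pending upload')
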
 class ResumeCache(object):
     CACHE_DIR = '.cache/arvados/arv-put'
 
@@ -234,13 +277,13 @@ class ResumeCache(object):
     @classmethod
     def make_path(cls, args):
         md5 = hashlib.md5()
-        md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost'))
+        md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost').encode())
         realpaths = sorted(os.path.realpath(path) for path in args.paths)
-        md5.update('\0'.join(realpaths))
+        md5.update(b'\0'.join([p.encode() for p in realpaths]))
         if any(os.path.isdir(path) for path in realpaths):
-            md5.update("-1")
+            md5.update(b'-1')
         elif args.filename:
-            md5.update(args.filename)
+            md5.update(args.filename.encode())
         return os.path.join(
             arv_cmd.make_home_conf_dir(cls.CACHE_DIR, 0o700, 'raise'),
             md5.hexdigest())
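
These .encode() calls are the point of the commit: on Python 3, hashlib digests only accept bytes, so every str that goes into the cache-key hash has to be encoded first. A minimal illustration (example values, not arv-put code):

    import hashlib

    md5 = hashlib.md5()
    md5.update('zzzzz.arvadosapi.com'.encode())    # str must become bytes
    md5.update(b'\0'.join([p.encode() for p in ['/data/a', '/data/b']]))
    md5.update(b'-1')                              # literals can simply be bytes
    print(md5.hexdigest())
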
@@ -314,8 +357,10 @@ class ArvPutUploadJob(object):
 
     def __init__(self, paths, resume=True, use_cache=True, reporter=None,
                  bytes_expected=None, name=None, owner_uuid=None,
-                 ensure_unique_name=False, num_retries=None, replication_desired=None,
-                 filename=None, update_time=1.0, update_collection=None):
+                 ensure_unique_name=False, num_retries=None,
+                 put_threads=None, replication_desired=None,
+                 filename=None, update_time=60.0, update_collection=None,
+                 logger=logging.getLogger('arvados.arv_put'), dry_run=False):
         self.paths = paths
         self.resume = resume
         self.use_cache = use_cache
@@ -329,6 +374,7 @@ class ArvPutUploadJob(object):
         self.ensure_unique_name = ensure_unique_name
         self.num_retries = num_retries
         self.replication_desired = replication_desired
+        self.put_threads = put_threads
         self.filename = filename
         self._state_lock = threading.Lock()
         self._state = None # Previous run state (file list & manifest)
@@ -337,14 +383,23 @@ class ArvPutUploadJob(object):
         self._collection_lock = threading.Lock()
         self._remote_collection = None # Collection being updated (if asked)
         self._local_collection = None # Collection from previous run manifest
-        self._file_paths = [] # Files to be updated in remote collection
+        self._file_paths = set() # Files to be updated in remote collection
         self._stop_checkpointer = threading.Event()
         self._checkpointer = threading.Thread(target=self._update_task)
+        self._checkpointer.daemon = True
         self._update_task_time = update_time  # How many seconds wait between update runs
-        self.logger = logging.getLogger('arvados.arv_put')
+        self._files_to_upload = FileUploadList(dry_run=dry_run)
+        self._upload_started = False
+        self.logger = logger
+        self.dry_run = dry_run
+        self._checkpoint_before_quit = True
 
         if not self.use_cache and self.resume:
-            raise ArgumentError('resume cannot be True when use_cache is False')
+            raise ArvPutArgumentConflict('resume cannot be True when use_cache is False')
+
+        # Check for obvious dry-run responses
+        if self.dry_run and (not self.use_cache or not self.resume):
+            raise ArvPutUploadIsPending()
 
         # Load cached data if any and if needed
         self._setup_state(update_collection)
@@ -353,12 +408,14 @@ class ArvPutUploadJob(object):
         """
         Start supporting thread & file uploading
         """
-        self._checkpointer.daemon = True
-        self._checkpointer.start()
+        if not self.dry_run:
+            self._checkpointer.start()
         try:
             for path in self.paths:
                 # Test for stdin first, in case some file named '-' exist
                 if path == '-':
+                    if self.dry_run:
+                        raise ArvPutUploadIsPending()
                     self._write_stdin(self.filename or 'stdin')
                 elif os.path.isdir(path):
                     # Use absolute paths on cache index so CWD doesn't interfere
@@ -371,24 +428,50 @@ class ArvPutUploadJob(object):
                         dirs.sort()
                         files.sort()
                         for f in files:
-                            self._write_file(os.path.join(root, f),
+                            self._check_file(os.path.join(root, f),
                                              os.path.join(root[len(prefixdir):], f))
                 else:
-                    self._write_file(os.path.abspath(path),
+                    self._check_file(os.path.abspath(path),
                                      self.filename or os.path.basename(path))
+            # If dry-run mode is on and we got this far, there are no files
+            # pending upload, so signal that.
+            if self.dry_run:
+                raise ArvPutUploadNotPending()
+            # Remove local_collection's files that don't exist locally anymore, so the
+            # bytes_written count is correct.
+            for f in self.collection_file_paths(self._local_collection,
+                                                path_prefix=""):
+                if f != 'stdin' and f != self.filename and not f in self._file_paths:
+                    self._local_collection.remove(f)
+            # Update bytes_written from current local collection and
+            # report initial progress.
+            self._update()
+            # Actual file upload
+            self._upload_started = True # Used by the update thread to start checkpointing
+            self._upload_files()
+        except (SystemExit, Exception) as e:
+            self._checkpoint_before_quit = False
+            # Log stack trace only when Ctrl-C isn't pressed (SIGINT)
+            # Note: We're expecting SystemExit instead of KeyboardInterrupt because
+            #   we have a custom signal handler in place that raises SystemExit with
+            #   the caught signal's code.
+            if not isinstance(e, SystemExit) or e.code != -2:
+                self.logger.warning("Abnormal termination:\n{}".format(
+                    traceback.format_exc()))
+            raise
         finally:
-            # Stop the thread before doing anything else
-            self._stop_checkpointer.set()
-            self._checkpointer.join()
-            # Commit all pending blocks & one last _update()
-            self._local_collection.manifest_text()
-            self._update(final=True)
+            if not self.dry_run:
+                # Stop the thread before doing anything else
+                self._stop_checkpointer.set()
+                self._checkpointer.join()
+                if self._checkpoint_before_quit:
+                    # Commit all pending blocks & one last _update()
+                    self._local_collection.manifest_text()
+                    self._update(final=True)
+                    if save_collection:
+                        self.save_collection()
             if self.use_cache:
                 self._cache_file.close()
-            if save_collection:
-                self.save_collection()
-            # Correct the final written bytes count
-            self.bytes_written -= self.bytes_skipped
 
     def save_collection(self):
         if self.update:
@@ -412,7 +495,7 @@ class ArvPutUploadJob(object):
                 num_retries=self.num_retries)
 
     def destroy_cache(self):
-        if self.resume:
+        if self.use_cache:
             try:
                 os.unlink(self._cache_filename)
             except OSError as error:
@@ -426,7 +509,7 @@ class ArvPutUploadJob(object):
         Recursively get the total size of the collection
         """
         size = 0
-        for item in collection.values():
+        for item in listvalues(collection):
             if isinstance(item, arvados.collection.Collection) or isinstance(item, arvados.collection.Subcollection):
                 size += self._collection_size(item)
             else:
@@ -438,24 +521,34 @@ class ArvPutUploadJob(object):
         Periodically called support task. File uploading is
         asynchronous so we poll status from the collection.
         """
-        while not self._stop_checkpointer.wait(self._update_task_time):
+        while not self._stop_checkpointer.wait(1 if not self._upload_started else self._update_task_time):
             self._update()
 
     def _update(self, final=False):
         """
         Update cached manifest text and report progress.
         """
-        with self._collection_lock:
-            self.bytes_written = self._collection_size(self._local_collection)
-            if self.use_cache:
-                # Update cache
-                with self._state_lock:
+        if self._upload_started:
+            with self._collection_lock:
+                self.bytes_written = self._collection_size(self._local_collection)
+                if self.use_cache:
                     if final:
-                        self._state['manifest'] = self._local_collection.manifest_text()
+                        manifest = self._local_collection.manifest_text()
                     else:
                         # Get the manifest text without comitting pending blocks
-                        self._state['manifest'] = self._local_collection._get_manifest_text(".", strip=False, normalize=False, only_committed=True)
-                self._save_state()
+                        manifest = self._local_collection.manifest_text(strip=False,
+                                                                        normalize=False,
+                                                                        only_committed=True)
+                    # Update cache
+                    with self._state_lock:
+                        self._state['manifest'] = manifest
+            if self.use_cache:
+                try:
+                    self._save_state()
+                except Exception as e:
+                    self.logger.error("Unexpected error trying to save cache file: {}".format(e))
+        else:
+            self.bytes_written = self.bytes_skipped
         # Call the reporter, if any
         self.report_progress()
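
The checkpointer is a standard stoppable timer thread: Event.wait() serves as both the sleep and the shutdown signal, and the 1-second poll before _upload_started keeps progress reporting responsive while files are still being scanned. A self-contained sketch of the same pattern (illustrative names only):

    import threading

    stop = threading.Event()
    upload_started = threading.Event()

    def update_task(update_interval=60.0):
        # Wake up quickly until uploading starts, then at the normal interval;
        # stop.set() ends the loop on the next wake-up.
        while not stop.wait(1 if not upload_started.is_set() else update_interval):
            pass  # arv-put checkpoints the manifest and reports progress here

    checkpointer = threading.Thread(target=update_task)
    checkpointer.daemon = True
    checkpointer.start()
    # ... once the work is done:
    stop.set()
    checkpointer.join()
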
 
@@ -464,17 +557,17 @@ class ArvPutUploadJob(object):
             self.reporter(self.bytes_written, self.bytes_expected)
 
     def _write_stdin(self, filename):
-        output = self._local_collection.open(filename, 'w')
+        output = self._local_collection.open(filename, 'wb')
         self._write(sys.stdin, output)
         output.close()
 
-    def _write_file(self, source, filename):
+    def _check_file(self, source, filename):
+        """Check if this file needs to be uploaded"""
         resume_offset = 0
         should_upload = False
         new_file_in_cache = False
-
         # Record file path for updating the remote collection before exiting
-        self._file_paths.append(filename)
+        self._file_paths.add(filename)
 
         with self._state_lock:
             # If no previous cached data on this file, store it for an eventual
@@ -501,34 +594,46 @@ class ArvPutUploadJob(object):
             if not file_in_local_collection:
                 # File not uploaded yet, upload it completely
                 should_upload = True
+            elif file_in_local_collection.permission_expired():
+                # Permission token expired, re-upload file. This will change whenever
+                # we have a API for refreshing tokens.
+                should_upload = True
+                self._local_collection.remove(filename)
             elif cached_file_data['size'] == file_in_local_collection.size():
                 # File already there, skip it.
                 self.bytes_skipped += cached_file_data['size']
             elif cached_file_data['size'] > file_in_local_collection.size():
                 # File partially uploaded, resume!
                 resume_offset = file_in_local_collection.size()
+                self.bytes_skipped += resume_offset
                 should_upload = True
             else:
                 # Inconsistent cache, re-upload the file
                 should_upload = True
+                self._local_collection.remove(filename)
                 self.logger.warning("Uploaded version of file '{}' is bigger than local version, will re-upload it from scratch.".format(source))
         # Local file differs from cached data, re-upload it.
         else:
+            if file_in_local_collection:
+                self._local_collection.remove(filename)
             should_upload = True
 
         if should_upload:
-            with open(source, 'r') as source_fd:
+            self._files_to_upload.append((source, resume_offset, filename))
+
+    def _upload_files(self):
+        for source, resume_offset, filename in self._files_to_upload:
+            with open(source, 'rb') as source_fd:
                 with self._state_lock:
                     self._state['files'][source]['mtime'] = os.path.getmtime(source)
                     self._state['files'][source]['size'] = os.path.getsize(source)
                 if resume_offset > 0:
                     # Start upload where we left off
-                    output = self._local_collection.open(filename, 'a')
+                    output = self._local_collection.open(filename, 'ab')
                     source_fd.seek(resume_offset)
-                    self.bytes_skipped += resume_offset
                 else:
                     # Start from scratch
-                    output = self._local_collection.open(filename, 'w')
+                    output = self._local_collection.open(filename, 'wb')
                 self._write(source_fd, output)
                 output.close(flush=False)
 
@@ -562,15 +667,22 @@ class ArvPutUploadJob(object):
         if self.use_cache:
             # Set up cache file name from input paths.
             md5 = hashlib.md5()
-            md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost'))
+            md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost').encode())
             realpaths = sorted(os.path.realpath(path) for path in self.paths)
-            md5.update('\0'.join(realpaths))
+            md5.update(b'\0'.join([p.encode() for p in realpaths]))
             if self.filename:
-                md5.update(self.filename)
+                md5.update(self.filename.encode())
             cache_filename = md5.hexdigest()
-            self._cache_file = open(os.path.join(
+            cache_filepath = os.path.join(
                 arv_cmd.make_home_conf_dir(self.CACHE_DIR, 0o700, 'raise'),
-                cache_filename), 'a+')
+                cache_filename)
+            if self.resume and os.path.exists(cache_filepath):
+                self.logger.info("Resuming upload from cache file {}".format(cache_filepath))
+                self._cache_file = open(cache_filepath, 'a+')
+            else:
+                # --no-resume means start with an empty cache file.
+                self.logger.info("Creating new cache file at {}".format(cache_filepath))
+                self._cache_file = open(cache_filepath, 'w+')
             self._cache_filename = self._cache_file.name
             self._lock_file(self._cache_file)
             self._cache_file.seek(0)
@@ -586,13 +698,22 @@ class ArvPutUploadJob(object):
                     # Cache file empty, set up new cache
                     self._state = copy.deepcopy(self.EMPTY_STATE)
             else:
+                self.logger.info("No cache usage requested for this run.")
                 # No cache file, set empty state
                 self._state = copy.deepcopy(self.EMPTY_STATE)
             # Load the previous manifest so we can check if files were modified remotely.
-            self._local_collection = arvados.collection.Collection(self._state['manifest'], replication_desired=self.replication_desired)
-        # Load how many bytes were uploaded on previous run
-        with self._collection_lock:
-            self.bytes_written = self._collection_size(self._local_collection)
+            self._local_collection = arvados.collection.Collection(self._state['manifest'], replication_desired=self.replication_desired, put_threads=self.put_threads)
+
+    def collection_file_paths(self, col, path_prefix='.'):
+        """Return a list of file paths by recursively going through the entire collection `col`"""
+        file_paths = []
+        for name, item in listitems(col):
+            if isinstance(item, arvados.arvfile.ArvadosFile):
+                file_paths.append(os.path.join(path_prefix, name))
+            elif isinstance(item, arvados.collection.Subcollection):
+                new_prefix = os.path.join(path_prefix, name)
+                file_paths += self.collection_file_paths(item, path_prefix=new_prefix)
+        return file_paths
 
     def _lock_file(self, fileobj):
         try:
@@ -604,17 +725,20 @@ class ArvPutUploadJob(object):
         """
         Atomically save current state into cache.
         """
+        with self._state_lock:
+            # We're not using copy.deepcopy() here because it's a lot slower
+            # than json.dumps(), and we're already needing JSON format to be
+            # saved on disk.
+            state = json.dumps(self._state)
         try:
-            with self._state_lock:
-                state = copy.deepcopy(self._state)
-            new_cache_fd, new_cache_name = tempfile.mkstemp(
-                dir=os.path.dirname(self._cache_filename))
-            self._lock_file(new_cache_fd)
-            new_cache = os.fdopen(new_cache_fd, 'r+')
-            json.dump(state, new_cache)
+            new_cache = tempfile.NamedTemporaryFile(
+                mode='w+',
+                dir=os.path.dirname(self._cache_filename), delete=False)
+            self._lock_file(new_cache)
+            new_cache.write(state)
             new_cache.flush()
             os.fsync(new_cache)
-            os.rename(new_cache_name, self._cache_filename)
+            os.rename(new_cache.name, self._cache_filename)
         except (IOError, OSError, ResumeCacheConflict) as error:
             self.logger.error("There was a problem while saving the cache file: {}".format(error))
             try:
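
_save_state() now serializes under the lock with json.dumps() and then uses the write-to-temp-file-then-rename idiom, so a crash can never leave a half-written cache behind. The same pattern in isolation (a generic helper, not arv-put code):

    import json
    import os
    import tempfile

    def atomic_json_save(state, cache_filename):
        serialized = json.dumps(state)
        tmp = tempfile.NamedTemporaryFile(
            mode='w+', dir=os.path.dirname(cache_filename), delete=False)
        tmp.write(serialized)
        tmp.flush()
        os.fsync(tmp.fileno())                 # make sure the bytes hit the disk
        tmp.close()
        os.rename(tmp.name, cache_filename)    # atomic replace on POSIX
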
@@ -632,7 +756,14 @@ class ArvPutUploadJob(object):
         return self._my_collection().manifest_locator()
 
     def portable_data_hash(self):
-        return self._my_collection().portable_data_hash()
+        pdh = self._my_collection().portable_data_hash()
+        m = self._my_collection().stripped_manifest().encode()
+        local_pdh = '{}+{}'.format(hashlib.md5(m).hexdigest(), len(m))
+        if pdh != local_pdh:
+            self.logger.warning("\n".join([
+                "arv-put: API server provided PDH differs from local manifest.",
+                "         This should not happen; showing API server version."]))
+        return pdh
 
     def manifest_text(self, stream_name=".", strip=False, normalize=False):
         return self._my_collection().manifest_text(stream_name, strip, normalize)
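
portable_data_hash() now cross-checks the server's answer against a locally computed PDH, which is simply the md5 of the stripped (signature-free) manifest plus '+' and the manifest's length in bytes. For example, for the canonical single-empty-file manifest:

    import hashlib

    manifest = b". d41d8cd98f00b204e9800998ecf8427e+0 0:0:empty.txt\n"
    local_pdh = '{}+{}'.format(hashlib.md5(manifest).hexdigest(), len(manifest))
    print(local_pdh)
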
 
@@ -653,7 +784,7 @@ class ArvPutUploadJob(object):
                     locators.append(loc)
                 return locators
         elif isinstance(item, arvados.collection.Collection):
-            l = [self._datablocks_on_item(x) for x in item.values()]
+            l = [self._datablocks_on_item(x) for x in listvalues(item)]
             # Fast list flattener method taken from:
             # http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
             return [loc for sublist in l for loc in sublist]
@@ -718,6 +849,8 @@ def desired_project_uuid(api_client, project_uuid, num_retries):
 def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     global api_client
 
+    logger = logging.getLogger('arvados.arv_put')
+    logger.setLevel(logging.INFO)
     args = parse_arguments(arguments)
     status = 0
     if api_client is None:
@@ -726,7 +859,10 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     # Determine the name to use
     if args.name:
         if args.stream or args.raw:
-            print >>stderr, "Cannot use --name with --stream or --raw"
+            logger.error("Cannot use --name with --stream or --raw")
+            sys.exit(1)
+        elif args.update_collection:
+            logger.error("Cannot use --name with --update-collection")
             sys.exit(1)
         collection_name = args.name
     else:
@@ -736,7 +872,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
             socket.gethostname())
 
     if args.project_uuid and (args.stream or args.raw):
-        print >>stderr, "Cannot use --project-uuid with --stream or --raw"
+        logger.error("Cannot use --project-uuid with --stream or --raw")
         sys.exit(1)
 
     # Determine the parent project
@@ -744,7 +880,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
         project_uuid = desired_project_uuid(api_client, args.project_uuid,
                                             args.retries)
     except (apiclient_errors.Error, ValueError) as error:
-        print >>stderr, error
+        logger.error(error)
         sys.exit(1)
 
     if args.progress:
@@ -754,6 +890,10 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     else:
         reporter = None
 
+    # If this is used by a human, and there's at least one directory to be
+    # uploaded, the expected bytes calculation can take a moment.
+    if args.progress and any([os.path.isdir(f) for f in args.paths]):
+        logger.info("Calculating upload size, this could take some time...")
     bytes_expected = expected_bytes_for(args.paths)
 
     try:
@@ -765,41 +905,57 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
                                  bytes_expected = bytes_expected,
                                  num_retries = args.retries,
                                  replication_desired = args.replication,
+                                 put_threads = args.threads,
                                  name = collection_name,
                                  owner_uuid = project_uuid,
                                  ensure_unique_name = True,
-                                 update_collection = args.update_collection)
+                                 update_collection = args.update_collection,
+                                 logger=logger,
+                                 dry_run=args.dry_run)
     except ResumeCacheConflict:
-        print >>stderr, "\n".join([
+        logger.error("\n".join([
             "arv-put: Another process is already uploading this data.",
-            "         Use --no-cache if this is really what you want."])
+            "         Use --no-cache if this is really what you want."]))
         sys.exit(1)
     except CollectionUpdateError as error:
-        print >>stderr, "\n".join([
-            "arv-put: %s" % str(error)])
+        logger.error("\n".join([
+            "arv-put: %s" % str(error)]))
         sys.exit(1)
+    except ArvPutUploadIsPending:
+        # Dry run check successful, return proper exit code.
+        sys.exit(2)
+    except ArvPutUploadNotPending:
+        # No files pending for upload
+        sys.exit(0)
 
     # Install our signal handler for each code in CAUGHT_SIGNALS, and save
     # the originals.
     orig_signal_handlers = {sigcode: signal.signal(sigcode, exit_signal_handler)
                             for sigcode in CAUGHT_SIGNALS}
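
These handlers are what make the SystemExit check in ArvPutUploadJob.start() work: each caught signal is re-raised as SystemExit carrying the negative signal number, so Ctrl-C surfaces as code -2 and is not logged as an abnormal termination. A minimal sketch of such a handler (assumed shape, consistent with the e.code != -2 test above):

    import signal
    import sys

    def exit_signal_handler(sigcode, frame):
        # Convert the signal into SystemExit so finally: cleanup still runs.
        sys.exit(-sigcode)

    CAUGHT_SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM]
    orig_signal_handlers = {sigcode: signal.signal(sigcode, exit_signal_handler)
                            for sigcode in CAUGHT_SIGNALS}
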
 
 
-    if not args.update_collection and args.resume and writer.bytes_written > 0:
-        print >>stderr, "\n".join([
-                "arv-put: Resuming previous upload from last checkpoint.",
-                "         Use the --no-resume option to start over."])
+    if not args.dry_run and not args.update_collection and args.resume and writer.bytes_written > 0:
+        logger.warning("\n".join([
+            "arv-put: Resuming previous upload from last checkpoint.",
+            "         Use the --no-resume option to start over."]))
 
 
-    writer.report_progress()
+    if not args.dry_run:
+        writer.report_progress()
     output = None
     try:
         writer.start(save_collection=not(args.stream or args.raw))
     except arvados.errors.ApiError as error:
-        print >>stderr, "\n".join([
-            "arv-put: %s" % str(error)])
+        logger.error("\n".join([
+            "arv-put: %s" % str(error)]))
         sys.exit(1)
+    except ArvPutUploadIsPending:
+        # Dry run check successful, return proper exit code.
+        sys.exit(2)
+    except ArvPutUploadNotPending:
+        # No files pending for upload
+        sys.exit(0)
 
     if args.progress:  # Print newline to split stderr from stdout for humans.
-        print >>stderr
+        logger.info("\n")
 
     if args.stream:
         if args.normalize:
@@ -811,15 +967,15 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
     else:
         try:
             if args.update_collection:
-                print >>stderr, "Collection updated: '{}'".format(writer.collection_name())
+                logger.info("Collection updated: '{}'".format(writer.collection_name()))
             else:
-                print >>stderr, "Collection saved as '{}'".format(writer.collection_name())
+                logger.info("Collection saved as '{}'".format(writer.collection_name()))
             if args.portable_data_hash:
                 output = writer.portable_data_hash()
             else:
                 output = writer.manifest_locator()
         except apiclient_errors.Error as error:
-            print >>stderr, (
+            logger.error(
                 "arv-put: Error creating Collection on project: {}.".format(
                     error))
             status = 1
@@ -832,7 +988,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
         if not output.endswith('\n'):
             stdout.write('\n')
 
-    for sigcode, orig_handler in orig_signal_handlers.items():
+    for sigcode, orig_handler in listitems(orig_signal_handlers):
         signal.signal(sigcode, orig_handler)
 
     if status != 0: