11308: Fix bytes vs. strings from recent merge.

[arvados.git] / sdk / python / arvados / commands / put.py
diff --git a/sdk/python/arvados/commands/put.py b/sdk/python/arvados/commands/put.py

index 714281cc95b0475831f1761470c9cf1b5e91cce5..ed9d55cfc2df31a2b175254a441b4aa9a046cb02 100644 (file)
--- a/sdk/python/arvados/commands/put.py
+++ b/sdk/python/arvados/commands/put.py
@@ -1,8 +1,7 @@
-#!/usr/bin/env python
-
-# TODO:
-# --md5sum - display md5 of each file as read from disk
-
+from __future__ import division
+from future.utils import listitems, listvalues
+from builtins import str
+from builtins import object
  import argparse
  import arvados
  import arvados.collection
@@ -23,6 +22,8 @@ import sys
  import tempfile
  import threading
  import time
+import traceback
+
  from apiclient import errors as apiclient_errors
  from arvados._version import __version__
  
@@ -133,6 +134,15 @@ physical storage devices (e.g., disks) should have a copy of each data
  block. Default is to use the server-provided default (if any) or 2.
  """)
  
+upload_opts.add_argument('--threads', type=int, metavar='N', default=None,
+                         help="""
+Set the number of upload threads to be used. Take into account that
+using lots of threads will increase the RAM requirements. Default is
+to use 2 threads.
+On high latency installations, using a greater number will improve
+overall throughput.
+""")
+
  run_opts = argparse.ArgumentParser(add_help=False)
  
  run_opts.add_argument('--project-uuid', metavar='UUID', help="""
@@ -194,7 +204,7 @@ def parse_arguments(arguments):
      if len(args.paths) == 0:
          args.paths = ['-']
  
-    args.paths = map(lambda x: "-" if x == "/dev/stdin" else x, args.paths)
+    args.paths = ["-" if x == "/dev/stdin" else x for x in args.paths]
  
      if len(args.paths) != 1 or os.path.isdir(args.paths[0]):
          if args.filename:
@@ -267,13 +277,13 @@ class ResumeCache(object):
      @classmethod
      def make_path(cls, args):
          md5 = hashlib.md5()
-        md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost'))
+        md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost').encode())
          realpaths = sorted(os.path.realpath(path) for path in args.paths)
-        md5.update('\0'.join(realpaths))
+        md5.update(b'\0'.join([p.encode() for p in realpaths]))
          if any(os.path.isdir(path) for path in realpaths):
-            md5.update("-1")
+            md5.update(b'-1')
          elif args.filename:
-            md5.update(args.filename)
+            md5.update(args.filename.encode())
          return os.path.join(
              arv_cmd.make_home_conf_dir(cls.CACHE_DIR, 0o700, 'raise'),
              md5.hexdigest())
@@ -347,8 +357,9 @@ class ArvPutUploadJob(object):
  
      def __init__(self, paths, resume=True, use_cache=True, reporter=None,
                   bytes_expected=None, name=None, owner_uuid=None,
-                 ensure_unique_name=False, num_retries=None, replication_desired=None,
-                 filename=None, update_time=20.0, update_collection=None,
+                 ensure_unique_name=False, num_retries=None,
+                 put_threads=None, replication_desired=None,
+                 filename=None, update_time=60.0, update_collection=None,
                   logger=logging.getLogger('arvados.arv_put'), dry_run=False):
          self.paths = paths
          self.resume = resume
@@ -363,6 +374,7 @@ class ArvPutUploadJob(object):
          self.ensure_unique_name = ensure_unique_name
          self.num_retries = num_retries
          self.replication_desired = replication_desired
+        self.put_threads = put_threads
          self.filename = filename
          self._state_lock = threading.Lock()
          self._state = None # Previous run state (file list & manifest)
@@ -371,14 +383,16 @@ class ArvPutUploadJob(object):
          self._collection_lock = threading.Lock()
          self._remote_collection = None # Collection being updated (if asked)
          self._local_collection = None # Collection from previous run manifest
-        self._file_paths = [] # Files to be updated in remote collection
+        self._file_paths = set() # Files to be updated in remote collection
          self._stop_checkpointer = threading.Event()
          self._checkpointer = threading.Thread(target=self._update_task)
          self._checkpointer.daemon = True
          self._update_task_time = update_time  # How many seconds wait between update runs
          self._files_to_upload = FileUploadList(dry_run=dry_run)
+        self._upload_started = False
          self.logger = logger
          self.dry_run = dry_run
+        self._checkpoint_before_quit = True
  
          if not self.use_cache and self.resume:
              raise ArvPutArgumentConflict('resume cannot be True when use_cache is False')
@@ -433,17 +447,29 @@ class ArvPutUploadJob(object):
              # report initial progress.
              self._update()
              # Actual file upload
+            self._upload_started = True # Used by the update thread to start checkpointing
              self._upload_files()
+        except (SystemExit, Exception) as e:
+            self._checkpoint_before_quit = False
+            # Log stack trace only when Ctrl-C isn't pressed (SIGINT)
+            # Note: We're expecting SystemExit instead of KeyboardInterrupt because
+            #   we have a custom signal handler in place that raises SystemExit with
+            #   the caught signal's code.
+            if not isinstance(e, SystemExit) or e.code != -2:
+                self.logger.warning("Abnormal termination:\n{}".format(
+                    traceback.format_exc()))
+            raise
          finally:
              if not self.dry_run:
                  # Stop the thread before doing anything else
                  self._stop_checkpointer.set()
                  self._checkpointer.join()
-                # Commit all pending blocks & one last _update()
-                self._local_collection.manifest_text()
-                self._update(final=True)
-                if save_collection:
-                    self.save_collection()
+                if self._checkpoint_before_quit:
+                    # Commit all pending blocks & one last _update()
+                    self._local_collection.manifest_text()
+                    self._update(final=True)
+                    if save_collection:
+                        self.save_collection()
              if self.use_cache:
                  self._cache_file.close()
  
@@ -483,7 +509,7 @@ class ArvPutUploadJob(object):
          Recursively get the total size of the collection
          """
          size = 0
-        for item in collection.values():
+        for item in listvalues(collection):
              if isinstance(item, arvados.collection.Collection) or isinstance(item, arvados.collection.Subcollection):
                  size += self._collection_size(item)
              else:
@@ -495,24 +521,34 @@ class ArvPutUploadJob(object):
          Periodically called support task. File uploading is
          asynchronous so we poll status from the collection.
          """
-        while not self._stop_checkpointer.wait(self._update_task_time):
+        while not self._stop_checkpointer.wait(1 if not self._upload_started else self._update_task_time):
              self._update()
  
      def _update(self, final=False):
          """
          Update cached manifest text and report progress.
          """
-        with self._collection_lock:
-            self.bytes_written = self._collection_size(self._local_collection)
-            if self.use_cache:
-                # Update cache
-                with self._state_lock:
+        if self._upload_started:
+            with self._collection_lock:
+                self.bytes_written = self._collection_size(self._local_collection)
+                if self.use_cache:
                      if final:
-                        self._state['manifest'] = self._local_collection.manifest_text()
+                        manifest = self._local_collection.manifest_text()
                      else:
                          # Get the manifest text without comitting pending blocks
-                        self._state['manifest'] = self._local_collection._get_manifest_text(".", strip=False, normalize=False, only_committed=True)
-                self._save_state()
+                        manifest = self._local_collection.manifest_text(strip=False,
+                                                                        normalize=False,
+                                                                        only_committed=True)
+                    # Update cache
+                    with self._state_lock:
+                        self._state['manifest'] = manifest
+            if self.use_cache:
+                try:
+                    self._save_state()
+                except Exception as e:
+                    self.logger.error("Unexpected error trying to save cache file: {}".format(e))
+        else:
+            self.bytes_written = self.bytes_skipped
          # Call the reporter, if any
          self.report_progress()
  
@@ -521,7 +557,7 @@ class ArvPutUploadJob(object):
              self.reporter(self.bytes_written, self.bytes_expected)
  
      def _write_stdin(self, filename):
-        output = self._local_collection.open(filename, 'w')
+        output = self._local_collection.open(filename, 'wb')
          self._write(sys.stdin, output)
          output.close()
  
@@ -531,7 +567,7 @@ class ArvPutUploadJob(object):
          should_upload = False
          new_file_in_cache = False
          # Record file path for updating the remote collection before exiting
-        self._file_paths.append(filename)
+        self._file_paths.add(filename)
  
          with self._state_lock:
              # If no previous cached data on this file, store it for an eventual
@@ -587,17 +623,17 @@ class ArvPutUploadJob(object):
  
      def _upload_files(self):
          for source, resume_offset, filename in self._files_to_upload:
-            with open(source, 'r') as source_fd:
+            with open(source, 'rb') as source_fd:
                  with self._state_lock:
                      self._state['files'][source]['mtime'] = os.path.getmtime(source)
                      self._state['files'][source]['size'] = os.path.getsize(source)
                  if resume_offset > 0:
                      # Start upload where we left off
-                    output = self._local_collection.open(filename, 'a')
+                    output = self._local_collection.open(filename, 'ab')
                      source_fd.seek(resume_offset)
                  else:
                      # Start from scratch
-                    output = self._local_collection.open(filename, 'w')
+                    output = self._local_collection.open(filename, 'wb')
                  self._write(source_fd, output)
                  output.close(flush=False)
  
@@ -631,19 +667,21 @@ class ArvPutUploadJob(object):
          if self.use_cache:
              # Set up cache file name from input paths.
              md5 = hashlib.md5()
-            md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost'))
+            md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost').encode())
              realpaths = sorted(os.path.realpath(path) for path in self.paths)
-            md5.update('\0'.join(realpaths))
+            md5.update(b'\0'.join([p.encode() for p in realpaths]))
              if self.filename:
-                md5.update(self.filename)
+                md5.update(self.filename.encode())
              cache_filename = md5.hexdigest()
              cache_filepath = os.path.join(
                  arv_cmd.make_home_conf_dir(self.CACHE_DIR, 0o700, 'raise'),
                  cache_filename)
-            if self.resume:
+            if self.resume and os.path.exists(cache_filepath):
+                self.logger.info("Resuming upload from cache file {}".format(cache_filepath))
                  self._cache_file = open(cache_filepath, 'a+')
              else:
                  # --no-resume means start with a empty cache file.
+                self.logger.info("Creating new cache file at {}".format(cache_filepath))
                  self._cache_file = open(cache_filepath, 'w+')
              self._cache_filename = self._cache_file.name
              self._lock_file(self._cache_file)
@@ -660,15 +698,16 @@ class ArvPutUploadJob(object):
                      # Cache file empty, set up new cache
                      self._state = copy.deepcopy(self.EMPTY_STATE)
              else:
+                self.logger.info("No cache usage requested for this run.")
                  # No cache file, set empty state
                  self._state = copy.deepcopy(self.EMPTY_STATE)
              # Load the previous manifest so we can check if files were modified remotely.
-            self._local_collection = arvados.collection.Collection(self._state['manifest'], replication_desired=self.replication_desired)
+            self._local_collection = arvados.collection.Collection(self._state['manifest'], replication_desired=self.replication_desired, put_threads=self.put_threads)
  
      def collection_file_paths(self, col, path_prefix='.'):
          """Return a list of file paths by recursively go through the entire collection `col`"""
          file_paths = []
-        for name, item in col.items():
+        for name, item in listitems(col):
              if isinstance(item, arvados.arvfile.ArvadosFile):
                  file_paths.append(os.path.join(path_prefix, name))
              elif isinstance(item, arvados.collection.Subcollection):
@@ -686,17 +725,20 @@ class ArvPutUploadJob(object):
          """
          Atomically save current state into cache.
          """
+        with self._state_lock:
+            # We're not using copy.deepcopy() here because it's a lot slower
+            # than json.dumps(), and we need the state in JSON format anyway
+            # in order to save it on disk.
+            state = json.dumps(self._state)
          try:
-            with self._state_lock:
-                state = copy.deepcopy(self._state)
-            new_cache_fd, new_cache_name = tempfile.mkstemp(
-                dir=os.path.dirname(self._cache_filename))
-            self._lock_file(new_cache_fd)
-            new_cache = os.fdopen(new_cache_fd, 'r+')
-            json.dump(state, new_cache)
+            new_cache = tempfile.NamedTemporaryFile(
+                mode='w+',
+                dir=os.path.dirname(self._cache_filename), delete=False)
+            self._lock_file(new_cache)
+            new_cache.write(state)
              new_cache.flush()
              os.fsync(new_cache)
-            os.rename(new_cache_name, self._cache_filename)
+            os.rename(new_cache.name, self._cache_filename)
          except (IOError, OSError, ResumeCacheConflict) as error:
              self.logger.error("There was a problem while saving the cache file: {}".format(error))
              try:
@@ -714,7 +756,14 @@ class ArvPutUploadJob(object):
          return self._my_collection().manifest_locator()
  
      def portable_data_hash(self):
-        return self._my_collection().portable_data_hash()
+        pdh = self._my_collection().portable_data_hash()
+        m = self._my_collection().stripped_manifest().encode()
+        local_pdh = '{}+{}'.format(hashlib.md5(m).hexdigest(), len(m))
+        if pdh != local_pdh:
+            logger.warning("\n".join([
+                "arv-put: API server provided PDH differs from local manifest.",
+                "         This should not happen; showing API server version."]))
+        return pdh
  
      def manifest_text(self, stream_name=".", strip=False, normalize=False):
          return self._my_collection().manifest_text(stream_name, strip, normalize)
@@ -735,7 +784,7 @@ class ArvPutUploadJob(object):
                      locators.append(loc)
                  return locators
          elif isinstance(item, arvados.collection.Collection):
-            l = [self._datablocks_on_item(x) for x in item.values()]
+            l = [self._datablocks_on_item(x) for x in listvalues(item)]
              # Fast list flattener method taken from:
              # http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
              return [loc for sublist in l for loc in sublist]
@@ -801,6 +850,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
      global api_client
  
      logger = logging.getLogger('arvados.arv_put')
+    logger.setLevel(logging.INFO)
      args = parse_arguments(arguments)
      status = 0
      if api_client is None:
@@ -840,6 +890,10 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
      else:
          reporter = None
  
+    # If this is used by a human, and there's at least one directory to be
+    # uploaded, the expected bytes calculation can take a moment.
+    if args.progress and any([os.path.isdir(f) for f in args.paths]):
+        logger.info("Calculating upload size, this could take some time...")
      bytes_expected = expected_bytes_for(args.paths)
  
      try:
@@ -851,6 +905,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
                                   bytes_expected = bytes_expected,
                                   num_retries = args.retries,
                                   replication_desired = args.replication,
+                                 put_threads = args.threads,
                                   name = collection_name,
                                   owner_uuid = project_uuid,
                                   ensure_unique_name = True,
@@ -933,7 +988,7 @@ def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr):
          if not output.endswith('\n'):
              stdout.write('\n')
  
-    for sigcode, orig_handler in orig_signal_handlers.items():
+    for sigcode, orig_handler in listitems(orig_signal_handlers):
          signal.signal(sigcode, orig_handler)
  
      if status != 0: