_group.add_argument('--stream', action='store_true',
help="""
Store the file content and display the resulting manifest on
-stdout. Do not write the manifest to Keep or save a Collection object
-in Arvados.
+stdout. Do not save a Collection object in Arvados.
""")
_group.add_argument('--as-manifest', action='store_true', dest='manifest',
""")
_group.add_argument('--no-follow-links', action='store_false', dest='follow_links',
help="""
-Do not follow file and directory symlinks.
+Ignore file and directory symlinks. Even paths given explicitly on the
+command line will be skipped if they are symlinks.
""")
still be displayed.)
""")
+run_opts.add_argument('--batch', action='store_true', default=False,
+ help="""
+Automatically retry with '--no-resume --no-cache' if the cached state contains
+invalid or expired block signatures.
+""")
+
_group = run_opts.add_mutually_exclusive_group()
_group.add_argument('--resume', action='store_true', default=True,
help="""
""")
_group = upload_opts.add_mutually_exclusive_group()
-_group.add_argument('--trash-at', metavar='YYYY-MM-DD HH:MM', default=None,
+_group.add_argument('--trash-at', metavar='YYYY-MM-DDTHH:MM', default=None,
help="""
Set the trash date of the resulting collection to an absolute date in the future.
The accepted format is defined by the ISO 8601 standard. Examples: 20090103, 2009-01-03, 20090103T181505, 2009-01-03T18:15:05.\n
args.paths = ["-" if x == "/dev/stdin" else x for x in args.paths]
- if len(args.paths) != 1 or os.path.isdir(args.paths[0]):
- if args.filename:
- arg_parser.error("""
+ if args.filename and (len(args.paths) != 1 or os.path.isdir(args.paths[0])):
+ arg_parser.error("""
--filename argument cannot be used when storing a directory or
multiple files.
""")
}
def __init__(self, paths, resume=True, use_cache=True, reporter=None,
- name=None, owner_uuid=None, api_client=None,
+ name=None, owner_uuid=None, api_client=None, batch_mode=False,
ensure_unique_name=False, num_retries=None,
put_threads=None, replication_desired=None, filename=None,
update_time=60.0, update_collection=None, storage_classes=None,
self.paths = paths
self.resume = resume
self.use_cache = use_cache
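+ # With batch_mode on, an invalid resume cache is discarded and the upload starts over instead of raising ResumeCacheInvalidError.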
+ self.batch_mode = batch_mode
self.update = False
self.reporter = reporter
# This will be set to 0 before we start counting, if no special files are going
self._write_stdin(self.filename or 'stdin')
elif not os.path.exists(path):
raise PathDoesNotExistError(u"file or directory '{}' does not exist.".format(path))
+ elif (not self.follow_links) and os.path.islink(path):
+ self.logger.warning("Skipping symlink '{}'".format(path))
+ continue
elif os.path.isdir(path):
# Use absolute paths on cache index so CWD doesn't interfere
# with the caching logic.
else:
# The file already exists in the remote collection, skip it.
pass
- self._remote_collection.save(storage_classes=self.storage_classes,
- num_retries=self.num_retries,
+ self._remote_collection.save(num_retries=self.num_retries,
trash_at=self._collection_trash_at())
else:
- if self.storage_classes is None:
- self.storage_classes = ['default']
+ if len(self._local_collection) == 0:
+ self.logger.warning("No files were uploaded, skipping collection creation.")
+ return
self._local_collection.save_new(
name=self.name, owner_uuid=self.owner_uuid,
- storage_classes=self.storage_classes,
ensure_unique_name=self.ensure_unique_name,
num_retries=self.num_retries,
trash_at=self._collection_trash_at())
self._save_state()
except Exception as e:
self.logger.error("Unexpected error trying to save cache file: {}".format(e))
+ # Keep remote collection's trash_at attribute synced when using relative expire dates
+ if self._remote_collection is not None and type(self._trash_at) == datetime.timedelta:
+ try:
+ self._api_client.collections().update(
+ uuid=self._remote_collection.manifest_locator(),
+ body={'trash_at': self._collection_trash_at().strftime("%Y-%m-%dT%H:%M:%S.%fZ")}
+ ).execute(num_retries=self.num_retries)
+ except Exception as e:
+ self.logger.error("Unexpected error trying to update remote collection's expire date: {}".format(e))
else:
self.bytes_written = self.bytes_skipped
# Call the reporter, if any
def _write_stdin(self, filename):
output = self._local_collection.open(filename, 'wb')
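+ # sys.stdin is a text stream; read its underlying binary buffer since the collection file is opened in binary ('wb') mode.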
- self._write(sys.stdin, output)
+ self._write(sys.stdin.buffer, output)
output.close()
def _check_file(self, source, filename):
update_collection):
try:
self._remote_collection = arvados.collection.Collection(
- update_collection, api_client=self._api_client)
+ update_collection,
+ api_client=self._api_client,
+ storage_classes_desired=self.storage_classes,
+ num_retries=self.num_retries)
except arvados.errors.ApiError as error:
raise CollectionUpdateError("Cannot read collection {} ({})".format(update_collection, error))
else:
# No cache file, set empty state
self._state = copy.deepcopy(self.EMPTY_STATE)
if not self._cached_manifest_valid():
- raise ResumeCacheInvalidError()
+ if not self.batch_mode:
+ raise ResumeCacheInvalidError()
+ else:
+ self.logger.info("Invalid signatures on cache file '{}' while being run in 'batch mode' -- continuing anyways.".format(self._cache_file.name))
+ self.use_cache = False # Don't overwrite preexisting cache file.
+ self._state = copy.deepcopy(self.EMPTY_STATE)
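+ # With the state reset, no blocks from the invalid cache are reused; the upload starts from scratch.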
# Load the previous manifest so we can check if files were modified remotely.
self._local_collection = arvados.collection.Collection(
self._state['manifest'],
replication_desired=self.replication_desired,
+ storage_classes_desired=self.storage_classes,
put_threads=self.put_threads,
- api_client=self._api_client)
+ api_client=self._api_client,
+ num_retries=self.num_retries)
def _cached_manifest_valid(self):
"""
def collection_name(self):
return self._my_collection().api_response()['name'] if self._my_collection().api_response() else None
+ def collection_trash_at(self):
+ return self._my_collection().get_trash_at()
+
def manifest_locator(self):
return self._my_collection().manifest_locator()
# Trash arguments validation
trash_at = None
if args.trash_at is not None:
+ # ciso8601 considers YYYYMM as invalid but YYYY-MM as valid, so here we
+ # make sure the user provides a complete YYYY-MM-DD date.
+ if not re.match(r'^\d{4}(?P<dash>-?)\d{2}?(?P=dash)\d{2}', args.trash_at):
+ logger.error("--trash-at argument format invalid, use --help to see examples.")
+ sys.exit(1)
+ # Check if no time information was provided. In that case, assume end-of-day.
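+ # e.g. '2009-01-03' becomes '2009-01-03T23:59:59'.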
+ if re.match(r'^\d{4}(?P<dash>-?)\d{2}?(?P=dash)\d{2}$', args.trash_at):
+ args.trash_at += 'T23:59:59'
try:
trash_at = ciso8601.parse_datetime(args.trash_at)
except:
else:
if trash_at.tzinfo is not None:
# Timezone aware datetime provided.
- delta = trash_at.utcoffset()
+ utcoffset = -trash_at.utcoffset()
else:
# Timezone naive datetime provided. Assume it is local.
- delta = datetime.datetime.now() - datetime.datetime.utcnow()
+ if time.daylight:
+ utcoffset = datetime.timedelta(seconds=time.altzone)
+ else:
+ utcoffset = datetime.timedelta(seconds=time.timezone)
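+ # time.altzone/time.timezone are seconds west of UTC, so adding the offset to a local naive datetime yields the UTC equivalent.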
# Convert to UTC timezone naive datetime.
- trash_at = trash_at.replace(tzinfo=None) - delta
+ trash_at = trash_at.replace(tzinfo=None) + utcoffset
if trash_at <= datetime.datetime.utcnow():
- logger.error("--trash-at argument should be set in the future")
+ logger.error("--trash-at argument must be set in the future")
sys.exit(1)
if args.trash_after is not None:
if args.trash_after < 1:
- logger.error("--trash-after argument should be >= 1")
+ logger.error("--trash-after argument must be >= 1")
sys.exit(1)
trash_at = datetime.timedelta(seconds=(args.trash_after * 24 * 60 * 60))
# Split storage-classes argument
storage_classes = None
if args.storage_classes:
- storage_classes = args.storage_classes.strip().split(',')
- if len(storage_classes) > 1:
- logger.error("Multiple storage classes are not supported currently.")
- sys.exit(1)
-
+ storage_classes = args.storage_classes.strip().replace(' ', '').split(',')
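+ # e.g. a value like 'foo, bar' yields ['foo', 'bar'].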
# Setup exclude regex from all the --exclude arguments provided
name_patterns = []
writer = ArvPutUploadJob(paths = args.paths,
resume = args.resume,
use_cache = args.use_cache,
+ batch_mode= args.batch,
filename = args.filename,
reporter = reporter,
api_client = api_client,
" or been created with another Arvados user's credentials.",
" Switch user or use one of the following options to restart upload:",
" --no-resume to start a new resume cache.",
- " --no-cache to disable resume cache."]))
+ " --no-cache to disable resume cache.",
+ " --batch to ignore the resume cache if invalid."]))
sys.exit(1)
except (CollectionUpdateError, PathDoesNotExistError) as error:
logger.error("\n".join([
output = None
try:
writer.start(save_collection=not(args.stream or args.raw))
- except arvados.errors.ApiError as error:
+ except (arvados.errors.ApiError, arvados.errors.KeepWriteError) as error:
logger.error("\n".join([
"arv-put: %s" % str(error)]))
sys.exit(1)
output = writer.manifest_text()
elif args.raw:
output = ','.join(writer.data_locators())
- else:
+ elif writer.manifest_locator() is not None:
try:
+ expiration_notice = ""
+ if writer.collection_trash_at() is not None:
+ # Get the local timezone-naive version, and log it with timezone information.
+ if time.daylight:
+ local_trash_at = writer.collection_trash_at().replace(tzinfo=None) - datetime.timedelta(seconds=time.altzone)
+ else:
+ local_trash_at = writer.collection_trash_at().replace(tzinfo=None) - datetime.timedelta(seconds=time.timezone)
+ expiration_notice = ". It will expire on {} {}.".format(
+ local_trash_at.strftime("%Y-%m-%d %H:%M:%S"), time.strftime("%z"))
if args.update_collection:
- logger.info(u"Collection updated: '{}'".format(writer.collection_name()))
+ logger.info(u"Collection updated: '{}'{}".format(
+ writer.collection_name(), expiration_notice))
else:
- logger.info(u"Collection saved as '{}'".format(writer.collection_name()))
+ logger.info(u"Collection saved as '{}'{}".format(
+ writer.collection_name(), expiration_notice))
if args.portable_data_hash:
output = writer.portable_data_hash()
else:
"arv-put: Error creating Collection on project: {}.".format(
error))
status = 1
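+ # writer.manifest_locator() is None when no collection was saved (e.g. nothing was uploaded), so report failure.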
+ else:
+ status = 1
# Print the locator (uuid) of the new collection.
if output is None: