1 # Copyright (C) The Arvados Authors. All rights reserved.
2 #
3 # SPDX-License-Identifier: Apache-2.0
4
5 import argparse
6 import arvados
7 import arvados.collection
8 import base64
9 import ciso8601
10 import copy
11 import datetime
12 import errno
13 import fcntl
14 import fnmatch
15 import hashlib
16 import json
17 import logging
18 import os
19 import pwd
20 import re
21 import signal
22 import socket
23 import sys
24 import tempfile
25 import threading
26 import time
27 import traceback
28
29 from pathlib import Path
30
31 from apiclient import errors as apiclient_errors
32 from arvados._version import __version__
33
34 import arvados.util
35 import arvados.commands._util as arv_cmd
36
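# This module implements the 'arv-put' command: it copies local files or
# directories into Keep and, by default, saves the result as an Arvados
# collection, printing the collection UUID on stdout (for example,
# `arv-put --project-uuid <uuid> ./mydata/`).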
37 api_client = None
38
39 upload_opts = argparse.ArgumentParser(add_help=False)
40
41 upload_opts.add_argument('--version', action='version',
42                          version="%s %s" % (sys.argv[0], __version__),
43                          help='Print version and exit.')
44 upload_opts.add_argument('paths', metavar='path', type=str, nargs='*',
45                          help="""
46 Local file or directory. If path is a directory reference with a trailing
47 slash, then just upload the directory's contents; otherwise upload the
48 directory itself. Default: read from standard input.
49 """)
50
51 _group = upload_opts.add_mutually_exclusive_group()
52
53 _group.add_argument('--max-manifest-depth', type=int, metavar='N',
54                     default=-1, help=argparse.SUPPRESS)
55
56 _group.add_argument('--normalize', action='store_true',
57                     help="""
58 Normalize the manifest by re-ordering files and streams after writing
59 data.
60 """)
61
62 _group.add_argument('--dry-run', action='store_true', default=False,
63                     help="""
64 Don't actually upload files, but only check if any file should be
65 uploaded. Exit with code=2 when files are pending for upload.
66 """)
67
68 _group = upload_opts.add_mutually_exclusive_group()
69
70 _group.add_argument('--as-stream', action='store_true', dest='stream',
71                     help="""
72 Synonym for --stream.
73 """)
74
75 _group.add_argument('--stream', action='store_true',
76                     help="""
77 Store the file content and display the resulting manifest on
78 stdout. Do not save a Collection object in Arvados.
79 """)
80
81 _group.add_argument('--as-manifest', action='store_true', dest='manifest',
82                     help="""
83 Synonym for --manifest.
84 """)
85
86 _group.add_argument('--in-manifest', action='store_true', dest='manifest',
87                     help="""
88 Synonym for --manifest.
89 """)
90
91 _group.add_argument('--manifest', action='store_true',
92                     help="""
93 Store the file data and resulting manifest in Keep, save a Collection
94 object in Arvados, and display the manifest locator (Collection uuid)
95 on stdout. This is the default behavior.
96 """)
97
98 _group.add_argument('--as-raw', action='store_true', dest='raw',
99                     help="""
100 Synonym for --raw.
101 """)
102
103 _group.add_argument('--raw', action='store_true',
104                     help="""
105 Store the file content and display the data block locators on stdout,
106 separated by commas, with a trailing newline. Do not store a
107 manifest.
108 """)
109
110 upload_opts.add_argument('--update-collection', type=str, default=None,
111                          dest='update_collection', metavar="UUID", help="""
112 Update an existing collection identified by the given Arvados collection
113 UUID. All new local files will be uploaded.
114 """)
115
116 upload_opts.add_argument('--use-filename', type=str, default=None,
117                          dest='filename', help="""
118 Synonym for --filename.
119 """)
120
121 upload_opts.add_argument('--filename', type=str, default=None,
122                          help="""
123 Use the given filename in the manifest, instead of the name of the
124 local file. This is useful when "-" or "/dev/stdin" is given as an
125 input file. It can be used only if there is exactly one path given and
126 it is not a directory. Implies --manifest.
127 """)
128
129 upload_opts.add_argument('--portable-data-hash', action='store_true',
130                          help="""
131 Print the portable data hash instead of the Arvados UUID for the collection
132 created by the upload.
133 """)
134
135 upload_opts.add_argument('--replication', type=int, metavar='N', default=None,
136                          help="""
137 Set the replication level for the new collection: how many different
138 physical storage devices (e.g., disks) should have a copy of each data
139 block. Default is to use the server-provided default (if any) or 2.
140 """)
141
142 upload_opts.add_argument('--storage-classes', help="""
143 Specify a comma-separated list of storage classes to be used when saving data to Keep.
144 """)
145
146 upload_opts.add_argument('--threads', type=int, metavar='N', default=None,
147                          help="""
148 Set the number of upload threads to be used. Take into account that
149 using lots of threads will increase the RAM requirements. Default is
150 to use 2 threads.
151 On high latency installations, using a greater number will improve
152 overall throughput.
153 """)
154
155 upload_opts.add_argument('--exclude', metavar='PATTERN', default=[],
156                       action='append', help="""
157 Exclude files and directories whose names match the given glob pattern. When
158 using a path-like pattern like 'subdir/*.txt', all text files inside the 'subdir'
159 directory (relative to the provided input directories) will be excluded.
160 When using a filename pattern like '*.txt', any text file will be excluded
161 no matter where it is placed.
162 For the special case of needing to exclude only files or dirs directly below
163 the given input directory, you can use a pattern like './exclude_this.gif'.
164 You can specify multiple patterns by using this argument more than once.
165 """)
166
167 _group = upload_opts.add_mutually_exclusive_group()
168 _group.add_argument('--follow-links', action='store_true', default=True,
169                     dest='follow_links', help="""
170 Follow file and directory symlinks (default).
171 """)
172 _group.add_argument('--no-follow-links', action='store_false', dest='follow_links',
173                     help="""
174 Ignore file and directory symlinks. Even paths given explicitly on the
175 command line will be skipped if they are symlinks.
176 """)
177
178
179 run_opts = argparse.ArgumentParser(add_help=False)
180
181 run_opts.add_argument('--project-uuid', metavar='UUID', help="""
182 Store the collection in the specified project, instead of your Home
183 project.
184 """)
185
186 run_opts.add_argument('--name', help="""
187 Save the collection with the specified name.
188 """)
189
190 _group = run_opts.add_mutually_exclusive_group()
191 _group.add_argument('--progress', action='store_true',
192                     help="""
193 Display human-readable progress on stderr (bytes and, if possible,
194 percentage of total data size). This is the default behavior when
195 stderr is a tty.
196 """)
197
198 _group.add_argument('--no-progress', action='store_true',
199                     help="""
200 Do not display human-readable progress on stderr, even if stderr is a
201 tty.
202 """)
203
204 _group.add_argument('--batch-progress', action='store_true',
205                     help="""
206 Display machine-readable progress on stderr (bytes and, if known,
207 total data size).
208 """)
209
210 run_opts.add_argument('--silent', action='store_true',
211                       help="""
212 Do not print any debug messages to console. (Any error messages will
213 still be displayed.)
214 """)
215
216 run_opts.add_argument('--batch', action='store_true', default=False,
217                       help="""
218 Retry with '--no-resume --no-cache' if the cached state contains invalid or
219 expired block signatures.
220 """)
221
222 _group = run_opts.add_mutually_exclusive_group()
223 _group.add_argument('--resume', action='store_true', default=True,
224                     help="""
225 Continue interrupted uploads from cached state (default).
226 """)
227 _group.add_argument('--no-resume', action='store_false', dest='resume',
228                     help="""
229 Do not continue interrupted uploads from cached state.
230 """)
231
232 _group = run_opts.add_mutually_exclusive_group()
233 _group.add_argument('--cache', action='store_true', dest='use_cache', default=True,
234                     help="""
235 Save upload state in a cache file for resuming (default).
236 """)
237 _group.add_argument('--no-cache', action='store_false', dest='use_cache',
238                     help="""
239 Do not save upload state in a cache file for resuming.
240 """)
241
242 _group = upload_opts.add_mutually_exclusive_group()
243 _group.add_argument('--trash-at', metavar='YYYY-MM-DDTHH:MM', default=None,
244                     help="""
245 Set the trash date of the resulting collection to an absolute date in the future.
246 The accepted format is defined by the ISO 8601 standard. Examples: 20090103, 2009-01-03, 20090103T181505, 2009-01-03T18:15:05.\n
247 Timezone information can be added. If not, the provided date/time is assumed to be in the local system's timezone.
248 """)
249 _group.add_argument('--trash-after', type=int, metavar='DAYS', default=None,
250                     help="""
251 Set the trash date of the resulting collection to a number of days after the
252 date/time that the upload process finishes.
253 """)
254
255 arg_parser = argparse.ArgumentParser(
256     description='Copy data from the local filesystem to Keep.',
257     parents=[upload_opts, run_opts, arv_cmd.retry_opt])
258
259 def parse_arguments(arguments):
260     args = arg_parser.parse_args(arguments)
261
262     if len(args.paths) == 0:
263         args.paths = ['-']
264
265     args.paths = ["-" if x == "/dev/stdin" else x for x in args.paths]
266
267     if args.filename and (len(args.paths) != 1 or os.path.isdir(args.paths[0])):
268         arg_parser.error("""
269     --filename argument cannot be used when storing a directory or
270     multiple files.
271     """)
272
273     # Turn on --progress by default if stderr is a tty.
274     if (not (args.batch_progress or args.no_progress or args.silent)
275         and os.isatty(sys.stderr.fileno())):
276         args.progress = True
277
278     # Turn off --resume (default) if --no-cache is used.
279     if not args.use_cache:
280         args.resume = False
281
282     if args.paths == ['-']:
283         if args.update_collection:
284             arg_parser.error("""
285     --update-collection cannot be used when reading from stdin.
286     """)
287         args.resume = False
288         args.use_cache = False
289         if not args.filename:
290             args.filename = 'stdin'
291
292     # Remove possible duplicated patterns
293     if len(args.exclude) > 0:
294         args.exclude = list(set(args.exclude))
295
296     return args
297
298
299 class PathDoesNotExistError(Exception):
300     pass
301
302
303 class CollectionUpdateError(Exception):
304     pass
305
306
307 class ResumeCacheConflict(Exception):
308     pass
309
310
311 class ResumeCacheInvalidError(Exception):
312     pass
313
314 class ArvPutArgumentConflict(Exception):
315     pass
316
317
318 class ArvPutUploadIsPending(Exception):
319     pass
320
321
322 class ArvPutUploadNotPending(Exception):
323     pass
324
325
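# FileUploadList raises ArvPutUploadIsPending as soon as a file is queued
# while in dry-run mode, letting --dry-run exit early with status 2.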
326 class FileUploadList(list):
327     def __init__(self, dry_run=False):
328         list.__init__(self)
329         self.dry_run = dry_run
330
331     def append(self, other):
332         if self.dry_run:
333             raise ArvPutUploadIsPending()
334         super(FileUploadList, self).append(other)
335
336
337 # Appends the X-Request-Id to the log message when log level is ERROR or DEBUG
338 class ArvPutLogFormatter(logging.Formatter):
339     std_fmtr = logging.Formatter(arvados.log_format, arvados.log_date_format)
340     err_fmtr = None
341     request_id_informed = False
342
343     def __init__(self, request_id):
344         self.err_fmtr = logging.Formatter(
345             arvados.log_format+' (X-Request-Id: {})'.format(request_id),
346             arvados.log_date_format)
347
348     def format(self, record):
349         if (not self.request_id_informed) and (record.levelno in (logging.DEBUG, logging.ERROR)):
350             self.request_id_informed = True
351             return self.err_fmtr.format(record)
352         return self.std_fmtr.format(record)
353
354
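# ResumeCache keeps upload state in a flock()-protected JSON file so an
# interrupted arv-put run can be resumed.  The cache file name is derived
# from an MD5 digest of the API host and the requested input paths (see
# make_path() below).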
355 class ResumeCache(object):
356     CACHE_DIR = 'arv-put'
357
358     def __init__(self, file_spec):
359         self.cache_file = open(file_spec, 'a+')
360         self._lock_file(self.cache_file)
361         self.filename = self.cache_file.name
362
363     @classmethod
364     def make_path(cls, args):
365         md5 = hashlib.md5()
366         md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost').encode())
367         realpaths = sorted(os.path.realpath(path) for path in args.paths)
368         md5.update(b'\0'.join([p.encode() for p in realpaths]))
369         if any(os.path.isdir(path) for path in realpaths):
370             md5.update(b'-1')
371         elif args.filename:
372             md5.update(args.filename.encode())
373         cache_path = Path(cls.CACHE_DIR)
374         if len(cache_path.parts) == 1:
375             cache_path = arvados.util._BaseDirectories('CACHE').storage_path(cache_path)
376         else:
377             # Note this is a noop if cache_path is absolute, which is what we want.
378             cache_path = Path.home() / cache_path
379             cache_path.mkdir(parents=True, exist_ok=True, mode=0o700)
380         return str(cache_path / md5.hexdigest())
381
382     def _lock_file(self, fileobj):
383         try:
384             fcntl.flock(fileobj, fcntl.LOCK_EX | fcntl.LOCK_NB)
385         except IOError:
386             raise ResumeCacheConflict(u"{} locked".format(fileobj.name))
387
388     def load(self):
389         self.cache_file.seek(0)
390         return json.load(self.cache_file)
391
392     def check_cache(self, api_client=None, num_retries=0):
393         try:
394             state = self.load()
395             locator = None
396             try:
397                 if "_finished_streams" in state and len(state["_finished_streams"]) > 0:
398                     locator = state["_finished_streams"][0][1][0]
399                 elif "_current_stream_locators" in state and len(state["_current_stream_locators"]) > 0:
400                     locator = state["_current_stream_locators"][0]
401                 if locator is not None:
402                     kc = arvados.keep.KeepClient(api_client=api_client)
403                     kc.head(locator, num_retries=num_retries)
404             except Exception as e:
405                 self.restart()
406         except (ValueError):
407             pass
408
409     def save(self, data):
410         try:
411             new_cache_fd, new_cache_name = tempfile.mkstemp(
412                 dir=os.path.dirname(self.filename))
413             self._lock_file(new_cache_fd)
414             new_cache = os.fdopen(new_cache_fd, 'r+')
415             json.dump(data, new_cache)
416             os.rename(new_cache_name, self.filename)
417         except (IOError, OSError, ResumeCacheConflict):
418             try:
419                 os.unlink(new_cache_name)
420             except NameError:  # mkstemp failed.
421                 pass
422         else:
423             self.cache_file.close()
424             self.cache_file = new_cache
425
426     def close(self):
427         self.cache_file.close()
428
429     def destroy(self):
430         try:
431             os.unlink(self.filename)
432         except OSError as error:
433             if error.errno != errno.ENOENT:  # That's what we wanted anyway.
434                 raise
435         self.close()
436
437     def restart(self):
438         self.destroy()
439         self.__init__(self.filename)
440
441
442 class ArvPutUploadJob(object):
443     CACHE_DIR = 'arv-put'
444     EMPTY_STATE = {
445         'manifest' : None, # Last saved manifest checkpoint
446         'files' : {} # Previous run file list: {path : {size, mtime}}
447     }
448
449     def __init__(self, paths, resume=True, use_cache=True, reporter=None,
450                  name=None, owner_uuid=None, api_client=None, batch_mode=False,
451                  ensure_unique_name=False, num_retries=None,
452                  put_threads=None, replication_desired=None, filename=None,
453                  update_time=60.0, update_collection=None, storage_classes=None,
454                  logger=logging.getLogger('arvados.arv_put'), dry_run=False,
455                  follow_links=True, exclude_paths=[], exclude_names=None,
456                  trash_at=None):
457         self.paths = paths
458         self.resume = resume
459         self.use_cache = use_cache
460         self.batch_mode = batch_mode
461         self.update = False
462         self.reporter = reporter
463         # This will be set to 0 before counting starts, if no special files are
464         # going to be read.
465         self.bytes_expected = None
466         self.bytes_written = 0
467         self.bytes_skipped = 0
468         self.name = name
469         self.owner_uuid = owner_uuid
470         self.ensure_unique_name = ensure_unique_name
471         self.num_retries = num_retries
472         self.replication_desired = replication_desired
473         self.put_threads = put_threads
474         self.filename = filename
475         self.storage_classes = storage_classes
476         self._api_client = api_client
477         self._state_lock = threading.Lock()
478         self._state = None # Previous run state (file list & manifest)
479         self._current_files = [] # Current run file list
480         self._cache_file = None
481         self._collection_lock = threading.Lock()
482         self._remote_collection = None # Collection being updated (if asked)
483         self._local_collection = None # Collection from previous run manifest
484         self._file_paths = set() # Files to be updated in remote collection
485         self._stop_checkpointer = threading.Event()
486         self._checkpointer = threading.Thread(target=self._update_task)
487         self._checkpointer.daemon = True
488         self._update_task_time = update_time  # How many seconds to wait between update runs
489         self._files_to_upload = FileUploadList(dry_run=dry_run)
490         self._upload_started = False
491         self.logger = logger
492         self.dry_run = dry_run
493         self._checkpoint_before_quit = True
494         self.follow_links = follow_links
495         self.exclude_paths = exclude_paths
496         self.exclude_names = exclude_names
497         self._trash_at = trash_at
498
499         if self._trash_at is not None:
500             if type(self._trash_at) not in [datetime.datetime, datetime.timedelta]:
501                 raise TypeError('trash_at should be None, timezone-naive datetime or timedelta')
502             if type(self._trash_at) == datetime.datetime and self._trash_at.tzinfo is not None:
503                 raise TypeError('provided trash_at datetime should be timezone-naive')
504
505         if not self.use_cache and self.resume:
506             raise ArvPutArgumentConflict('resume cannot be True when use_cache is False')
507
508         # Check for obvious dry-run responses
509         if self.dry_run and (not self.use_cache or not self.resume):
510             raise ArvPutUploadIsPending()
511
512         # Load cached data if any and if needed
513         self._setup_state(update_collection)
514
515         # Build the upload file list, excluding requested files and counting the
516         # bytes expected to be uploaded.
517         self._build_upload_list()
518
519     def _build_upload_list(self):
520         """
521         Scan the requested paths to count file sizes, excluding requested files
522         and dirs and building the upload file list.
523         """
524         # If there aren't any special files to be read, reset the total bytes count
525         # to zero to start counting.
526         if not any([p for p in self.paths
527                     if not (os.path.isfile(p) or os.path.isdir(p))]):
528             self.bytes_expected = 0
529
530         for path in self.paths:
531             # Test for stdin first, in case a file named '-' exists
532             if path == '-':
533                 if self.dry_run:
534                     raise ArvPutUploadIsPending()
535                 self._write_stdin(self.filename or 'stdin')
536             elif not os.path.exists(path):
537                  raise PathDoesNotExistError(u"file or directory '{}' does not exist.".format(path))
538             elif (not self.follow_links) and os.path.islink(path):
539                 self.logger.warning("Skipping symlink '{}'".format(path))
540                 continue
541             elif os.path.isdir(path):
542                 # Use absolute paths on cache index so CWD doesn't interfere
543                 # with the caching logic.
544                 orig_path = path
545                 path = os.path.abspath(path)
546                 if orig_path[-1:] == os.sep:
547                     # When passing a directory reference with a trailing slash,
548                     # its contents should be uploaded directly to the
549                     # collection's root.
550                     prefixdir = path
551                 else:
552                     # When passing a directory reference with no trailing slash,
553                     # upload the directory to the collection's root.
554                     prefixdir = os.path.dirname(path)
555                 prefixdir += os.sep
556                 for root, dirs, files in os.walk(path,
557                                                  followlinks=self.follow_links):
558                     root_relpath = os.path.relpath(root, path)
559                     if root_relpath == '.':
560                         root_relpath = ''
561                     # Exclude files/dirs by full path matching pattern
562                     if self.exclude_paths:
563                         dirs[:] = [d for d in dirs
564                                    if not any(pathname_match(
565                                            os.path.join(root_relpath, d), pat)
566                                               for pat in self.exclude_paths)]
567                         files = [f for f in files
568                                  if not any(pathname_match(
569                                          os.path.join(root_relpath, f), pat)
570                                             for pat in self.exclude_paths)]
571                     # Exclude files/dirs by name matching pattern
572                     if self.exclude_names is not None:
573                         dirs[:] = [d for d in dirs
574                                    if not self.exclude_names.match(d)]
575                         files = [f for f in files
576                                  if not self.exclude_names.match(f)]
577                     # Make os.walk()'s directory traversal order deterministic
578                     dirs.sort()
579                     files.sort()
580                     for f in files:
581                         filepath = os.path.join(root, f)
582                         if not os.path.isfile(filepath):
583                             self.logger.warning("Skipping non-regular file '{}'".format(filepath))
584                             continue
585                         # Add its size to the total bytes count (if applicable)
586                         if self.follow_links or (not os.path.islink(filepath)):
587                             if self.bytes_expected is not None:
588                                 self.bytes_expected += os.path.getsize(filepath)
589                         self._check_file(filepath,
590                                          os.path.join(root[len(prefixdir):], f))
591             else:
592                 filepath = os.path.abspath(path)
593                 # Add its size to the total bytes count (if applicable)
594                 if self.follow_links or (not os.path.islink(filepath)):
595                     if self.bytes_expected is not None:
596                         self.bytes_expected += os.path.getsize(filepath)
597                 self._check_file(filepath,
598                                  self.filename or os.path.basename(path))
599         # If dry-run mode is on and we got to this point, notify that there
600         # aren't any files to upload.
601         if self.dry_run:
602             raise ArvPutUploadNotPending()
603         # Remove local_collection's files that don't exist locally anymore, so the
604         # bytes_written count is correct.
605         for f in self.collection_file_paths(self._local_collection,
606                                             path_prefix=""):
607             if f != 'stdin' and f != self.filename and not f in self._file_paths:
608                 self._local_collection.remove(f)
609
610     def start(self, save_collection):
611         """
612         Start supporting thread & file uploading
613         """
614         self._checkpointer.start()
615         try:
616             # Update bytes_written from current local collection and
617             # report initial progress.
618             self._update()
619             # Actual file upload
620             self._upload_started = True # Used by the update thread to start checkpointing
621             self._upload_files()
622         except (SystemExit, Exception) as e:
623             self._checkpoint_before_quit = False
624             # Log stack trace only when Ctrl-C isn't pressed (SIGINT)
625             # Note: We're expecting SystemExit instead of
626             # KeyboardInterrupt because we have a custom signal
627             # handler in place that raises SystemExit with the caught
628             # signal's code.
629             if isinstance(e, PathDoesNotExistError):
630                 # We aren't interested in the traceback for this case
631                 pass
632             elif not isinstance(e, SystemExit) or e.code != -2:
633                 self.logger.warning("Abnormal termination:\n{}".format(
634                     traceback.format_exc()))
635             raise
636         finally:
637             if not self.dry_run:
638                 # Stop the thread before doing anything else
639                 self._stop_checkpointer.set()
640                 self._checkpointer.join()
641                 if self._checkpoint_before_quit:
642                     # Commit all pending blocks & one last _update()
643                     self._local_collection.manifest_text()
644                     self._update(final=True)
645                     if save_collection:
646                         self.save_collection()
647             if self.use_cache:
648                 self._cache_file.close()
649
650     def _collection_trash_at(self):
651         """
652         Returns the trash date that the collection should use at save time.
653         Takes into account absolute/relative trash_at values requested
654         by the user.
655         """
656         if type(self._trash_at) == datetime.timedelta:
657             # Get an absolute datetime for trash_at
658             return datetime.datetime.utcnow() + self._trash_at
659         return self._trash_at
660
661     def save_collection(self):
662         if self.update:
663             # Check if files should be updated on the remote collection.
664             for fp in self._file_paths:
665                 remote_file = self._remote_collection.find(fp)
666                 if not remote_file:
667                     # File doesn't exist in the remote collection, copy it.
668                     self._remote_collection.copy(fp, fp, self._local_collection)
669                 elif remote_file != self._local_collection.find(fp):
670                     # A different file exists in the remote collection, overwrite it.
671                     self._remote_collection.copy(fp, fp, self._local_collection, overwrite=True)
672                 else:
673                     # The file already exists in the remote collection, skip it.
674                     pass
675             self._remote_collection.save(num_retries=self.num_retries,
676                                          trash_at=self._collection_trash_at())
677         else:
678             if len(self._local_collection) == 0:
679                 self.logger.warning("No files were uploaded, skipping collection creation.")
680                 return
681             self._local_collection.save_new(
682                 name=self.name, owner_uuid=self.owner_uuid,
683                 ensure_unique_name=self.ensure_unique_name,
684                 num_retries=self.num_retries,
685                 trash_at=self._collection_trash_at())
686
687     def destroy_cache(self):
688         if self.use_cache:
689             try:
690                 os.unlink(self._cache_filename)
691             except OSError as error:
692                 # That's what we wanted anyway.
693                 if error.errno != errno.ENOENT:
694                     raise
695             self._cache_file.close()
696
697     def _collection_size(self, collection):
698         """
699         Recursively get the total size of the collection
700         """
701         size = 0
702         for item in collection.values():
703             if isinstance(item, arvados.collection.Collection) or isinstance(item, arvados.collection.Subcollection):
704                 size += self._collection_size(item)
705             else:
706                 size += item.size()
707         return size
708
709     def _update_task(self):
710         """
711         Periodically called support task. File uploading is
712         asynchronous so we poll status from the collection.
713         """
714         while not self._stop_checkpointer.wait(1 if not self._upload_started else self._update_task_time):
715             self._update()
716
717     def _update(self, final=False):
718         """
719         Update cached manifest text and report progress.
720         """
721         if self._upload_started:
722             with self._collection_lock:
723                 self.bytes_written = self._collection_size(self._local_collection)
724                 if self.use_cache:
725                     if final:
726                         manifest = self._local_collection.manifest_text()
727                     else:
728                         # Get the manifest text without committing pending blocks
729                         manifest = self._local_collection.manifest_text(strip=False,
730                                                                         normalize=False,
731                                                                         only_committed=True)
732                     # Update cache
733                     with self._state_lock:
734                         self._state['manifest'] = manifest
735             if self.use_cache:
736                 try:
737                     self._save_state()
738                 except Exception as e:
739                     self.logger.error("Unexpected error trying to save cache file: {}".format(e))
740             # Keep remote collection's trash_at attribute synced when using relative expire dates
741             if self._remote_collection is not None and type(self._trash_at) == datetime.timedelta:
742                 try:
743                     self._api_client.collections().update(
744                         uuid=self._remote_collection.manifest_locator(),
745                         body={'trash_at': self._collection_trash_at().strftime("%Y-%m-%dT%H:%M:%S.%fZ")}
746                     ).execute(num_retries=self.num_retries)
747                 except Exception as e:
748                     self.logger.error("Unexpected error trying to update remote collection's expire date: {}".format(e))
749         else:
750             self.bytes_written = self.bytes_skipped
751         # Call the reporter, if any
752         self.report_progress()
753
754     def report_progress(self):
755         if self.reporter is not None:
756             self.reporter(self.bytes_written, self.bytes_expected)
757
758     def _write_stdin(self, filename):
759         output = self._local_collection.open(filename, 'wb')
760         self._write(sys.stdin.buffer, output)
761         output.close()
762
763     def _check_file(self, source, filename):
764         """
765         Check if this file needs to be uploaded
766         """
767         # Ignore symlinks when requested
768         if (not self.follow_links) and os.path.islink(source):
769             return
770         resume_offset = 0
771         should_upload = False
772         new_file_in_cache = False
773         # Record file path for updating the remote collection before exiting
774         self._file_paths.add(filename)
775
776         with self._state_lock:
777             # If there's no previous cached data for this file, store it for a
778             # possible repeated run.
779             if source not in self._state['files']:
780                 self._state['files'][source] = {
781                     'mtime': os.path.getmtime(source),
782                     'size' : os.path.getsize(source)
783                 }
784                 new_file_in_cache = True
785             cached_file_data = self._state['files'][source]
786
787         # Check if file was already uploaded (at least partially)
788         file_in_local_collection = self._local_collection.find(filename)
789
790         # If not resuming, upload the full file.
791         if not self.resume:
792             should_upload = True
793         # New file detected from last run, upload it.
794         elif new_file_in_cache:
795             should_upload = True
796         # Local file didn't change from last run.
797         elif cached_file_data['mtime'] == os.path.getmtime(source) and cached_file_data['size'] == os.path.getsize(source):
798             if not file_in_local_collection:
799                 # File not uploaded yet, upload it completely
800                 should_upload = True
801             elif file_in_local_collection.permission_expired():
802                 # Permission token expired, re-upload file. This will change whenever
803                 # we have an API for refreshing tokens.
804                 self.logger.warning(u"Uploaded file '{}' access token expired, will re-upload it from scratch".format(filename))
805                 should_upload = True
806                 self._local_collection.remove(filename)
807             elif cached_file_data['size'] == file_in_local_collection.size():
808                 # File already there, skip it.
809                 self.bytes_skipped += cached_file_data['size']
810             elif cached_file_data['size'] > file_in_local_collection.size():
811                 # File partially uploaded, resume!
812                 resume_offset = file_in_local_collection.size()
813                 self.bytes_skipped += resume_offset
814                 should_upload = True
815             else:
816                 # Inconsistent cache, re-upload the file
817                 should_upload = True
818                 self._local_collection.remove(filename)
819                 self.logger.warning(u"Uploaded version of file '{}' is bigger than local version, will re-upload it from scratch.".format(source))
820         # Local file differs from cached data, re-upload it.
821         else:
822             if file_in_local_collection:
823                 self._local_collection.remove(filename)
824             should_upload = True
825
826         if should_upload:
827             try:
828                 self._files_to_upload.append((source, resume_offset, filename))
829             except ArvPutUploadIsPending:
830                 # This could happen when running in dry-run mode; close the cache file
831                 # to avoid locking issues.
832                 self._cache_file.close()
833                 raise
834
835     def _upload_files(self):
836         for source, resume_offset, filename in self._files_to_upload:
837             with open(source, 'rb') as source_fd:
838                 with self._state_lock:
839                     self._state['files'][source]['mtime'] = os.path.getmtime(source)
840                     self._state['files'][source]['size'] = os.path.getsize(source)
841                 if resume_offset > 0:
842                     # Start upload where we left off
843                     output = self._local_collection.open(filename, 'ab')
844                     source_fd.seek(resume_offset)
845                 else:
846                     # Start from scratch
847                     output = self._local_collection.open(filename, 'wb')
848                 self._write(source_fd, output)
849                 output.close(flush=False)
850
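    # _write() copies everything from source_fd into the collection file,
    # reading in arvados.config.KEEP_BLOCK_SIZE chunks to match the Keep
    # block size.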
851     def _write(self, source_fd, output):
852         while True:
853             data = source_fd.read(arvados.config.KEEP_BLOCK_SIZE)
854             if not data:
855                 break
856             output.write(data)
857
858     def _my_collection(self):
859         return self._remote_collection if self.update else self._local_collection
860
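    # The upload cache file name is an MD5 digest of the API host, the
    # resolved input paths and (when given) the --filename value, stored under
    # the per-user cache directory resolved by arvados.util._BaseDirectories.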
861     def _get_cache_filepath(self):
862         # Set up cache file name from input paths.
863         md5 = hashlib.md5()
864         md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost').encode())
865         realpaths = sorted(os.path.realpath(path) for path in self.paths)
866         md5.update(b'\0'.join([p.encode() for p in realpaths]))
867         if self.filename:
868             md5.update(self.filename.encode())
869         cache_path = Path(self.CACHE_DIR)
870         if len(cache_path.parts) == 1:
871             cache_path = arvados.util._BaseDirectories('CACHE').storage_path(cache_path)
872         else:
873             # Note this is a noop if cache_path is absolute, which is what we want.
874             cache_path = Path.home() / cache_path
875             cache_path.mkdir(parents=True, exist_ok=True, mode=0o700)
876         return str(cache_path / md5.hexdigest())
877
878     def _setup_state(self, update_collection):
879         """
880         Create a new cache file or load a previously existing one.
881         """
882         # Load an already existing collection for update
883         if update_collection and re.match(arvados.util.collection_uuid_pattern,
884                                           update_collection):
885             try:
886                 self._remote_collection = arvados.collection.Collection(
887                     update_collection,
888                     api_client=self._api_client,
889                     storage_classes_desired=self.storage_classes,
890                     num_retries=self.num_retries)
891             except arvados.errors.ApiError as error:
892                 raise CollectionUpdateError("Cannot read collection {} ({})".format(update_collection, error))
893             else:
894                 self.update = True
895         elif update_collection:
896             # Collection locator provided, but unknown format
897             raise CollectionUpdateError("Collection locator unknown: '{}'".format(update_collection))
898
899         if self.use_cache:
900             cache_filepath = self._get_cache_filepath()
901             if self.resume and os.path.exists(cache_filepath):
902                 self.logger.info(u"Resuming upload from cache file {}".format(cache_filepath))
903                 self._cache_file = open(cache_filepath, 'a+')
904             else:
905                 # --no-resume means start with an empty cache file.
906                 self.logger.info(u"Creating new cache file at {}".format(cache_filepath))
907                 self._cache_file = open(cache_filepath, 'w+')
908             self._cache_filename = self._cache_file.name
909             self._lock_file(self._cache_file)
910             self._cache_file.seek(0)
911
912         with self._state_lock:
913             if self.use_cache:
914                 try:
915                     self._state = json.load(self._cache_file)
916                     if not set(['manifest', 'files']).issubset(set(self._state.keys())):
917                         # Cache is at least partially incomplete, set up a new cache
918                         self._state = copy.deepcopy(self.EMPTY_STATE)
919                 except ValueError:
920                     # Cache file empty, set up new cache
921                     self._state = copy.deepcopy(self.EMPTY_STATE)
922             else:
923                 self.logger.info("No cache usage requested for this run.")
924                 # No cache file, set empty state
925                 self._state = copy.deepcopy(self.EMPTY_STATE)
926             if not self._cached_manifest_valid():
927                 if not self.batch_mode:
928                     raise ResumeCacheInvalidError()
929                 else:
930                     self.logger.info("Invalid signatures on cache file '{}' while being run in 'batch mode' -- continuing anyways.".format(self._cache_file.name))
931                     self.use_cache = False # Don't overwrite preexisting cache file.
932                     self._state = copy.deepcopy(self.EMPTY_STATE)
933             # Load the previous manifest so we can check if files were modified remotely.
934             self._local_collection = arvados.collection.Collection(
935                 self._state['manifest'],
936                 replication_desired=self.replication_desired,
937                 storage_classes_desired=self.storage_classes,
938                 put_threads=self.put_threads,
939                 api_client=self._api_client,
940                 num_retries=self.num_retries)
941
942     def _cached_manifest_valid(self):
943         """
944         Validate the oldest non-expired block signature to check whether the cached
945         manifest is usable, i.e. that it was not created with a different
946         Arvados account.
947         """
948         if self._state.get('manifest', None) is None:
949             # No cached manifest yet, all good.
950             return True
951         now = datetime.datetime.utcnow()
952         oldest_exp = None
953         oldest_loc = None
954         block_found = False
955         for m in arvados.util.keep_locator_pattern.finditer(self._state['manifest']):
956             loc = m.group(0)
957             try:
958                 exp = datetime.datetime.utcfromtimestamp(int(loc.split('@')[1], 16))
959             except IndexError:
960                 # Locator without signature
961                 continue
962             block_found = True
963             if exp > now and (oldest_exp is None or exp < oldest_exp):
964                 oldest_exp = exp
965                 oldest_loc = loc
966         if not block_found:
967             # No block signatures found => no invalid block signatures.
968             return True
969         if oldest_loc is None:
970             # Locator signatures found, but all have expired.
971             # Reset the cache and move on.
972             self.logger.info('Cache expired, starting from scratch.')
973             self._state['manifest'] = ''
974             return True
975         kc = arvados.KeepClient(api_client=self._api_client,
976                                 num_retries=self.num_retries)
977         try:
978             kc.head(oldest_loc)
979         except arvados.errors.KeepRequestError:
980             # Something is wrong, cached manifest is not valid.
981             return False
982         return True
983
984     def collection_file_paths(self, col, path_prefix='.'):
985         """Return a list of file paths by recursively go through the entire collection `col`"""
986         file_paths = []
987         for name, item in col.items():
988             if isinstance(item, arvados.arvfile.ArvadosFile):
989                 file_paths.append(os.path.join(path_prefix, name))
990             elif isinstance(item, arvados.collection.Subcollection):
991                 new_prefix = os.path.join(path_prefix, name)
992                 file_paths += self.collection_file_paths(item, path_prefix=new_prefix)
993         return file_paths
994
995     def _lock_file(self, fileobj):
996         try:
997             fcntl.flock(fileobj, fcntl.LOCK_EX | fcntl.LOCK_NB)
998         except IOError:
999             raise ResumeCacheConflict(u"{} locked".format(fileobj.name))
1000
1001     def _save_state(self):
1002         """
1003         Atomically save current state into cache.
1004         """
1005         with self._state_lock:
1006             # We're not using copy.deepcopy() here because it's a lot slower
1007             # than json.dumps(), and we already need the JSON format to be
1008             # saved on disk.
1009             state = json.dumps(self._state)
1010         try:
1011             new_cache = tempfile.NamedTemporaryFile(
1012                 mode='w+',
1013                 dir=os.path.dirname(self._cache_filename), delete=False)
1014             self._lock_file(new_cache)
1015             new_cache.write(state)
1016             new_cache.flush()
1017             os.fsync(new_cache)
1018             os.rename(new_cache.name, self._cache_filename)
1019         except (IOError, OSError, ResumeCacheConflict) as error:
1020             self.logger.error("There was a problem while saving the cache file: {}".format(error))
1021             try:
1022                 os.unlink(new_cache.name)
1023             except NameError:  # NamedTemporaryFile() failed.
1024                 pass
1025         else:
1026             self._cache_file.close()
1027             self._cache_file = new_cache
1028
1029     def collection_name(self):
1030         return self._my_collection().api_response()['name'] if self._my_collection().api_response() else None
1031
1032     def collection_trash_at(self):
1033         return self._my_collection().get_trash_at()
1034
1035     def manifest_locator(self):
1036         return self._my_collection().manifest_locator()
1037
1038     def portable_data_hash(self):
1039         pdh = self._my_collection().portable_data_hash()
1040         m = self._my_collection().stripped_manifest().encode()
1041         local_pdh = '{}+{}'.format(hashlib.md5(m).hexdigest(), len(m))
1042         if pdh != local_pdh:
1043             self.logger.warning("\n".join([
1044                 "arv-put: API server provided PDH differs from local manifest.",
1045                 "         This should not happen; showing API server version."]))
1046         return pdh
1047
1048     def manifest_text(self, stream_name=".", strip=False, normalize=False):
1049         return self._my_collection().manifest_text(stream_name, strip, normalize)
1050
1051     def _datablocks_on_item(self, item):
1052         """
1053         Return a list of datablock locators, recursively navigating
1054         through subcollections
1055         """
1056         if isinstance(item, arvados.arvfile.ArvadosFile):
1057             if item.size() == 0:
1058                 # Empty file locator
1059                 return ["d41d8cd98f00b204e9800998ecf8427e+0"]
1060             else:
1061                 locators = []
1062                 for segment in item.segments():
1063                     loc = segment.locator
1064                     locators.append(loc)
1065                 return locators
1066         elif isinstance(item, arvados.collection.Collection):
1067             l = [self._datablocks_on_item(x) for x in item.values()]
1068             # Fast list flattener method taken from:
1069             # http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
1070             return [loc for sublist in l for loc in sublist]
1071         else:
1072             return None
1073
1074     def data_locators(self):
1075         with self._collection_lock:
1076             # Make sure all datablocks are flushed before getting the locators
1077             self._my_collection().manifest_text()
1078             datablocks = self._datablocks_on_item(self._my_collection())
1079         return datablocks
1080
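# _machine_format produces --batch-progress lines such as
# "arv-put 1234: 1048576 written 2097152 total" (the total is -1 when the
# expected size is unknown); machine_progress() below fills in the numbers.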
1081 _machine_format = "{} {}: {{}} written {{}} total\n".format(sys.argv[0],
1082                                                             os.getpid())
1083
1084 # Simulate glob.glob() matching behavior without the need to scan the filesystem
1085 # Note: fnmatch() doesn't work correctly when used with pathnames. For example the
1086 # pattern 'tests/*.py' will match 'tests/run_test.py' and also 'tests/subdir/run_test.py',
1087 # so instead we're using it on every path component.
1088 def pathname_match(pathname, pattern):
1089     name = pathname.split(os.sep)
1090     # Fix patterns like 'some/subdir/' or 'some//subdir'
1091     pat = [x for x in pattern.split(os.sep) if x != '' and x != '.']
1092     if len(name) != len(pat):
1093         return False
1094     for i in range(len(name)):
1095         if not fnmatch.fnmatch(name[i], pat[i]):
1096             return False
1097     return True
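# For example, pathname_match('subdir/file.txt', 'subdir/*.txt') is True,
# while pathname_match('other/subdir/file.txt', 'subdir/*.txt') is False
# because the number of path components must also match.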
1098
1099 def machine_progress(bytes_written, bytes_expected):
1100     return _machine_format.format(
1101         bytes_written, -1 if (bytes_expected is None) else bytes_expected)
1102
1103 def human_progress(bytes_written, bytes_expected):
1104     if bytes_expected:
1105         return "\r{}M / {}M {:.1%} ".format(
1106             bytes_written >> 20, bytes_expected >> 20,
1107             float(bytes_written) / bytes_expected)
1108     else:
1109         return "\r{} ".format(bytes_written)
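# For example, human_progress(512 * 2**20, 1024 * 2**20) returns
# "\r512M / 1024M 50.0% "; the carriage return keeps the progress on a
# single terminal line.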
1110
1111 def progress_writer(progress_func, outfile=sys.stderr):
1112     def write_progress(bytes_written, bytes_expected):
1113         outfile.write(progress_func(bytes_written, bytes_expected))
1114     return write_progress
1115
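# Resolve the --project-uuid argument to the UUID that will own the new
# collection: user and group (project) UUIDs are verified via the API, and
# when no UUID is given the current user's UUID is used, so the collection
# lands in their Home project.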
1116 def desired_project_uuid(api_client, project_uuid, num_retries):
1117     if not project_uuid:
1118         query = api_client.users().current()
1119     elif arvados.util.user_uuid_pattern.match(project_uuid):
1120         query = api_client.users().get(uuid=project_uuid)
1121     elif arvados.util.group_uuid_pattern.match(project_uuid):
1122         query = api_client.groups().get(uuid=project_uuid)
1123     else:
1124         raise ValueError("Not a valid project UUID: {}".format(project_uuid))
1125     return query.execute(num_retries=num_retries)['uuid']
1126
1127 def main(arguments=None, stdout=sys.stdout, stderr=sys.stderr,
1128          install_sig_handlers=True):
1129     global api_client
1130
1131     args = parse_arguments(arguments)
1132     logger = logging.getLogger('arvados.arv_put')
1133     if args.silent:
1134         logger.setLevel(logging.WARNING)
1135     else:
1136         logger.setLevel(logging.INFO)
1137     status = 0
1138
1139     request_id = arvados.util.new_request_id()
1140
1141     formatter = ArvPutLogFormatter(request_id)
1142     logging.getLogger('arvados').handlers[0].setFormatter(formatter)
1143
1144     if api_client is None:
1145         api_client = arvados.api('v1', request_id=request_id, num_retries=args.retries)
1146
1147     if install_sig_handlers:
1148         arv_cmd.install_signal_handlers()
1149
1150     # Trash arguments validation
1151     trash_at = None
1152     if args.trash_at is not None:
1153         # ciso8601 considers YYYYMM as invalid but YYYY-MM as valid, so here we
1154         # make sure the user provides a complete YYYY-MM-DD date.
1155         if not re.match(r'^\d{4}(?P<dash>-?)\d{2}?(?P=dash)\d{2}', args.trash_at):
1156             logger.error("--trash-at argument format invalid, use --help to see examples.")
1157             sys.exit(1)
1158         # Check if no time information was provided. In that case, assume end-of-day.
1159         if re.match(r'^\d{4}(?P<dash>-?)\d{2}?(?P=dash)\d{2}$', args.trash_at):
1160             args.trash_at += 'T23:59:59'
1161         try:
1162             trash_at = ciso8601.parse_datetime(args.trash_at)
1163         except:
1164             logger.error("--trash-at argument format invalid, use --help to see examples.")
1165             sys.exit(1)
1166         else:
1167             if trash_at.tzinfo is not None:
1168                 # Timezone aware datetime provided.
1169                 utcoffset = -trash_at.utcoffset()
1170             else:
1171                 # Timezone-naive datetime provided. Assume it is local.
1172                 if time.daylight:
1173                     utcoffset = datetime.timedelta(seconds=time.altzone)
1174                 else:
1175                     utcoffset = datetime.timedelta(seconds=time.timezone)
1176             # Convert to UTC timezone naive datetime.
1177             trash_at = trash_at.replace(tzinfo=None) + utcoffset
1178
1179         if trash_at <= datetime.datetime.utcnow():
1180             logger.error("--trash-at argument must be set in the future")
1181             sys.exit(1)
1182     if args.trash_after is not None:
1183         if args.trash_after < 1:
1184             logger.error("--trash-after argument must be >= 1")
1185             sys.exit(1)
1186         trash_at = datetime.timedelta(seconds=(args.trash_after * 24 * 60 * 60))
1187
1188     # Determine the name to use
1189     if args.name:
1190         if args.stream or args.raw:
1191             logger.error("Cannot use --name with --stream or --raw")
1192             sys.exit(1)
1193         elif args.update_collection:
1194             logger.error("Cannot use --name with --update-collection")
1195             sys.exit(1)
1196         collection_name = args.name
1197     else:
1198         collection_name = "Saved at {} by {}@{}".format(
1199             datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC"),
1200             pwd.getpwuid(os.getuid()).pw_name,
1201             socket.gethostname())
1202
1203     if args.project_uuid and (args.stream or args.raw):
1204         logger.error("Cannot use --project-uuid with --stream or --raw")
1205         sys.exit(1)
1206
1207     # Determine the parent project
1208     try:
1209         project_uuid = desired_project_uuid(api_client, args.project_uuid,
1210                                             args.retries)
1211     except (apiclient_errors.Error, ValueError) as error:
1212         logger.error(error)
1213         sys.exit(1)
1214
1215     if args.progress:
1216         reporter = progress_writer(human_progress)
1217     elif args.batch_progress:
1218         reporter = progress_writer(machine_progress)
1219     else:
1220         reporter = None
1221
1222     #  Split storage-classes argument
1223     storage_classes = None
1224     if args.storage_classes:
1225         storage_classes = args.storage_classes.strip().replace(' ', '').split(',')
1226
1227     # Setup exclude regex from all the --exclude arguments provided
1228     name_patterns = []
1229     exclude_paths = []
1230     exclude_names = None
1231     if len(args.exclude) > 0:
1232         # We're supporting 2 kinds of exclusion patterns:
1233         # 1)   --exclude '*.jpg'    (file/dir name patterns, will only match
1234         #                            the name, wherever the file is on the tree)
1235         # 2.1) --exclude 'foo/bar'  (file/dir path patterns, will match the
1236         #                            entire path, and should be relative to
1237         #                            any input dir argument)
1238         # 2.2) --exclude './*.jpg'  (Special case for excluding files/dirs
1239         #                            placed directly underneath the input dir)
1240         for p in args.exclude:
1241             # Only relative path patterns are allowed
1242             if p.startswith(os.sep):
1243                 logger.error("Cannot use absolute paths with --exclude")
1244                 sys.exit(1)
1245             if os.path.dirname(p):
1246                 # We don't support path patterns with '..'
1247                 p_parts = p.split(os.sep)
1248                 if '..' in p_parts:
1249                     logger.error(
1250                         "Cannot use path patterns that include or '..'")
1251                     sys.exit(1)
1252                 # Path search pattern
1253                 exclude_paths.append(p)
1254             else:
1255                 # Name-only search pattern
1256                 name_patterns.append(p)
1257         # For name only matching, we can combine all patterns into a single
1258         # regexp, for better performance.
1259         exclude_names = re.compile('|'.join(
1260             [fnmatch.translate(p) for p in name_patterns]
1261         )) if len(name_patterns) > 0 else None
1262         # Show the user the patterns to be used, just in case they weren't
1263         # specified inside quotes and got changed by shell expansion.
1264         logger.info("Exclude patterns: {}".format(args.exclude))
1265
1266     # If this is used by a human, and there's at least one directory to be
1267     # uploaded, the expected bytes calculation can take a moment.
1268     if args.progress and any([os.path.isdir(f) for f in args.paths]):
1269         logger.info("Calculating upload size, this could take some time...")
1270     try:
1271         writer = ArvPutUploadJob(paths = args.paths,
1272                                  resume = args.resume,
1273                                  use_cache = args.use_cache,
1274                                  batch_mode= args.batch,
1275                                  filename = args.filename,
1276                                  reporter = reporter,
1277                                  api_client = api_client,
1278                                  num_retries = args.retries,
1279                                  replication_desired = args.replication,
1280                                  put_threads = args.threads,
1281                                  name = collection_name,
1282                                  owner_uuid = project_uuid,
1283                                  ensure_unique_name = True,
1284                                  update_collection = args.update_collection,
1285                                  storage_classes=storage_classes,
1286                                  logger=logger,
1287                                  dry_run=args.dry_run,
1288                                  follow_links=args.follow_links,
1289                                  exclude_paths=exclude_paths,
1290                                  exclude_names=exclude_names,
1291                                  trash_at=trash_at)
1292     except ResumeCacheConflict:
1293         logger.error("\n".join([
1294             "arv-put: Another process is already uploading this data.",
1295             "         Use --no-cache if this is really what you want."]))
1296         sys.exit(1)
1297     except ResumeCacheInvalidError:
1298         logger.error("\n".join([
1299             "arv-put: Resume cache contains invalid signature: it may have expired",
1300             "         or been created with another Arvados user's credentials.",
1301             "         Switch user or use one of the following options to restart upload:",
1302             "         --no-resume to start a new resume cache.",
1303             "         --no-cache to disable resume cache.",
1304             "         --batch to ignore the resume cache if invalid."]))
1305         sys.exit(1)
1306     except (CollectionUpdateError, PathDoesNotExistError) as error:
1307         logger.error("\n".join([
1308             "arv-put: %s" % str(error)]))
1309         sys.exit(1)
1310     except ArvPutUploadIsPending:
1311         # Dry run check successful, return proper exit code.
1312         sys.exit(2)
1313     except ArvPutUploadNotPending:
1314         # No files pending for upload
1315         sys.exit(0)
1316
1317     if not args.dry_run and not args.update_collection and args.resume and writer.bytes_written > 0:
1318         logger.warning("\n".join([
1319             "arv-put: Resuming previous upload from last checkpoint.",
1320             "         Use the --no-resume option to start over."]))
1321
1322     if not args.dry_run:
1323         writer.report_progress()
1324     output = None
1325     try:
1326         writer.start(save_collection=not(args.stream or args.raw))
1327     except (arvados.errors.ApiError, arvados.errors.KeepWriteError) as error:
1328         logger.error("\n".join([
1329             "arv-put: %s" % str(error)]))
1330         sys.exit(1)
1331
1332     if args.progress:  # Print newline to split stderr from stdout for humans.
1333         logger.info("\n")
1334
1335     if args.stream:
1336         if args.normalize:
1337             output = writer.manifest_text(normalize=True)
1338         else:
1339             output = writer.manifest_text()
1340     elif args.raw:
1341         output = ','.join(writer.data_locators())
1342     elif writer.manifest_locator() is not None:
1343         try:
1344             expiration_notice = ""
1345             if writer.collection_trash_at() is not None:
1346                 # Get the local timezone-naive version, and log it with timezone information.
1347                 if time.daylight:
1348                     local_trash_at = writer.collection_trash_at().replace(tzinfo=None) - datetime.timedelta(seconds=time.altzone)
1349                 else:
1350                     local_trash_at = writer.collection_trash_at().replace(tzinfo=None) - datetime.timedelta(seconds=time.timezone)
1351                 expiration_notice = ". It will expire on {} {}.".format(
1352                     local_trash_at.strftime("%Y-%m-%d %H:%M:%S"), time.strftime("%z"))
1353             if args.update_collection:
1354                 logger.info(u"Collection updated: '{}'{}".format(
1355                     writer.collection_name(), expiration_notice))
1356             else:
1357                 logger.info(u"Collection saved as '{}'{}".format(
1358                     writer.collection_name(), expiration_notice))
1359             if args.portable_data_hash:
1360                 output = writer.portable_data_hash()
1361             else:
1362                 output = writer.manifest_locator()
1363         except apiclient_errors.Error as error:
1364             logger.error(
1365                 "arv-put: Error creating Collection on project: {}.".format(
1366                     error))
1367             status = 1
1368     else:
1369         status = 1
1370
1371     # Print the locator (uuid) of the new collection.
1372     if output is None:
1373         status = status or 1
1374     elif not args.silent:
1375         stdout.write(output)
1376         if not output.endswith('\n'):
1377             stdout.write('\n')
1378
1379     if install_sig_handlers:
1380         arv_cmd.restore_signal_handlers()
1381
1382     if status != 0:
1383         sys.exit(status)
1384
1385     # Success!
1386     return output
1387
1388
1389 if __name__ == '__main__':
1390     main()