+ with self._collection_lock:
+ self.bytes_written = self._collection_size(self._local_collection)
+ if self.use_cache:
+ if final:
+ manifest = self._local_collection.manifest_text()
+ else:
+ # Get the manifest text without comitting pending blocks
+ manifest = self._local_collection.manifest_text(strip=False,
+ normalize=False,
+ only_committed=True)
+ # Update cache
+ with self._state_lock:
+ self._state['manifest'] = manifest
+ if self.use_cache:
+ self._save_state()
+ # Call the reporter, if any
+ self.report_progress()
+
+ def report_progress(self):
+ if self.reporter is not None:
+ self.reporter(self.bytes_written, self.bytes_expected)
+
+ def _write_stdin(self, filename):
+ output = self._local_collection.open(filename, 'w')
+ self._write(sys.stdin, output)
+ output.close()
+
+ def _check_file(self, source, filename):
+ """Check if this file needs to be uploaded"""
+ resume_offset = 0
+ should_upload = False
+ new_file_in_cache = False
+ # Record file path for updating the remote collection before exiting
+ self._file_paths.append(filename)
+
+ with self._state_lock:
+ # If no previous cached data on this file, store it for an eventual
+ # repeated run.
+ if source not in self._state['files']:
+ self._state['files'][source] = {
+ 'mtime': os.path.getmtime(source),
+ 'size' : os.path.getsize(source)
+ }
+ new_file_in_cache = True
+ cached_file_data = self._state['files'][source]
+
+ # Check if file was already uploaded (at least partially)
+ file_in_local_collection = self._local_collection.find(filename)
+
+ # If not resuming, upload the full file.
+ if not self.resume:
+ should_upload = True
+ # New file detected from last run, upload it.
+ elif new_file_in_cache:
+ should_upload = True
+ # Local file didn't change from last run.
+ elif cached_file_data['mtime'] == os.path.getmtime(source) and cached_file_data['size'] == os.path.getsize(source):
+ if not file_in_local_collection:
+ # File not uploaded yet, upload it completely
+ should_upload = True
+ elif file_in_local_collection.permission_expired():
+ # Permission token expired, re-upload file. This will change whenever
+ # we have a API for refreshing tokens.
+ should_upload = True
+ self._local_collection.remove(filename)
+ elif cached_file_data['size'] == file_in_local_collection.size():
+ # File already there, skip it.
+ self.bytes_skipped += cached_file_data['size']
+ elif cached_file_data['size'] > file_in_local_collection.size():
+ # File partially uploaded, resume!
+ resume_offset = file_in_local_collection.size()
+ self.bytes_skipped += resume_offset
+ should_upload = True
+ else:
+ # Inconsistent cache, re-upload the file
+ should_upload = True
+ self._local_collection.remove(filename)
+ self.logger.warning("Uploaded version of file '{}' is bigger than local version, will re-upload it from scratch.".format(source))
+ # Local file differs from cached data, re-upload it.
+ else:
+ if file_in_local_collection:
+ self._local_collection.remove(filename)
+ should_upload = True
+
+ if should_upload:
+ self._files_to_upload.append((source, resume_offset, filename))
+
+ def _upload_files(self):
+ for source, resume_offset, filename in self._files_to_upload:
+ with open(source, 'r') as source_fd:
+ with self._state_lock:
+ self._state['files'][source]['mtime'] = os.path.getmtime(source)
+ self._state['files'][source]['size'] = os.path.getsize(source)
+ if resume_offset > 0:
+ # Start upload where we left off
+ output = self._local_collection.open(filename, 'a')
+ source_fd.seek(resume_offset)
+ else:
+ # Start from scratch
+ output = self._local_collection.open(filename, 'w')
+ self._write(source_fd, output)
+ output.close(flush=False)
+
+ def _write(self, source_fd, output):
+ while True:
+ data = source_fd.read(arvados.config.KEEP_BLOCK_SIZE)
+ if not data:
+ break
+ output.write(data)
+
+ def _my_collection(self):
+ return self._remote_collection if self.update else self._local_collection
+
+ def _setup_state(self, update_collection):