+ def _write_directory_tree(self, path, stream_name="."):
+ # TODO: Check what happens when multiple directories are passed as
+ # arguments.
+ # If the code below is uncommented, integration test
+ # test_ArvPutSignedManifest (tests.test_arv_put.ArvPutIntegrationTest)
+ # fails, I suppose it is because the manifest_uuid changes because
+ # of the dir addition to stream_name.
+
+ # if stream_name == '.':
+ # stream_name = os.path.join('.', os.path.basename(path))
+ for item in os.listdir(path):
+ if os.path.isdir(os.path.join(path, item)):
+ self._write_directory_tree(os.path.join(path, item),
+ os.path.join(stream_name, item))
+ else:
+ self._write_file(os.path.join(path, item),
+ os.path.join(stream_name, item))
+
+ def _write_stdin(self, filename):
+ with self._collection_lock:
+ output = self._my_collection().open(filename, 'w')
+ self._write(sys.stdin, output)
+ output.close()
+
+ def _write_file(self, source, filename):
+ resume_offset = 0
+ if self.resume:
+ # Check if file was already uploaded (at least partially)
+ with self._collection_lock:
+ try:
+ file_in_collection = self._my_collection().find(filename)
+ except IOError:
+ # Not found
+ file_in_collection = None
+ # If no previous cached data on this file, store it for an eventual
+ # repeated run.
+ if source not in self._state['files']:
+ with self._state_lock:
+ self._state['files'][source] = {
+ 'mtime': os.path.getmtime(source),
+ 'size' : os.path.getsize(source)
+ }
+ with self._state_lock:
+ cached_file_data = self._state['files'][source]
+ # See if this file was already uploaded at least partially
+ if file_in_collection:
+ if cached_file_data['mtime'] == os.path.getmtime(source) and cached_file_data['size'] == os.path.getsize(source):
+ if cached_file_data['size'] == file_in_collection.size():
+ # File already there, skip it.
+ self.bytes_skipped += cached_file_data['size']
+ return
+ elif cached_file_data['size'] > file_in_collection.size():
+ # File partially uploaded, resume!
+ resume_offset = file_in_collection.size()
+ else:
+ # Inconsistent cache, re-upload the file
+ self.logger.warning("Uploaded version of file '{}' is bigger than local version, will re-upload it from scratch.".format(source))
+ else:
+ # Local file differs from cached data, re-upload it
+ pass
+ with open(source, 'r') as source_fd:
+ if resume_offset > 0:
+ # Start upload where we left off
+ with self._collection_lock:
+ output = self._my_collection().open(filename, 'a')
+ source_fd.seek(resume_offset)
+ self.bytes_skipped += resume_offset
+ else:
+ # Start from scratch
+ with self._collection_lock:
+ output = self._my_collection().open(filename, 'w')
+ self._write(source_fd, output)
+ output.close()
+
+ def _write(self, source_fd, output):
+ first_read = True
+ while True:
+ data = source_fd.read(arvados.config.KEEP_BLOCK_SIZE)
+ # Allow an empty file to be written
+ if not data and not first_read:
+ break
+ if first_read:
+ first_read = False
+ output.write(data)
+
def _my_collection(self):
    """
    Return the collection object used by this instance.

    The object is built on first use and kept in self._collection. When
    resuming and the state holds a manifest, the collection is created
    from that manifest; otherwise a new empty collection is created.
    """
    if self._collection is not None:
        return self._collection
    with self._state_lock:
        saved_manifest = self._state['manifest']
    if self.resume and saved_manifest is not None:
        # Create collection from saved state
        positional = (saved_manifest,)
    else:
        # Create new collection
        positional = ()
    self._collection = arvados.collection.Collection(
        *positional,
        replication_desired=self.replication_desired)
    return self._collection
+
def _setup_state(self):
    """
    Create a new cache file or load a previously existing one.

    Without self.resume the state is simply reset to a copy of
    self.EMPTY_STATE. With self.resume, a cache file named after the MD5
    of the API host, the sorted real paths and the optional filename is
    opened and locked, its JSON content is loaded into self._state (or
    replaced by an empty state when unusable), and self.bytes_written is
    initialised from the size of the resulting collection.

    Whatever self._lock_file() raises is propagated; the cache file is
    closed first in that case.
    """
    if not self.resume:
        # No resume required
        with self._state_lock:
            self._state = copy.deepcopy(self.EMPTY_STATE)
        return

    md5 = hashlib.md5()
    md5.update(arvados.config.get('ARVADOS_API_HOST', '!nohost'))
    realpaths = sorted(os.path.realpath(path) for path in self.paths)
    md5.update('\0'.join(realpaths))
    if self.filename:
        md5.update(self.filename)
    cache_filename = md5.hexdigest()
    self._cache_file = open(os.path.join(
        arv_cmd.make_home_conf_dir(self.CACHE_DIR, 0o700, 'raise'),
        cache_filename), 'a+')
    self._cache_filename = self._cache_file.name
    try:
        self._lock_file(self._cache_file)
    except Exception:
        # Do not leave the handle open when the cache cannot be locked.
        self._cache_file.close()
        raise
    self._cache_file.seek(0)
    with self._state_lock:
        try:
            loaded_state = json.load(self._cache_file)
        except ValueError:
            # Cache file empty (or not valid JSON), set up new cache
            loaded_state = None
        # Valid JSON is not necessarily an object: anything that is not a
        # dict carrying both expected keys is treated as an unusable cache.
        if (isinstance(loaded_state, dict) and
                'manifest' in loaded_state and 'files' in loaded_state):
            self._state = loaded_state
        else:
            # Cache at least partially incomplete, set up new cache
            self._state = copy.deepcopy(self.EMPTY_STATE)
    # Load how many bytes were uploaded on previous run
    with self._collection_lock:
        self.bytes_written = self._collection_size(self._my_collection())
+
+ def _lock_file(self, fileobj):
+ try:
+ fcntl.flock(fileobj, fcntl.LOCK_EX | fcntl.LOCK_NB)
+ except IOError:
+ raise ResumeCacheConflict("{} locked".format(fileobj.name))
+
+ def _save_state(self):
+ """
+ Atomically save current state into cache.
+ """
+ try:
+ with self._state_lock:
+ state = self._state
+ new_cache_fd, new_cache_name = tempfile.mkstemp(
+ dir=os.path.dirname(self._cache_filename))
+ self._lock_file(new_cache_fd)
+ new_cache = os.fdopen(new_cache_fd, 'r+')
+ json.dump(state, new_cache)
+ new_cache.flush()
+ os.fsync(new_cache)
+ os.rename(new_cache_name, self._cache_filename)
+ except (IOError, OSError, ResumeCacheConflict) as error:
+ self.logger.error("There was a problem while saving the cache file: {}".format(error))
+ try:
+ os.unlink(new_cache_name)
+ except NameError: # mkstemp failed.
+ pass
+ else:
+ self._cache_file.close()
+ self._cache_file = new_cache
+
def collection_name(self):
    """
    Return the 'name' entry of the collection's API response, or None
    when the collection has no (or an empty) API response.
    """
    with self._collection_lock:
        # Read the response once so the emptiness check and the lookup
        # are guaranteed to look at the same value.
        api_response = self._my_collection().api_response()
        return api_response['name'] if api_response else None
+
def manifest_locator(self):
    """
    Return the collection's manifest_locator(), read while holding the
    collection lock.
    """
    with self._collection_lock:
        return self._my_collection().manifest_locator()
+
def portable_data_hash(self):
    """
    Return the collection's portable_data_hash(), read while holding the
    collection lock.
    """
    with self._collection_lock:
        return self._my_collection().portable_data_hash()
+
def manifest_text(self, stream_name=".", strip=False, normalize=False):
    """
    Return the collection's manifest_text() for the given arguments.

    'stream_name', 'strip' and 'normalize' are forwarded positionally, in
    that order, while the collection lock is held.
    """
    with self._collection_lock:
        collection = self._my_collection()
        return collection.manifest_text(stream_name, strip, normalize)
+
def _datablocks_on_item(self, item):
    """
    Return a list of datablock locators, recursively navigating
    through subcollections.

    An ArvadosFile yields the locator of each of its segments, or the
    single empty-file locator when its size is 0. A Collection yields the
    concatenated locators of all of its values. Any other kind of item
    yields None.
    """
    if isinstance(item, arvados.arvfile.ArvadosFile):
        if item.size() == 0:
            # Empty file locator
            return ["d41d8cd98f00b204e9800998ecf8427e+0"]
        return [segment.locator for segment in item.segments()]
    if isinstance(item, arvados.collection.Collection):
        # NOTE(review): a child that is neither an ArvadosFile nor a
        # Collection yields None here, which cannot be iterated and makes
        # the flattening raise TypeError -- confirm that cannot happen.
        return [locator
                for child in item.values()
                for locator in self._datablocks_on_item(child)]
    return None
+
def data_locators(self):
    """
    Return the datablock locators of the whole collection, as computed
    by self._datablocks_on_item(), while holding the collection lock.
    """
    with self._collection_lock:
        # Make sure all datablocks are flushed before getting the locators
        self._my_collection().manifest_text()
        return self._datablocks_on_item(self._my_collection())