X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/ffe3cdbc8c37e2b4a4e3ea4f67c1c9ca5d81e2ed..2024ca087c3b9c99ebb792011b60fecdf1486467:/sdk/python/arvados/collection.py diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py index e4c008efb8..6cf4d07552 100644 --- a/sdk/python/arvados/collection.py +++ b/sdk/python/arvados/collection.py @@ -27,6 +27,8 @@ import config import errors import util +_logger = logging.getLogger('arvados.collection') + def normalize_stream(s, stream): stream_tokens = [s] sortedfiles = list(stream.keys()) @@ -41,6 +43,9 @@ def normalize_stream(s, stream): blocks[b[arvados.LOCATOR]] = streamoffset streamoffset += b[arvados.BLOCKSIZE] + if len(stream_tokens) == 1: + stream_tokens.append(config.EMPTY_BLOCK_LOCATOR) + for f in sortedfiles: current_span = None fout = f.replace(' ', '\\040') @@ -88,10 +93,10 @@ def normalize(collection): class CollectionReader(object): def __init__(self, manifest_locator_or_text): - if re.search(r'^[a-f0-9]{32}(\+\d+)?(\+\S+)*$', manifest_locator_or_text): + if re.match(r'[a-f0-9]{32}(\+\d+)?(\+\S+)*$', manifest_locator_or_text): self._manifest_locator = manifest_locator_or_text self._manifest_text = None - elif re.search(r'^\S+( [a-f0-9]{32,}(\+\S+)*)*( \d+:\d+:\S+)+\n', manifest_locator_or_text): + elif re.match(r'(\S+)( [a-f0-9]{32}(\+\d+)(\+\S+)*)+( \d+:\d+:\S+)+\n', manifest_locator_or_text): self._manifest_text = manifest_locator_or_text self._manifest_locator = None else: @@ -114,8 +119,8 @@ class CollectionReader(object): uuid=self._manifest_locator).execute() self._manifest_text = c['manifest_text'] except Exception as e: - logging.warning("API lookup failed for collection %s (%s: %s)" % - (self._manifest_locator, type(e), str(e))) + _logger.warning("API lookup failed for collection %s (%s: %s)", + self._manifest_locator, type(e), str(e)) self._manifest_text = Keep.get(self._manifest_locator) self._streams = [] for stream_line in self._manifest_text.split("\n"): @@ -143,9 +148,13 @@ class CollectionReader(object): for f in s.all_files(): yield f - def manifest_text(self): + def manifest_text(self, strip=False): self._populate() - return self._manifest_text + if strip: + m = ''.join([StreamReader(stream).manifest_text(strip=True) for stream in self._streams]) + return m + else: + return self._manifest_text class CollectionWriter(object): KEEP_BLOCK_SIZE = 2**26 @@ -228,7 +237,11 @@ class CollectionWriter(object): path, stream_name, max_manifest_depth = self._queued_trees[0] make_dirents = (util.listdir_recursive if (max_manifest_depth == 0) else os.listdir) - self._queue_dirents(stream_name, make_dirents(path)) + d = make_dirents(path) + if len(d) > 0: + self._queue_dirents(stream_name, d) + else: + self._queued_trees.popleft() def _queue_file(self, source, filename=None): assert (self._queued_file is None), "tried to queue more than one file" @@ -341,7 +354,24 @@ class CollectionWriter(object): self._current_file_name = None def finish(self): - return Keep.put(self.manifest_text()) + # Send the stripped manifest to Keep, to ensure that we use the + # same UUID regardless of what hints are used on the collection. + return Keep.put(self.stripped_manifest()) + + def stripped_manifest(self): + """ + Return the manifest for the current collection with all permission + hints removed from the locators in the manifest. + """ + raw = self.manifest_text() + clean = '' + for line in raw.split("\n"): + fields = line.split() + if len(fields) > 0: + locators = [ re.sub(r'\+A[a-z0-9@_-]+', '', x) + for x in fields[1:-1] ] + clean += fields[0] + ' ' + ' '.join(locators) + ' ' + fields[-1] + "\n" + return clean def manifest_text(self): self.finish_current_stream() @@ -355,10 +385,10 @@ class CollectionWriter(object): manifest += ' ' + ' '.join("%d:%d:%s" % (sfile[0], sfile[1], sfile[2].replace(' ', '\\040')) for sfile in stream[2]) manifest += "\n" - #print 'writer',manifest - #print 'after reader',CollectionReader(manifest).manifest_text() - - return CollectionReader(manifest).manifest_text() + if len(manifest) > 0: + return CollectionReader(manifest).manifest_text() + else: + return "" def data_locators(self): ret = []