X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/ad69b48e324c3ce29a4d2c84732dfd3d0288ebb3..fd0074f2200bc41bc63be770fffbe2446fb0cc03:/sdk/python/arvados/collection.py diff --git a/sdk/python/arvados/collection.py b/sdk/python/arvados/collection.py index a77f3093f8..3d48652dd5 100644 --- a/sdk/python/arvados/collection.py +++ b/sdk/python/arvados/collection.py @@ -10,7 +10,7 @@ import threading from collections import deque from stat import * -from .arvfile import split, _FileLikeObjectBase, ArvadosFile, ArvadosFileWriter, ArvadosFileReader, _BlockManager, synchronized, must_be_writable, SYNC_READONLY, SYNC_EXPLICIT, NoopLock +from .arvfile import split, _FileLikeObjectBase, ArvadosFile, ArvadosFileWriter, ArvadosFileReader, _BlockManager, synchronized, must_be_writable, NoopLock from keep import KeepLocator, KeepClient from .stream import StreamReader from ._normalize_stream import normalize_stream @@ -38,7 +38,8 @@ class CollectionBase(object): return self._keep_client def stripped_manifest(self): - """ + """Get the manifest with locator hints stripped. + Return the manifest for the current collection with all non-portable hints (i.e., permission signatures and other hints other than size hints) removed from the locators. @@ -469,13 +470,14 @@ class ResumableCollectionWriter(CollectionWriter): "resumable writer can't accept unsourced data") return super(ResumableCollectionWriter, self).write(data) + ADD = "add" DEL = "del" MOD = "mod" FILE = "file" COLLECTION = "collection" -class SynchronizedCollectionBase(CollectionBase): +class RichCollectionBase(CollectionBase): """Base class for Collections and Subcollections. Implements the majority of functionality relating to accessing items in the @@ -497,7 +499,7 @@ class SynchronizedCollectionBase(CollectionBase): def _my_block_manager(self): raise NotImplementedError() - def sync_mode(self): + def writable(self): raise NotImplementedError() def root_collection(self): @@ -520,20 +522,18 @@ class SynchronizedCollectionBase(CollectionBase): the path. :create_type: - One of `arvado.collection.FILE` or - `arvado.collection.COLLECTION`. If the path is not found, and value + One of `arvados.collection.FILE` or + `arvados.collection.COLLECTION`. If the path is not found, and value of create_type is FILE then create and return a new ArvadosFile for the last path component. If COLLECTION, then create and return a new Collection for the last path component. """ - pathcomponents = path.split("/") - - if pathcomponents and pathcomponents[0]: + pathcomponents = path.split("/", 1) + if pathcomponents[0]: item = self._items.get(pathcomponents[0]) if len(pathcomponents) == 1: - # item must be a file if item is None: # create new file if create_type == COLLECTION: @@ -551,9 +551,8 @@ class SynchronizedCollectionBase(CollectionBase): self._items[pathcomponents[0]] = item self._modified = True self.notify(ADD, self, pathcomponents[0], item) - del pathcomponents[0] - if isinstance(item, SynchronizedCollectionBase): - return item.find_or_create("/".join(pathcomponents), create_type) + if isinstance(item, RichCollectionBase): + return item.find_or_create(pathcomponents[1], create_type) else: raise IOError((errno.ENOTDIR, "Interior path components must be subcollection")) else: @@ -567,21 +566,21 @@ class SynchronizedCollectionBase(CollectionBase): found. """ - pathcomponents = path.split("/") + if not path: + raise errors.ArgumentError("Parameter 'path' must not be empty.") - if pathcomponents and pathcomponents[0]: - item = self._items.get(pathcomponents[0]) - if len(pathcomponents) == 1: - # item must be a file - return item - else: - del pathcomponents[0] - if isinstance(item, SynchronizedCollectionBase): - return item.find("/".join(pathcomponents)) - else: - raise IOError((errno.ENOTDIR, "Interior path components must be subcollection")) + pathcomponents = path.split("/", 1) + item = self._items.get(pathcomponents[0]) + if len(pathcomponents) == 1: + return item else: - return self + if isinstance(item, RichCollectionBase): + if pathcomponents[1]: + return item.find(pathcomponents[1]) + else: + return item + else: + raise IOError((errno.ENOTDIR, "Interior path components must be subcollection")) def mkdirs(path): """Recursive subcollection create. @@ -615,7 +614,7 @@ class SynchronizedCollectionBase(CollectionBase): raise errors.ArgumentError("Bad mode '%s'" % mode) create = (mode != "r") - if create and self.sync_mode() == SYNC_READONLY: + if create and not self.writable(): raise IOError((errno.EROFS, "Collection is read only")) if create: @@ -640,8 +639,7 @@ class SynchronizedCollectionBase(CollectionBase): @synchronized def modified(self): - """Test if the collection (or any subcollection or file) has been modified - since it was created.""" + """Test if the collection (or any subcollection or file) has been modified.""" if self._modified: return True for k,v in self._items.items(): @@ -661,22 +659,18 @@ class SynchronizedCollectionBase(CollectionBase): """Iterate over names of files and collections contained in this collection.""" return iter(self._items.keys()) - @synchronized - def iterkeys(self): - """Iterate over names of files and collections directly contained in this collection.""" - return self._items.keys() - @synchronized def __getitem__(self, k): - """Get a file or collection that is directly contained by this collection. If - you want to search a path, use `find()` instead. + """Get a file or collection that is directly contained by this collection. + + If you want to search a path, use `find()` instead. + """ return self._items[k] @synchronized def __contains__(self, k): - """If there is a file or collection a directly contained by this collection - with name `k`.""" + """Test if there is a file or collection a directly contained by this collection.""" return k in self._items @synchronized @@ -709,7 +703,7 @@ class SynchronizedCollectionBase(CollectionBase): def exists(self, path): """Test if there is a file or collection at `path`.""" - return self.find(path) != None + return self.find(path) is not None @must_be_writable @synchronized @@ -719,39 +713,72 @@ class SynchronizedCollectionBase(CollectionBase): :recursive: Specify whether to remove non-empty subcollections (True), or raise an error (False). """ - pathcomponents = path.split("/") - if len(pathcomponents) > 0: - item = self._items.get(pathcomponents[0]) - if item is None: - raise IOError((errno.ENOENT, "File not found")) - if len(pathcomponents) == 1: - if isinstance(self._items[pathcomponents[0]], SynchronizedCollectionBase) and len(self._items[pathcomponents[0]]) > 0 and not recursive: - raise IOError((errno.ENOTEMPTY, "Subcollection not empty")) - deleteditem = self._items[pathcomponents[0]] - del self._items[pathcomponents[0]] - self._modified = True - self.notify(DEL, self, pathcomponents[0], deleteditem) - else: - del pathcomponents[0] - item.remove("/".join(pathcomponents)) - else: + if not path: + raise errors.ArgumentError("Parameter 'path' must not be empty.") + + pathcomponents = path.split("/", 1) + item = self._items.get(pathcomponents[0]) + if item is None: raise IOError((errno.ENOENT, "File not found")) + if len(pathcomponents) == 1: + if isinstance(self._items[pathcomponents[0]], RichCollectionBase) and len(self._items[pathcomponents[0]]) > 0 and not recursive: + raise IOError((errno.ENOTEMPTY, "Subcollection not empty")) + deleteditem = self._items[pathcomponents[0]] + del self._items[pathcomponents[0]] + self._modified = True + self.notify(DEL, self, pathcomponents[0], deleteditem) + else: + item.remove(pathcomponents[1]) - def _cloneinto(self, target): - for k,v in self._items.items(): - target._items[k] = v.clone(target) + def _clonefrom(self, source): + for k,v in source.items(): + self._items[k] = v.clone(self) def clone(self): raise NotImplementedError() + @must_be_writable + @synchronized + def add(self, source_obj, target_name, overwrite=False): + """Copy a file or subcollection to this collection. + + :source_obj: + An ArvadosFile, or Subcollection object + + :target_name: + Destination item name. If the target name already exists and is a + file, this will raise an error unless you specify `overwrite=True`. + + :overwrite: + Whether to overwrite target file if it already exists. + + """ + + if target_name in self and not overwrite: + raise IOError((errno.EEXIST, "File already exists")) + + modified_from = None + if target_name in self: + modified_from = self[target_name] + + # Actually make the copy. + dup = source_obj.clone(self) + self._items[target_name] = dup + self._modified = True + + if modified_from: + self.notify(MOD, self, target_name, (modified_from, dup)) + else: + self.notify(ADD, self, target_name, dup) + @must_be_writable @synchronized def copy(self, source, target_path, source_collection=None, overwrite=False): """Copy a file or subcollection to a new path in this collection. :source: - An ArvadosFile, Subcollection, or string with a path to source file or subcollection + A string with a path to source file or subcollection, or an actual ArvadosFile or Subcollection object. :target_path: Destination file or path. If the target path already exists and is a @@ -789,26 +816,11 @@ class SynchronizedCollectionBase(CollectionBase): target_dir = self.find_or_create("/".join(targetcomponents[0:-1]), COLLECTION) - if target_name in target_dir: - if isinstance(target_dir[target_name], SynchronizedCollectionBase) and sourcecomponents: - target_dir = target_dir[target_name] - target_name = sourcecomponents[-1] - elif not overwrite: - raise IOError((errno.EEXIST, "File already exists")) + if target_name in target_dir and isinstance(self[target_name], RichCollectionBase) and sourcecomponents: + target_dir = target_dir[target_name] + target_name = sourcecomponents[-1] - modified_from = None - if target_name in target_dir: - modified_from = target_dir[target_name] - - # Actually make the copy. - dup = source_obj.clone(target_dir) - target_dir._items[target_name] = dup - target_dir._modified = True - - if modified_from: - self.notify(MOD, target_dir, target_name, (modified_from, dup)) - else: - self.notify(ADD, target_dir, target_name, dup) + target_dir.add(source_obj, target_name, overwrite) @synchronized def manifest_text(self, stream_name=".", strip=False, normalize=False): @@ -830,13 +842,12 @@ class SynchronizedCollectionBase(CollectionBase): """ if self.modified() or self._manifest_text is None or normalize: - item = self stream = {} - buf = "" - sorted_keys = sorted(item.keys()) - for filename in [s for s in sorted_keys if isinstance(item[s], ArvadosFile)]: + buf = [] + sorted_keys = sorted(self.keys()) + for filename in [s for s in sorted_keys if isinstance(self[s], ArvadosFile)]: # Create a stream per file `k` - arvfile = item[filename] + arvfile = self[filename] filestream = [] for segment in arvfile.segments(): loc = segment.locator @@ -848,11 +859,10 @@ class SynchronizedCollectionBase(CollectionBase): segment.segment_offset, segment.range_size)) stream[filename] = filestream if stream: - buf += ' '.join(normalize_stream(stream_name, stream)) - buf += "\n" - for dirname in [s for s in sorted_keys if isinstance(item[s], SynchronizedCollectionBase)]: - buf += item[dirname].manifest_text(stream_name=os.path.join(stream_name, dirname), strip=strip) - return buf + buf.append(" ".join(normalize_stream(stream_name, stream)) + "\n") + for dirname in [s for s in sorted_keys if isinstance(self[s], RichCollectionBase)]: + buf.append(self[dirname].manifest_text(stream_name=os.path.join(stream_name, dirname), strip=strip)) + return "".join(buf) else: if strip: return self.stripped_manifest() @@ -861,9 +871,10 @@ class SynchronizedCollectionBase(CollectionBase): @synchronized def diff(self, end_collection, prefix=".", holding_collection=None): - """ - Generate list of add/modify/delete actions which, when given to `apply`, will - change `self` to match `end_collection` + """Generate list of add/modify/delete actions. + + When given to `apply`, will change `self` to match `end_collection` + """ changes = [] if holding_collection is None: @@ -937,7 +948,7 @@ class SynchronizedCollectionBase(CollectionBase): def __eq__(self, other): if other is self: return True - if not isinstance(other, SynchronizedCollectionBase): + if not isinstance(other, RichCollectionBase): return False if len(self._items) != len(other): return False @@ -952,11 +963,15 @@ class SynchronizedCollectionBase(CollectionBase): return not self.__eq__(other) -class Collection(SynchronizedCollectionBase): - """Represents the root of an Arvados Collection, which may be associated with - an API server Collection record. +class Collection(RichCollectionBase): + """Represents the root of an Arvados Collection. + + This class is threadsafe. The root collection object, all subcollections + and files are protected by a single lock (i.e. each access locks the entire + collection). - Brief summary of useful methods: + Brief summary of + useful methods: :To read an existing file: `c.open("myfile", "r")` @@ -982,9 +997,8 @@ class Collection(SynchronizedCollectionBase): :To merge remote changes into this object: `c.update()` - This class is threadsafe. The root collection object, all subcollections - and files are protected by a single lock (i.e. each access locks the entire - collection). + Must be associated with an API server Collection record (during + initialization, or using `save_new`) to use `save` or `update` """ @@ -1031,7 +1045,6 @@ class Collection(SynchronizedCollectionBase): self._manifest_text = None self._api_response = None - self._sync = SYNC_EXPLICIT self.lock = threading.RLock() self.callbacks = [] self.events = None @@ -1047,8 +1060,10 @@ class Collection(SynchronizedCollectionBase): raise errors.ArgumentError( "Argument to CollectionReader must be a manifest or a collection UUID") - self._populate() - + try: + self._populate() + except (IOError, errors.SyntaxError) as e: + raise errors.ArgumentError("Error processing manifest text: %s", e) def root_collection(self): return self @@ -1056,16 +1071,14 @@ class Collection(SynchronizedCollectionBase): def stream_name(self): return "." - def sync_mode(self): - return self._sync + def writable(self): + return True @synchronized @retry_method def update(self, other=None, num_retries=None): - """Fetch the latest collection record on the API server and merge it with the - current collection contents. + """Merge the latest collection on the API server with the current collection.""" - """ if other is None: if self._manifest_locator is None: raise errors.ArgumentError("`other` is None but collection does not have a manifest_locator uuid") @@ -1146,7 +1159,7 @@ class Collection(SynchronizedCollectionBase): error_via_keep = self._populate_from_keep() if self._manifest_text is None: # Nothing worked! - raise arvados.errors.NotFoundError( + raise errors.NotFoundError( ("Failed to retrieve collection '{}' " + "from either API server ({}) or Keep ({})." ).format( @@ -1166,11 +1179,26 @@ class Collection(SynchronizedCollectionBase): def __exit__(self, exc_type, exc_value, traceback): """Support scoped auto-commit in a with: block.""" - if self._sync != SYNC_READONLY and self._has_collection_uuid(): - self.save() + if exc_type is not None: + if self.writable() and self._has_collection_uuid(): + self.save() if self._block_manager is not None: self._block_manager.stop_threads() + @synchronized + def manifest_locator(self): + """Get the manifest locator, if any. + + The manifest locator will be set when the collection is loaded from an + API server record or the portable data hash of a manifest. + + The manifest locator will be None if the collection is newly created or + was created directly from manifest text. The method `save_new()` will + assign a manifest locator. + + """ + return self._manifest_locator + @synchronized def clone(self, new_parent=None, readonly=False, new_config=None): if new_config is None: @@ -1180,9 +1208,7 @@ class Collection(SynchronizedCollectionBase): else: newcollection = Collection(parent=new_parent, apiconfig=new_config) - newcollection._sync = None - self._cloneinto(newcollection) - newcollection._sync = SYNC_READONLY if readonly else SYNC_EXPLICIT + newcollection._clonefrom(self) return newcollection @synchronized @@ -1196,21 +1222,21 @@ class Collection(SynchronizedCollectionBase): return self._api_response def find_or_create(self, path, create_type): - """See `SynchronizedCollectionBase.find_or_create`""" + """See `RichCollectionBase.find_or_create`""" if path == ".": return self else: return super(Collection, self).find_or_create(path[2:] if path.startswith("./") else path, create_type) def find(self, path): - """See `SynchronizedCollectionBase.find`""" + """See `RichCollectionBase.find`""" if path == ".": return self else: return super(Collection, self).find(path[2:] if path.startswith("./") else path) def remove(self, path, recursive=False): - """See `SynchronizedCollectionBase.remove`""" + """See `RichCollectionBase.remove`""" if path == ".": raise errors.ArgumentError("Cannot remove '.'") else: @@ -1220,18 +1246,23 @@ class Collection(SynchronizedCollectionBase): @synchronized @retry_method def save(self, merge=True, num_retries=None): - """Commit pending buffer blocks to Keep, merge with remote record (if - update=True), write the manifest to Keep, and update the collection - record. + """Save collection to an existing collection record. + + Commit pending buffer blocks to Keep, merge with remote record (if + merge=True, the default), write the manifest to Keep, and update the + collection record. Will raise AssertionError if not associated with a collection record on the API server. If you want to save a manifest to Keep only, see `save_new()`. - :update: + :merge: Update and merge remote changes before saving. Otherwise, any remote changes will be ignored and overwritten. + :num_retries: + Retry count on API calls (if None, use the collection default) + """ if self.modified(): if not self._has_collection_uuid(): @@ -1255,17 +1286,18 @@ class Collection(SynchronizedCollectionBase): @synchronized @retry_method def save_new(self, name=None, create_collection_record=True, owner_uuid=None, ensure_unique_name=False, num_retries=None): - """Commit pending buffer blocks to Keep, write the manifest to Keep, and create - a new collection record (if create_collection_record True). + """Save collection to a new collection record. + Commit pending buffer blocks to Keep, write the manifest to Keep, and + create a new collection record (if create_collection_record True). After creating a new collection record, this Collection object will be associated with the new record used by `save()`. :name: The collection name. - :keep_only: - Only save the manifest to keep, do not create a collection record. + :create_collection_record: + If True, create a collection record. If False, only save the manifest to keep. :owner_uuid: the user, or project uuid that will own this collection. @@ -1276,6 +1308,9 @@ class Collection(SynchronizedCollectionBase): if it conflicts with a collection with the same name and owner. If False, a name conflict will result in an error. + :num_retries: + Retry count on API calls (if None, use the collection default) + """ self._my_block_manager().commit_all() self._my_keep().put(self.manifest_text(strip=True), num_retries=num_retries) @@ -1322,9 +1357,6 @@ class Collection(SynchronizedCollectionBase): if len(self) > 0: raise ArgumentError("Can only import manifest into an empty collection") - save_sync = self.sync_mode() - self._sync = None - STREAM_NAME = 0 BLOCKS = 1 SEGMENTS = 2 @@ -1332,9 +1364,9 @@ class Collection(SynchronizedCollectionBase): stream_name = None state = STREAM_NAME - for n in re.finditer(r'(\S+)(\s+|$)', manifest_text): - tok = n.group(1) - sep = n.group(2) + for token_and_separator in re.finditer(r'(\S+)(\s+|$)', manifest_text): + tok = token_and_separator.group(1) + sep = token_and_separator.group(2) if state == STREAM_NAME: # starting a new stream @@ -1346,24 +1378,24 @@ class Collection(SynchronizedCollectionBase): continue if state == BLOCKS: - s = re.match(r'[0-9a-f]{32}\+(\d+)(\+\S+)*', tok) - if s: - blocksize = long(s.group(1)) + block_locator = re.match(r'[0-9a-f]{32}\+(\d+)(\+\S+)*', tok) + if block_locator: + blocksize = long(block_locator.group(1)) blocks.append(Range(tok, streamoffset, blocksize)) streamoffset += blocksize else: state = SEGMENTS if state == SEGMENTS: - s = re.search(r'^(\d+):(\d+):(\S+)', tok) - if s: - pos = long(s.group(1)) - size = long(s.group(2)) - name = s.group(3).replace('\\040', ' ') + file_segment = re.search(r'^(\d+):(\d+):(\S+)', tok) + if file_segment: + pos = long(file_segment.group(1)) + size = long(file_segment.group(2)) + name = file_segment.group(3).replace('\\040', ' ') filepath = os.path.join(stream_name, name) - f = self.find_or_create(filepath, FILE) - if isinstance(f, ArvadosFile): - f.add_segment(blocks, pos, size) + afile = self.find_or_create(filepath, FILE) + if isinstance(afile, ArvadosFile): + afile.add_segment(blocks, pos, size) else: raise errors.SyntaxError("File %s conflicts with stream of the same name.", filepath) else: @@ -1375,10 +1407,9 @@ class Collection(SynchronizedCollectionBase): state = STREAM_NAME self.set_unmodified() - self._sync = save_sync -class Subcollection(SynchronizedCollectionBase): +class Subcollection(RichCollectionBase): """This is a subdirectory within a collection that doesn't have its own API server record. @@ -1394,8 +1425,8 @@ class Subcollection(SynchronizedCollectionBase): def root_collection(self): return self.parent.root_collection() - def sync_mode(self): - return self.root_collection().sync_mode() + def writable(self): + return self.root_collection().writable() def _my_api(self): return self.root_collection()._my_api() @@ -1418,31 +1449,33 @@ class Subcollection(SynchronizedCollectionBase): @synchronized def clone(self, new_parent): c = Subcollection(new_parent) - self._cloneinto(c) + c._clonefrom(self) return c class CollectionReader(Collection): - """A read-only collection object from an api collection record locator, - a portable data hash of a manifest, or raw manifest text. + """A read-only collection object. - See `Collection` constructor for detailed options. + Initialize from an api collection record locator, a portable data hash of a + manifest, or raw manifest text. See `Collection` constructor for detailed + options. """ - def __init__(self, *args, **kwargs): - if not args and not kwargs.get("manifest_locator_or_text"): - raise errors.ArgumentError("Must provide manifest locator or text to initialize ReadOnlyCollection") - - super(CollectionReader, self).__init__(*args, **kwargs) + def __init__(self, manifest_locator_or_text, *args, **kwargs): + self._in_init = True + super(CollectionReader, self).__init__(manifest_locator_or_text, *args, **kwargs) + self._in_init = False # Forego any locking since it should never change once initialized. self.lock = NoopLock() - self._sync = SYNC_READONLY # Backwards compatability with old CollectionReader # all_streams() and all_files() self._streams = None + def writable(self): + return self._in_init + def _populate_streams(orig_func): @functools.wraps(orig_func) def populate_streams_wrapper(self, *args, **kwargs):