sdk/python/arvados/collection.py

   1 import functools
   2 import logging
   3 import os
   4 import re
   5 import errno
   6 import hashlib
   7 import time
   8 import threading
   9
  10 from collections import deque
  11 from stat import *
  12
  13 from .arvfile import split, _FileLikeObjectBase, ArvadosFile, ArvadosFileWriter, ArvadosFileReader, _BlockManager, synchronized, must_be_writable, NoopLock
  14 from keep import KeepLocator, KeepClient
  15 from .stream import StreamReader
  16 from ._normalize_stream import normalize_stream
  17 from ._ranges import Range, LocatorAndRange
  18 from .safeapi import ThreadSafeApiCache
  19 import config
  20 import errors
  21 import util
  22 import events
  23 from arvados.retry import retry_method
  24
  25 _logger = logging.getLogger('arvados.collection')
  26
  27 class CollectionBase(object):
  28     def __enter__(self):
  29         return self
  30
  31     def __exit__(self, exc_type, exc_value, traceback):
  32         pass
  33
  34     def _my_keep(self):
  35         if self._keep_client is None:
  36             self._keep_client = KeepClient(api_client=self._api_client,
  37                                            num_retries=self.num_retries)
  38         return self._keep_client
  39
  40     def stripped_manifest(self):
  41         """Get the manifest with locator hints stripped.
  42
  43         Return the manifest for the current collection with all
  44         non-portable hints (i.e., permission signatures and other
  45         hints other than size hints) removed from the locators.
  46         """
  47         raw = self.manifest_text()
  48         clean = []
  49         for line in raw.split("\n"):
  50             fields = line.split()
  51             if fields:
  52                 clean_fields = fields[:1] + [
  53                     (re.sub(r'\+[^\d][^\+]*', '', x)
  54                      if re.match(util.keep_locator_pattern, x)
  55                      else x)
  56                     for x in fields[1:]]
  57                 clean += [' '.join(clean_fields), "\n"]
  58         return ''.join(clean)
  59
  60
  61 class _WriterFile(_FileLikeObjectBase):
  62     def __init__(self, coll_writer, name):
  63         super(_WriterFile, self).__init__(name, 'wb')
  64         self.dest = coll_writer
  65
  66     def close(self):
  67         super(_WriterFile, self).close()
  68         self.dest.finish_current_file()
  69
  70     @_FileLikeObjectBase._before_close
  71     def write(self, data):
  72         self.dest.write(data)
  73
  74     @_FileLikeObjectBase._before_close
  75     def writelines(self, seq):
  76         for data in seq:
  77             self.write(data)
  78
  79     @_FileLikeObjectBase._before_close
  80     def flush(self):
  81         self.dest.flush_data()
  82
  83
  84 class CollectionWriter(CollectionBase):
  85     def __init__(self, api_client=None, num_retries=0, replication=None):
  86         """Instantiate a CollectionWriter.
  87
  88         CollectionWriter lets you build a new Arvados Collection from scratch.
  89         Write files to it.  The CollectionWriter will upload data to Keep as
  90         appropriate, and provide you with the Collection manifest text when
  91         you're finished.
  92
  93         Arguments:
  94         * api_client: The API client to use to look up Collections.  If not
  95           provided, CollectionReader will build one from available Arvados
  96           configuration.
  97         * num_retries: The default number of times to retry failed
  98           service requests.  Default 0.  You may change this value
  99           after instantiation, but note those changes may not
 100           propagate to related objects like the Keep client.
 101         * replication: The number of copies of each block to store.
 102           If this argument is None or not supplied, replication is
 103           the server-provided default if available, otherwise 2.
 104         """
 105         self._api_client = api_client
 106         self.num_retries = num_retries
 107         self.replication = (2 if replication is None else replication)
 108         self._keep_client = None
 109         self._data_buffer = []
 110         self._data_buffer_len = 0
 111         self._current_stream_files = []
 112         self._current_stream_length = 0
 113         self._current_stream_locators = []
 114         self._current_stream_name = '.'
 115         self._current_file_name = None
 116         self._current_file_pos = 0
 117         self._finished_streams = []
 118         self._close_file = None
 119         self._queued_file = None
 120         self._queued_dirents = deque()
 121         self._queued_trees = deque()
 122         self._last_open = None
 123
 124     def __exit__(self, exc_type, exc_value, traceback):
 125         if exc_type is None:
 126             self.finish()
 127
 128     def do_queued_work(self):
 129         # The work queue consists of three pieces:
 130         # * _queued_file: The file object we're currently writing to the
 131         #   Collection.
 132         # * _queued_dirents: Entries under the current directory
 133         #   (_queued_trees[0]) that we want to write or recurse through.
 134         #   This may contain files from subdirectories if
 135         #   max_manifest_depth == 0 for this directory.
 136         # * _queued_trees: Directories that should be written as separate
 137         #   streams to the Collection.
 138         # This function handles the smallest piece of work currently queued
 139         # (current file, then current directory, then next directory) until
 140         # no work remains.  The _work_THING methods each do a unit of work on
 141         # THING.  _queue_THING methods add a THING to the work queue.
 142         while True:
 143             if self._queued_file:
 144                 self._work_file()
 145             elif self._queued_dirents:
 146                 self._work_dirents()
 147             elif self._queued_trees:
 148                 self._work_trees()
 149             else:
 150                 break
 151
 152     def _work_file(self):
 153         while True:
 154             buf = self._queued_file.read(config.KEEP_BLOCK_SIZE)
 155             if not buf:
 156                 break
 157             self.write(buf)
 158         self.finish_current_file()
 159         if self._close_file:
 160             self._queued_file.close()
 161         self._close_file = None
 162         self._queued_file = None
 163
 164     def _work_dirents(self):
 165         path, stream_name, max_manifest_depth = self._queued_trees[0]
 166         if stream_name != self.current_stream_name():
 167             self.start_new_stream(stream_name)
 168         while self._queued_dirents:
 169             dirent = self._queued_dirents.popleft()
 170             target = os.path.join(path, dirent)
 171             if os.path.isdir(target):
 172                 self._queue_tree(target,
 173                                  os.path.join(stream_name, dirent),
 174                                  max_manifest_depth - 1)
 175             else:
 176                 self._queue_file(target, dirent)
 177                 break
 178         if not self._queued_dirents:
 179             self._queued_trees.popleft()
 180
 181     def _work_trees(self):
 182         path, stream_name, max_manifest_depth = self._queued_trees[0]
 183         d = util.listdir_recursive(
 184             path, max_depth = (None if max_manifest_depth == 0 else 0))
 185         if d:
 186             self._queue_dirents(stream_name, d)
 187         else:
 188             self._queued_trees.popleft()
 189
 190     def _queue_file(self, source, filename=None):
 191         assert (self._queued_file is None), "tried to queue more than one file"
 192         if not hasattr(source, 'read'):
 193             source = open(source, 'rb')
 194             self._close_file = True
 195         else:
 196             self._close_file = False
 197         if filename is None:
 198             filename = os.path.basename(source.name)
 199         self.start_new_file(filename)
 200         self._queued_file = source
 201
 202     def _queue_dirents(self, stream_name, dirents):
 203         assert (not self._queued_dirents), "tried to queue more than one tree"
 204         self._queued_dirents = deque(sorted(dirents))
 205
 206     def _queue_tree(self, path, stream_name, max_manifest_depth):
 207         self._queued_trees.append((path, stream_name, max_manifest_depth))
 208
 209     def write_file(self, source, filename=None):
 210         self._queue_file(source, filename)
 211         self.do_queued_work()
 212
 213     def write_directory_tree(self,
 214                              path, stream_name='.', max_manifest_depth=-1):
 215         self._queue_tree(path, stream_name, max_manifest_depth)
 216         self.do_queued_work()
 217
 218     def write(self, newdata):
 219         if hasattr(newdata, '__iter__'):
 220             for s in newdata:
 221                 self.write(s)
 222             return
 223         self._data_buffer.append(newdata)
 224         self._data_buffer_len += len(newdata)
 225         self._current_stream_length += len(newdata)
 226         while self._data_buffer_len >= config.KEEP_BLOCK_SIZE:
 227             self.flush_data()
 228
 229     def open(self, streampath, filename=None):
 230         """open(streampath[, filename]) -> file-like object
 231
 232         Pass in the path of a file to write to the Collection, either as a
 233         single string or as two separate stream name and file name arguments.
 234         This method returns a file-like object you can write to add it to the
 235         Collection.
 236
 237         You may only have one file object from the Collection open at a time,
 238         so be sure to close the object when you're done.  Using the object in
 239         a with statement makes that easy::
 240
 241           with cwriter.open('./doc/page1.txt') as outfile:
 242               outfile.write(page1_data)
 243           with cwriter.open('./doc/page2.txt') as outfile:
 244               outfile.write(page2_data)
 245         """
 246         if filename is None:
 247             streampath, filename = split(streampath)
 248         if self._last_open and not self._last_open.closed:
 249             raise errors.AssertionError(
 250                 "can't open '{}' when '{}' is still open".format(
 251                     filename, self._last_open.name))
 252         if streampath != self.current_stream_name():
 253             self.start_new_stream(streampath)
 254         self.set_current_file_name(filename)
 255         self._last_open = _WriterFile(self, filename)
 256         return self._last_open
 257
 258     def flush_data(self):
 259         data_buffer = ''.join(self._data_buffer)
 260         if data_buffer:
 261             self._current_stream_locators.append(
 262                 self._my_keep().put(
 263                     data_buffer[0:config.KEEP_BLOCK_SIZE],
 264                     copies=self.replication))
 265             self._data_buffer = [data_buffer[config.KEEP_BLOCK_SIZE:]]
 266             self._data_buffer_len = len(self._data_buffer[0])
 267
 268     def start_new_file(self, newfilename=None):
 269         self.finish_current_file()
 270         self.set_current_file_name(newfilename)
 271
 272     def set_current_file_name(self, newfilename):
 273         if re.search(r'[\t\n]', newfilename):
 274             raise errors.AssertionError(
 275                 "Manifest filenames cannot contain whitespace: %s" %
 276                 newfilename)
 277         elif re.search(r'\x00', newfilename):
 278             raise errors.AssertionError(
 279                 "Manifest filenames cannot contain NUL characters: %s" %
 280                 newfilename)
 281         self._current_file_name = newfilename
 282
 283     def current_file_name(self):
 284         return self._current_file_name
 285
 286     def finish_current_file(self):
 287         if self._current_file_name is None:
 288             if self._current_file_pos == self._current_stream_length:
 289                 return
 290             raise errors.AssertionError(
 291                 "Cannot finish an unnamed file " +
 292                 "(%d bytes at offset %d in '%s' stream)" %
 293                 (self._current_stream_length - self._current_file_pos,
 294                  self._current_file_pos,
 295                  self._current_stream_name))
 296         self._current_stream_files.append([
 297                 self._current_file_pos,
 298                 self._current_stream_length - self._current_file_pos,
 299                 self._current_file_name])
 300         self._current_file_pos = self._current_stream_length
 301         self._current_file_name = None
 302
 303     def start_new_stream(self, newstreamname='.'):
 304         self.finish_current_stream()
 305         self.set_current_stream_name(newstreamname)
 306
 307     def set_current_stream_name(self, newstreamname):
 308         if re.search(r'[\t\n]', newstreamname):
 309             raise errors.AssertionError(
 310                 "Manifest stream names cannot contain whitespace")
 311         self._current_stream_name = '.' if newstreamname=='' else newstreamname
 312
 313     def current_stream_name(self):
 314         return self._current_stream_name
 315
 316     def finish_current_stream(self):
 317         self.finish_current_file()
 318         self.flush_data()
 319         if not self._current_stream_files:
 320             pass
 321         elif self._current_stream_name is None:
 322             raise errors.AssertionError(
 323                 "Cannot finish an unnamed stream (%d bytes in %d files)" %
 324                 (self._current_stream_length, len(self._current_stream_files)))
 325         else:
 326             if not self._current_stream_locators:
 327                 self._current_stream_locators.append(config.EMPTY_BLOCK_LOCATOR)
 328             self._finished_streams.append([self._current_stream_name,
 329                                            self._current_stream_locators,
 330                                            self._current_stream_files])
 331         self._current_stream_files = []
 332         self._current_stream_length = 0
 333         self._current_stream_locators = []
 334         self._current_stream_name = None
 335         self._current_file_pos = 0
 336         self._current_file_name = None
 337
 338     def finish(self):
 339         """Store the manifest in Keep and return its locator.
 340
 341         This is useful for storing manifest fragments (task outputs)
 342         temporarily in Keep during a Crunch job.
 343
 344         In other cases you should make a collection instead, by
 345         sending manifest_text() to the API server's "create
 346         collection" endpoint.
 347         """
 348         return self._my_keep().put(self.manifest_text(), copies=self.replication)
 349
 350     def portable_data_hash(self):
 351         stripped = self.stripped_manifest()
 352         return hashlib.md5(stripped).hexdigest() + '+' + str(len(stripped))
 353
 354     def manifest_text(self):
 355         self.finish_current_stream()
 356         manifest = ''
 357
 358         for stream in self._finished_streams:
 359             if not re.search(r'^\.(/.*)?$', stream[0]):
 360                 manifest += './'
 361             manifest += stream[0].replace(' ', '\\040')
 362             manifest += ' ' + ' '.join(stream[1])
 363             manifest += ' ' + ' '.join("%d:%d:%s" % (sfile[0], sfile[1], sfile[2].replace(' ', '\\040')) for sfile in stream[2])
 364             manifest += "\n"
 365
 366         return manifest
 367
 368     def data_locators(self):
 369         ret = []
 370         for name, locators, files in self._finished_streams:
 371             ret += locators
 372         return ret
 373
 374
 375 class ResumableCollectionWriter(CollectionWriter):
 376     STATE_PROPS = ['_current_stream_files', '_current_stream_length',
 377                    '_current_stream_locators', '_current_stream_name',
 378                    '_current_file_name', '_current_file_pos', '_close_file',
 379                    '_data_buffer', '_dependencies', '_finished_streams',
 380                    '_queued_dirents', '_queued_trees']
 381
 382     def __init__(self, api_client=None, **kwargs):
 383         self._dependencies = {}
 384         super(ResumableCollectionWriter, self).__init__(api_client, **kwargs)
 385
 386     @classmethod
 387     def from_state(cls, state, *init_args, **init_kwargs):
 388         # Try to build a new writer from scratch with the given state.
 389         # If the state is not suitable to resume (because files have changed,
 390         # been deleted, aren't predictable, etc.), raise a
 391         # StaleWriterStateError.  Otherwise, return the initialized writer.
 392         # The caller is responsible for calling writer.do_queued_work()
 393         # appropriately after it's returned.
 394         writer = cls(*init_args, **init_kwargs)
 395         for attr_name in cls.STATE_PROPS:
 396             attr_value = state[attr_name]
 397             attr_class = getattr(writer, attr_name).__class__
 398             # Coerce the value into the same type as the initial value, if
 399             # needed.
 400             if attr_class not in (type(None), attr_value.__class__):
 401                 attr_value = attr_class(attr_value)
 402             setattr(writer, attr_name, attr_value)
 403         # Check dependencies before we try to resume anything.
 404         if any(KeepLocator(ls).permission_expired()
 405                for ls in writer._current_stream_locators):
 406             raise errors.StaleWriterStateError(
 407                 "locators include expired permission hint")
 408         writer.check_dependencies()
 409         if state['_current_file'] is not None:
 410             path, pos = state['_current_file']
 411             try:
 412                 writer._queued_file = open(path, 'rb')
 413                 writer._queued_file.seek(pos)
 414             except IOError as error:
 415                 raise errors.StaleWriterStateError(
 416                     "failed to reopen active file {}: {}".format(path, error))
 417         return writer
 418
 419     def check_dependencies(self):
 420         for path, orig_stat in self._dependencies.items():
 421             if not S_ISREG(orig_stat[ST_MODE]):
 422                 raise errors.StaleWriterStateError("{} not file".format(path))
 423             try:
 424                 now_stat = tuple(os.stat(path))
 425             except OSError as error:
 426                 raise errors.StaleWriterStateError(
 427                     "failed to stat {}: {}".format(path, error))
 428             if ((not S_ISREG(now_stat[ST_MODE])) or
 429                 (orig_stat[ST_MTIME] != now_stat[ST_MTIME]) or
 430                 (orig_stat[ST_SIZE] != now_stat[ST_SIZE])):
 431                 raise errors.StaleWriterStateError("{} changed".format(path))
 432
 433     def dump_state(self, copy_func=lambda x: x):
 434         state = {attr: copy_func(getattr(self, attr))
 435                  for attr in self.STATE_PROPS}
 436         if self._queued_file is None:
 437             state['_current_file'] = None
 438         else:
 439             state['_current_file'] = (os.path.realpath(self._queued_file.name),
 440                                       self._queued_file.tell())
 441         return state
 442
 443     def _queue_file(self, source, filename=None):
 444         try:
 445             src_path = os.path.realpath(source)
 446         except Exception:
 447             raise errors.AssertionError("{} not a file path".format(source))
 448         try:
 449             path_stat = os.stat(src_path)
 450         except OSError as stat_error:
 451             path_stat = None
 452         super(ResumableCollectionWriter, self)._queue_file(source, filename)
 453         fd_stat = os.fstat(self._queued_file.fileno())
 454         if not S_ISREG(fd_stat.st_mode):
 455             # We won't be able to resume from this cache anyway, so don't
 456             # worry about further checks.
 457             self._dependencies[source] = tuple(fd_stat)
 458         elif path_stat is None:
 459             raise errors.AssertionError(
 460                 "could not stat {}: {}".format(source, stat_error))
 461         elif path_stat.st_ino != fd_stat.st_ino:
 462             raise errors.AssertionError(
 463                 "{} changed between open and stat calls".format(source))
 464         else:
 465             self._dependencies[src_path] = tuple(fd_stat)
 466
 467     def write(self, data):
 468         if self._queued_file is None:
 469             raise errors.AssertionError(
 470                 "resumable writer can't accept unsourced data")
 471         return super(ResumableCollectionWriter, self).write(data)
 472
 473
 474 ADD = "add"
 475 DEL = "del"
 476 MOD = "mod"
 477 FILE = "file"
 478 COLLECTION = "collection"
 479
 480 class RichCollectionBase(CollectionBase):
 481     """Base class for Collections and Subcollections.
 482
 483     Implements the majority of functionality relating to accessing items in the
 484     Collection.
 485
 486     """
 487
 488     def __init__(self, parent=None):
 489         self.parent = parent
 490         self._modified = True
 491         self._items = {}
 492
 493     def _my_api(self):
 494         raise NotImplementedError()
 495
 496     def _my_keep(self):
 497         raise NotImplementedError()
 498
 499     def _my_block_manager(self):
 500         raise NotImplementedError()
 501
 502     def writable(self):
 503         raise NotImplementedError()
 504
 505     def root_collection(self):
 506         raise NotImplementedError()
 507
 508     def notify(self, event, collection, name, item):
 509         raise NotImplementedError()
 510
 511     def stream_name(self):
 512         raise NotImplementedError()
 513
 514     @must_be_writable
 515     @synchronized
 516     def find_or_create(self, path, create_type):
 517         """Recursively search the specified file path.
 518
 519         May return either a `Collection` or `ArvadosFile`.  If not found, will
 520         create a new item at the specified path based on `create_type`.  Will
 521         create intermediate subcollections needed to contain the final item in
 522         the path.
 523
 524         :create_type:
 525           One of `arvados.collection.FILE` or
 526           `arvados.collection.COLLECTION`.  If the path is not found, and value
 527           of create_type is FILE then create and return a new ArvadosFile for
 528           the last path component.  If COLLECTION, then create and return a new
 529           Collection for the last path component.
 530
 531         """
 532
 533         pathcomponents = path.split("/", 1)
 534         if pathcomponents[0]:
 535             item = self._items.get(pathcomponents[0])
 536             if len(pathcomponents) == 1:
 537                 if item is None:
 538                     # create new file
 539                     if create_type == COLLECTION:
 540                         item = Subcollection(self)
 541                     else:
 542                         item = ArvadosFile(self)
 543                     self._items[pathcomponents[0]] = item
 544                     self._modified = True
 545                     self.notify(ADD, self, pathcomponents[0], item)
 546                 return item
 547             else:
 548                 if item is None:
 549                     # create new collection
 550                     item = Subcollection(self)
 551                     self._items[pathcomponents[0]] = item
 552                     self._modified = True
 553                     self.notify(ADD, self, pathcomponents[0], item)
 554                 if isinstance(item, RichCollectionBase):
 555                     return item.find_or_create(pathcomponents[1], create_type)
 556                 else:
 557                     raise IOError((errno.ENOTDIR, "Interior path components must be subcollection"))
 558         else:
 559             return self
 560
 561     @synchronized
 562     def find(self, path):
 563         """Recursively search the specified file path.
 564
 565         May return either a Collection or ArvadosFile.  Return None if not
 566         found.
 567
 568         """
 569         if not path:
 570             raise errors.ArgumentError("Parameter 'path' must not be empty.")
 571
 572         pathcomponents = path.split("/", 1)
 573         item = self._items.get(pathcomponents[0])
 574         if len(pathcomponents) == 1:
 575             return item
 576         else:
 577             if isinstance(item, RichCollectionBase):
 578                 if pathcomponents[1]:
 579                     return item.find(pathcomponents[1])
 580                 else:
 581                     return item
 582             else:
 583                 raise IOError((errno.ENOTDIR, "Interior path components must be subcollection"))
 584
 585     def mkdirs(path):
 586         """Recursive subcollection create.
 587
 588         Like `os.mkdirs()`.  Will create intermediate subcollections needed to
 589         contain the leaf subcollection path.
 590
 591         """
 592         return self.find_or_create(path, COLLECTION)
 593
 594     def open(self, path, mode="r"):
 595         """Open a file-like object for access.
 596
 597         :path:
 598           path to a file in the collection
 599         :mode:
 600           one of "r", "r+", "w", "w+", "a", "a+"
 601           :"r":
 602             opens for reading
 603           :"r+":
 604             opens for reading and writing.  Reads/writes share a file pointer.
 605           :"w", "w+":
 606             truncates to 0 and opens for reading and writing.  Reads/writes share a file pointer.
 607           :"a", "a+":
 608             opens for reading and writing.  All writes are appended to
 609             the end of the file.  Writing does not affect the file pointer for
 610             reading.
 611         """
 612         mode = mode.replace("b", "")
 613         if len(mode) == 0 or mode[0] not in ("r", "w", "a"):
 614             raise errors.ArgumentError("Bad mode '%s'" % mode)
 615         create = (mode != "r")
 616
 617         if create and not self.writable():
 618             raise IOError((errno.EROFS, "Collection is read only"))
 619
 620         if create:
 621             arvfile = self.find_or_create(path, FILE)
 622         else:
 623             arvfile = self.find(path)
 624
 625         if arvfile is None:
 626             raise IOError((errno.ENOENT, "File not found"))
 627         if not isinstance(arvfile, ArvadosFile):
 628             raise IOError((errno.EISDIR, "Path must refer to a file."))
 629
 630         if mode[0] == "w":
 631             arvfile.truncate(0)
 632
 633         name = os.path.basename(path)
 634
 635         if mode == "r":
 636             return ArvadosFileReader(arvfile, name, mode, num_retries=self.num_retries)
 637         else:
 638             return ArvadosFileWriter(arvfile, name, mode, num_retries=self.num_retries)
 639
 640     @synchronized
 641     def modified(self):
 642         """Test if the collection (or any subcollection or file) has been modified."""
 643         if self._modified:
 644             return True
 645         for k,v in self._items.items():
 646             if v.modified():
 647                 return True
 648         return False
 649
 650     @synchronized
 651     def set_unmodified(self):
 652         """Recursively clear modified flag."""
 653         self._modified = False
 654         for k,v in self._items.items():
 655             v.set_unmodified()
 656
 657     @synchronized
 658     def __iter__(self):
 659         """Iterate over names of files and collections contained in this collection."""
 660         return iter(self._items.keys())
 661
 662     @synchronized
 663     def __getitem__(self, k):
 664         """Get a file or collection that is directly contained by this collection.
 665
 666         If you want to search a path, use `find()` instead.
 667
 668         """
 669         return self._items[k]
 670
 671     @synchronized
 672     def __contains__(self, k):
 673         """Test if there is a file or collection a directly contained by this collection."""
 674         return k in self._items
 675
 676     @synchronized
 677     def __len__(self):
 678         """Get the number of items directly contained in this collection."""
 679         return len(self._items)
 680
 681     @must_be_writable
 682     @synchronized
 683     def __delitem__(self, p):
 684         """Delete an item by name which is directly contained by this collection."""
 685         del self._items[p]
 686         self._modified = True
 687         self.notify(DEL, self, p, None)
 688
 689     @synchronized
 690     def keys(self):
 691         """Get a list of names of files and collections directly contained in this collection."""
 692         return self._items.keys()
 693
 694     @synchronized
 695     def values(self):
 696         """Get a list of files and collection objects directly contained in this collection."""
 697         return self._items.values()
 698
 699     @synchronized
 700     def items(self):
 701         """Get a list of (name, object) tuples directly contained in this collection."""
 702         return self._items.items()
 703
 704     def exists(self, path):
 705         """Test if there is a file or collection at `path`."""
 706         return self.find(path) is not None
 707
 708     @must_be_writable
 709     @synchronized
 710     def remove(self, path, recursive=False):
 711         """Remove the file or subcollection (directory) at `path`.
 712
 713         :recursive:
 714           Specify whether to remove non-empty subcollections (True), or raise an error (False).
 715         """
 716
 717         if not path:
 718             raise errors.ArgumentError("Parameter 'path' must not be empty.")
 719
 720         pathcomponents = path.split("/", 1)
 721         item = self._items.get(pathcomponents[0])
 722         if item is None:
 723             raise IOError((errno.ENOENT, "File not found"))
 724         if len(pathcomponents) == 1:
 725             if isinstance(self._items[pathcomponents[0]], RichCollectionBase) and len(self._items[pathcomponents[0]]) > 0 and not recursive:
 726                 raise IOError((errno.ENOTEMPTY, "Subcollection not empty"))
 727             deleteditem = self._items[pathcomponents[0]]
 728             del self._items[pathcomponents[0]]
 729             self._modified = True
 730             self.notify(DEL, self, pathcomponents[0], deleteditem)
 731         else:
 732             item.remove(pathcomponents[1])
 733
 734     def _clonefrom(self, source):
 735         for k,v in source.items():
 736             self._items[k] = v.clone(self)
 737
 738     def clone(self):
 739         raise NotImplementedError()
 740
 741     @must_be_writable
 742     @synchronized
 743     def add(self, source_obj, target_name, overwrite=False):
 744         """Copy a file or subcollection to this collection.
 745
 746         :source_obj:
 747           An ArvadosFile, or Subcollection object
 748
 749         :target_name:
 750           Destination item name.  If the target name already exists and is a
 751           file, this will raise an error unless you specify `overwrite=True`.
 752
 753         :overwrite:
 754           Whether to overwrite target file if it already exists.
 755
 756         """
 757
 758         if target_name in self and not overwrite:
 759             raise IOError((errno.EEXIST, "File already exists"))
 760
 761         modified_from = None
 762         if target_name in self:
 763             modified_from = self[target_name]
 764
 765         # Actually make the copy.
 766         dup = source_obj.clone(self)
 767         self._items[target_name] = dup
 768         self._modified = True
 769
 770         if modified_from:
 771             self.notify(MOD, self, target_name, (modified_from, dup))
 772         else:
 773             self.notify(ADD, self, target_name, dup)
 774
 775     @must_be_writable
 776     @synchronized
 777     def copy(self, source, target_path, source_collection=None, overwrite=False):
 778         """Copy a file or subcollection to a new path in this collection.
 779
 780         :source:
 781           A string with a path to source file or subcollection, or an actual ArvadosFile or Subcollection object.
 782
 783         :target_path:
 784           Destination file or path.  If the target path already exists and is a
 785           subcollection, the item will be placed inside the subcollection.  If
 786           the target path already exists and is a file, this will raise an error
 787           unless you specify `overwrite=True`.
 788
 789         :source_collection:
 790           Collection to copy `source_path` from (default `self`)
 791
 792         :overwrite:
 793           Whether to overwrite target file if it already exists.
 794         """
 795         if source_collection is None:
 796             source_collection = self
 797
 798         # Find the object to copy
 799         if isinstance(source, basestring):
 800             source_obj = source_collection.find(source)
 801             if source_obj is None:
 802                 raise IOError((errno.ENOENT, "File not found"))
 803             sourcecomponents = source.split("/")
 804         else:
 805             source_obj = source
 806             sourcecomponents = None
 807
 808         # Find parent collection the target path
 809         targetcomponents = target_path.split("/")
 810
 811         # Determine the name to use.
 812         target_name = targetcomponents[-1] if targetcomponents[-1] else (sourcecomponents[-1] if sourcecomponents else None)
 813
 814         if not target_name:
 815             raise errors.ArgumentError("Target path is empty and source is an object.  Cannot determine destination filename to use.")
 816
 817         target_dir = self.find_or_create("/".join(targetcomponents[0:-1]), COLLECTION)
 818
 819         if target_name in target_dir and isinstance(self[target_name], RichCollectionBase) and sourcecomponents:
 820             target_dir = target_dir[target_name]
 821             target_name = sourcecomponents[-1]
 822
 823         target_dir.add(source_obj, target_name, overwrite)
 824
 825     @synchronized
 826     def manifest_text(self, stream_name=".", strip=False, normalize=False):
 827         """Get the manifest text for this collection, sub collections and files.
 828
 829         :stream_name:
 830           Name of the stream (directory)
 831
 832         :strip:
 833           If True, remove signing tokens from block locators if present.
 834           If False (default), block locators are left unchanged.
 835
 836         :normalize:
 837           If True, always export the manifest text in normalized form
 838           even if the Collection is not modified.  If False (default) and the collection
 839           is not modified, return the original manifest text even if it is not
 840           in normalized form.
 841
 842         """
 843
 844         if self.modified() or self._manifest_text is None or normalize:
 845             stream = {}
 846             buf = []
 847             sorted_keys = sorted(self.keys())
 848             for filename in [s for s in sorted_keys if isinstance(self[s], ArvadosFile)]:
 849                 # Create a stream per file `k`
 850                 arvfile = self[filename]
 851                 filestream = []
 852                 for segment in arvfile.segments():
 853                     loc = segment.locator
 854                     if arvfile.parent._my_block_manager().is_bufferblock(loc):
 855                         loc = arvfile.parent._my_block_manager().get_bufferblock(loc).locator()
 856                     if strip:
 857                         loc = KeepLocator(loc).stripped()
 858                     filestream.append(LocatorAndRange(loc, KeepLocator(loc).size,
 859                                          segment.segment_offset, segment.range_size))
 860                 stream[filename] = filestream
 861             if stream:
 862                 buf.append(" ".join(normalize_stream(stream_name, stream)) + "\n")
 863             for dirname in [s for s in sorted_keys if isinstance(self[s], RichCollectionBase)]:
 864                 buf.append(self[dirname].manifest_text(stream_name=os.path.join(stream_name, dirname), strip=strip))
 865             return "".join(buf)
 866         else:
 867             if strip:
 868                 return self.stripped_manifest()
 869             else:
 870                 return self._manifest_text
 871
 872     @synchronized
 873     def diff(self, end_collection, prefix=".", holding_collection=None):
 874         """Generate list of add/modify/delete actions.
 875
 876         When given to `apply`, will change `self` to match `end_collection`
 877
 878         """
 879         changes = []
 880         if holding_collection is None:
 881             holding_collection = Collection(api_client=self._my_api(), keep_client=self._my_keep())
 882         for k in self:
 883             if k not in end_collection:
 884                changes.append((DEL, os.path.join(prefix, k), self[k].clone(holding_collection)))
 885         for k in end_collection:
 886             if k in self:
 887                 if isinstance(end_collection[k], Subcollection) and isinstance(self[k], Subcollection):
 888                     changes.extend(self[k].diff(end_collection[k], os.path.join(prefix, k), holding_collection))
 889                 elif end_collection[k] != self[k]:
 890                     changes.append((MOD, os.path.join(prefix, k), self[k].clone(holding_collection), end_collection[k].clone(holding_collection)))
 891             else:
 892                 changes.append((ADD, os.path.join(prefix, k), end_collection[k].clone(holding_collection)))
 893         return changes
 894
 895     @must_be_writable
 896     @synchronized
 897     def apply(self, changes):
 898         """Apply changes from `diff`.
 899
 900         If a change conflicts with a local change, it will be saved to an
 901         alternate path indicating the conflict.
 902
 903         """
 904         for change in changes:
 905             event_type = change[0]
 906             path = change[1]
 907             initial = change[2]
 908             local = self.find(path)
 909             conflictpath = "%s~conflict-%s~" % (path, time.strftime("%Y-%m-%d-%H:%M:%S",
 910                                                                     time.gmtime()))
 911             if event_type == ADD:
 912                 if local is None:
 913                     # No local file at path, safe to copy over new file
 914                     self.copy(initial, path)
 915                 elif local is not None and local != initial:
 916                     # There is already local file and it is different:
 917                     # save change to conflict file.
 918                     self.copy(initial, conflictpath)
 919             elif event_type == MOD:
 920                 final = change[3]
 921                 if local == initial:
 922                     # Local matches the "initial" item so it has not
 923                     # changed locally and is safe to update.
 924                     if isinstance(local, ArvadosFile) and isinstance(final, ArvadosFile):
 925                         # Replace contents of local file with new contents
 926                         local.replace_contents(final)
 927                     else:
 928                         # Overwrite path with new item; this can happen if
 929                         # path was a file and is now a collection or vice versa
 930                         self.copy(final, path, overwrite=True)
 931                 else:
 932                     # Local is missing (presumably deleted) or local doesn't
 933                     # match the "start" value, so save change to conflict file
 934                     self.copy(final, conflictpath)
 935             elif event_type == DEL:
 936                 if local == initial:
 937                     # Local item matches "initial" value, so it is safe to remove.
 938                     self.remove(path, recursive=True)
 939                 # else, the file is modified or already removed, in either
 940                 # case we don't want to try to remove it.
 941
 942     def portable_data_hash(self):
 943         """Get the portable data hash for this collection's manifest."""
 944         stripped = self.manifest_text(strip=True)
 945         return hashlib.md5(stripped).hexdigest() + '+' + str(len(stripped))
 946
 947     @synchronized
 948     def __eq__(self, other):
 949         if other is self:
 950             return True
 951         if not isinstance(other, RichCollectionBase):
 952             return False
 953         if len(self._items) != len(other):
 954             return False
 955         for k in self._items:
 956             if k not in other:
 957                 return False
 958             if self._items[k] != other[k]:
 959                 return False
 960         return True
 961
 962     def __ne__(self, other):
 963         return not self.__eq__(other)
 964
 965
 966 class Collection(RichCollectionBase):
 967     """Represents the root of an Arvados Collection.
 968
 969     This class is threadsafe.  The root collection object, all subcollections
 970     and files are protected by a single lock (i.e. each access locks the entire
 971     collection).
 972
 973     Brief summary of
 974     useful methods:
 975
 976     :To read an existing file:
 977       `c.open("myfile", "r")`
 978
 979     :To write a new file:
 980       `c.open("myfile", "w")`
 981
 982     :To determine if a file exists:
 983       `c.find("myfile") is not None`
 984
 985     :To copy a file:
 986       `c.copy("source", "dest")`
 987
 988     :To delete a file:
 989       `c.remove("myfile")`
 990
 991     :To save to an existing collection record:
 992       `c.save()`
 993
 994     :To save a new collection record:
 995     `c.save_new()`
 996
 997     :To merge remote changes into this object:
 998       `c.update()`
 999
1000     Must be associated with an API server Collection record (during
1001     initialization, or using `save_new`) to use `save` or `update`
1002
1003     """
1004
1005     def __init__(self, manifest_locator_or_text=None,
1006                  api_client=None,
1007                  keep_client=None,
1008                  num_retries=None,
1009                  parent=None,
1010                  apiconfig=None,
1011                  block_manager=None):
1012         """Collection constructor.
1013
1014         :manifest_locator_or_text:
1015           One of Arvados collection UUID, block locator of
1016           a manifest, raw manifest text, or None (to create an empty collection).
1017         :parent:
1018           the parent Collection, may be None.
1019         :apiconfig:
1020           A dict containing keys for ARVADOS_API_HOST and ARVADOS_API_TOKEN.
1021           Prefer this over supplying your own api_client and keep_client (except in testing).
1022           Will use default config settings if not specified.
1023         :api_client:
1024           The API client object to use for requests.  If not specified, create one using `apiconfig`.
1025         :keep_client:
1026           the Keep client to use for requests.  If not specified, create one using `apiconfig`.
1027         :num_retries:
1028           the number of retries for API and Keep requests.
1029         :block_manager:
1030           the block manager to use.  If not specified, create one.
1031
1032         """
1033         super(Collection, self).__init__(parent)
1034         self._api_client = api_client
1035         self._keep_client = keep_client
1036         self._block_manager = block_manager
1037
1038         if apiconfig:
1039             self._config = apiconfig
1040         else:
1041             self._config = config.settings()
1042
1043         self.num_retries = num_retries if num_retries is not None else 0
1044         self._manifest_locator = None
1045         self._manifest_text = None
1046         self._api_response = None
1047
1048         self.lock = threading.RLock()
1049         self.callbacks = []
1050         self.events = None
1051
1052         if manifest_locator_or_text:
1053             if re.match(util.keep_locator_pattern, manifest_locator_or_text):
1054                 self._manifest_locator = manifest_locator_or_text
1055             elif re.match(util.collection_uuid_pattern, manifest_locator_or_text):
1056                 self._manifest_locator = manifest_locator_or_text
1057             elif re.match(util.manifest_pattern, manifest_locator_or_text):
1058                 self._manifest_text = manifest_locator_or_text
1059             else:
1060                 raise errors.ArgumentError(
1061                     "Argument to CollectionReader must be a manifest or a collection UUID")
1062
1063             try:
1064                 self._populate()
1065             except (IOError, errors.SyntaxError) as e:
1066                 raise errors.ArgumentError("Error processing manifest text: %s", e)
1067
1068     def root_collection(self):
1069         return self
1070
1071     def stream_name(self):
1072         return "."
1073
1074     def writable(self):
1075         return True
1076
1077     @synchronized
1078     @retry_method
1079     def update(self, other=None, num_retries=None):
1080         """Merge the latest collection on the API server with the current collection."""
1081
1082         if other is None:
1083             if self._manifest_locator is None:
1084                 raise errors.ArgumentError("`other` is None but collection does not have a manifest_locator uuid")
1085             response = self._my_api().collections().get(uuid=self._manifest_locator).execute(num_retries=num_retries)
1086             other = CollectionReader(response["manifest_text"])
1087         baseline = CollectionReader(self._manifest_text)
1088         self.apply(baseline.diff(other))
1089
1090     @synchronized
1091     def _my_api(self):
1092         if self._api_client is None:
1093             self._api_client = ThreadSafeApiCache(self._config)
1094             self._keep_client = self._api_client.keep
1095         return self._api_client
1096
1097     @synchronized
1098     def _my_keep(self):
1099         if self._keep_client is None:
1100             if self._api_client is None:
1101                 self._my_api()
1102             else:
1103                 self._keep_client = KeepClient(api_client=self._api_client)
1104         return self._keep_client
1105
1106     @synchronized
1107     def _my_block_manager(self):
1108         if self._block_manager is None:
1109             self._block_manager = _BlockManager(self._my_keep())
1110         return self._block_manager
1111
1112     def _populate_from_api_server(self):
1113         # As in KeepClient itself, we must wait until the last
1114         # possible moment to instantiate an API client, in order to
1115         # avoid tripping up clients that don't have access to an API
1116         # server.  If we do build one, make sure our Keep client uses
1117         # it.  If instantiation fails, we'll fall back to the except
1118         # clause, just like any other Collection lookup
1119         # failure. Return an exception, or None if successful.
1120         try:
1121             self._api_response = self._my_api().collections().get(
1122                 uuid=self._manifest_locator).execute(
1123                     num_retries=self.num_retries)
1124             self._manifest_text = self._api_response['manifest_text']
1125             return None
1126         except Exception as e:
1127             return e
1128
1129     def _populate_from_keep(self):
1130         # Retrieve a manifest directly from Keep. This has a chance of
1131         # working if [a] the locator includes a permission signature
1132         # or [b] the Keep services are operating in world-readable
1133         # mode. Return an exception, or None if successful.
1134         try:
1135             self._manifest_text = self._my_keep().get(
1136                 self._manifest_locator, num_retries=self.num_retries)
1137         except Exception as e:
1138             return e
1139
1140     def _populate(self):
1141         if self._manifest_locator is None and self._manifest_text is None:
1142             return
1143         error_via_api = None
1144         error_via_keep = None
1145         should_try_keep = ((self._manifest_text is None) and
1146                            util.keep_locator_pattern.match(
1147                                self._manifest_locator))
1148         if ((self._manifest_text is None) and
1149             util.signed_locator_pattern.match(self._manifest_locator)):
1150             error_via_keep = self._populate_from_keep()
1151         if self._manifest_text is None:
1152             error_via_api = self._populate_from_api_server()
1153             if error_via_api is not None and not should_try_keep:
1154                 raise error_via_api
1155         if ((self._manifest_text is None) and
1156             not error_via_keep and
1157             should_try_keep):
1158             # Looks like a keep locator, and we didn't already try keep above
1159             error_via_keep = self._populate_from_keep()
1160         if self._manifest_text is None:
1161             # Nothing worked!
1162             raise errors.NotFoundError(
1163                 ("Failed to retrieve collection '{}' " +
1164                  "from either API server ({}) or Keep ({})."
1165                  ).format(
1166                     self._manifest_locator,
1167                     error_via_api,
1168                     error_via_keep))
1169         # populate
1170         self._baseline_manifest = self._manifest_text
1171         self._import_manifest(self._manifest_text)
1172
1173
1174     def _has_collection_uuid(self):
1175         return self._manifest_locator is not None and re.match(util.collection_uuid_pattern, self._manifest_locator)
1176
1177     def __enter__(self):
1178         return self
1179
1180     def __exit__(self, exc_type, exc_value, traceback):
1181         """Support scoped auto-commit in a with: block."""
1182         if exc_type is not None:
1183             if self.writable() and self._has_collection_uuid():
1184                 self.save()
1185         if self._block_manager is not None:
1186             self._block_manager.stop_threads()
1187
1188     @synchronized
1189     def manifest_locator(self):
1190         """Get the manifest locator, if any.
1191
1192         The manifest locator will be set when the collection is loaded from an
1193         API server record or the portable data hash of a manifest.
1194
1195         The manifest locator will be None if the collection is newly created or
1196         was created directly from manifest text.  The method `save_new()` will
1197         assign a manifest locator.
1198
1199         """
1200         return self._manifest_locator
1201
1202     @synchronized
1203     def clone(self, new_parent=None, readonly=False, new_config=None):
1204         if new_config is None:
1205             new_config = self._config
1206         if readonly:
1207             newcollection = CollectionReader(parent=new_parent, apiconfig=new_config)
1208         else:
1209             newcollection = Collection(parent=new_parent, apiconfig=new_config)
1210
1211         newcollection._clonefrom(self)
1212         return newcollection
1213
1214     @synchronized
1215     def api_response(self):
1216         """Returns information about this Collection fetched from the API server.
1217
1218         If the Collection exists in Keep but not the API server, currently
1219         returns None.  Future versions may provide a synthetic response.
1220
1221         """
1222         return self._api_response
1223
1224     def find_or_create(self, path, create_type):
1225         """See `RichCollectionBase.find_or_create`"""
1226         if path == ".":
1227             return self
1228         else:
1229             return super(Collection, self).find_or_create(path[2:] if path.startswith("./") else path, create_type)
1230
1231     def find(self, path):
1232         """See `RichCollectionBase.find`"""
1233         if path == ".":
1234             return self
1235         else:
1236             return super(Collection, self).find(path[2:] if path.startswith("./") else path)
1237
1238     def remove(self, path, recursive=False):
1239         """See `RichCollectionBase.remove`"""
1240         if path == ".":
1241             raise errors.ArgumentError("Cannot remove '.'")
1242         else:
1243             return super(Collection, self).remove(path[2:] if path.startswith("./") else path, recursive)
1244
1245     @must_be_writable
1246     @synchronized
1247     @retry_method
1248     def save(self, merge=True, num_retries=None):
1249         """Save collection to an existing collection record.
1250
1251         Commit pending buffer blocks to Keep, merge with remote record (if
1252         merge=True, the default), write the manifest to Keep, and update the
1253         collection record.
1254
1255         Will raise AssertionError if not associated with a collection record on
1256         the API server.  If you want to save a manifest to Keep only, see
1257         `save_new()`.
1258
1259         :merge:
1260           Update and merge remote changes before saving.  Otherwise, any
1261           remote changes will be ignored and overwritten.
1262
1263         :num_retries:
1264           Retry count on API calls (if None,  use the collection default)
1265
1266         """
1267         if self.modified():
1268             if not self._has_collection_uuid():
1269                 raise AssertionError("Collection manifest_locator must be a collection uuid.  Use save_new() for new collections.")
1270             self._my_block_manager().commit_all()
1271             if merge:
1272                 self.update()
1273             self._my_keep().put(self.manifest_text(strip=True), num_retries=num_retries)
1274
1275             text = self.manifest_text(strip=False)
1276             self._api_response = self._my_api().collections().update(
1277                 uuid=self._manifest_locator,
1278                 body={'manifest_text': text}
1279                 ).execute(
1280                     num_retries=num_retries)
1281             self._manifest_text = self._api_response["manifest_text"]
1282             self.set_unmodified()
1283
1284
1285     @must_be_writable
1286     @synchronized
1287     @retry_method
1288     def save_new(self, name=None, create_collection_record=True, owner_uuid=None, ensure_unique_name=False, num_retries=None):
1289         """Save collection to a new collection record.
1290
1291         Commit pending buffer blocks to Keep, write the manifest to Keep, and
1292         create a new collection record (if create_collection_record True).
1293         After creating a new collection record, this Collection object will be
1294         associated with the new record used by `save()`.
1295
1296         :name:
1297           The collection name.
1298
1299         :create_collection_record:
1300           If True, create a collection record.  If False, only save the manifest to keep.
1301
1302         :owner_uuid:
1303           the user, or project uuid that will own this collection.
1304           If None, defaults to the current user.
1305
1306         :ensure_unique_name:
1307           If True, ask the API server to rename the collection
1308           if it conflicts with a collection with the same name and owner.  If
1309           False, a name conflict will result in an error.
1310
1311         :num_retries:
1312           Retry count on API calls (if None,  use the collection default)
1313
1314         """
1315         self._my_block_manager().commit_all()
1316         self._my_keep().put(self.manifest_text(strip=True), num_retries=num_retries)
1317         text = self.manifest_text(strip=False)
1318
1319         if create_collection_record:
1320             if name is None:
1321                 name = "Collection created %s" % (time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()))
1322
1323             body = {"manifest_text": text,
1324                     "name": name}
1325             if owner_uuid:
1326                 body["owner_uuid"] = owner_uuid
1327
1328             self._api_response = self._my_api().collections().create(ensure_unique_name=ensure_unique_name, body=body).execute(num_retries=num_retries)
1329             text = self._api_response["manifest_text"]
1330
1331             self._manifest_locator = self._api_response["uuid"]
1332
1333         self._manifest_text = text
1334         self.set_unmodified()
1335
1336     @synchronized
1337     def subscribe(self, callback):
1338         self.callbacks.append(callback)
1339
1340     @synchronized
1341     def unsubscribe(self, callback):
1342         self.callbacks.remove(callback)
1343
1344     @synchronized
1345     def notify(self, event, collection, name, item):
1346         for c in self.callbacks:
1347             c(event, collection, name, item)
1348
1349     @synchronized
1350     def _import_manifest(self, manifest_text):
1351         """Import a manifest into a `Collection`.
1352
1353         :manifest_text:
1354           The manifest text to import from.
1355
1356         """
1357         if len(self) > 0:
1358             raise ArgumentError("Can only import manifest into an empty collection")
1359
1360         STREAM_NAME = 0
1361         BLOCKS = 1
1362         SEGMENTS = 2
1363
1364         stream_name = None
1365         state = STREAM_NAME
1366
1367         for token_and_separator in re.finditer(r'(\S+)(\s+|$)', manifest_text):
1368             tok = token_and_separator.group(1)
1369             sep = token_and_separator.group(2)
1370
1371             if state == STREAM_NAME:
1372                 # starting a new stream
1373                 stream_name = tok.replace('\\040', ' ')
1374                 blocks = []
1375                 segments = []
1376                 streamoffset = 0L
1377                 state = BLOCKS
1378                 continue
1379
1380             if state == BLOCKS:
1381                 block_locator = re.match(r'[0-9a-f]{32}\+(\d+)(\+\S+)*', tok)
1382                 if block_locator:
1383                     blocksize = long(block_locator.group(1))
1384                     blocks.append(Range(tok, streamoffset, blocksize))
1385                     streamoffset += blocksize
1386                 else:
1387                     state = SEGMENTS
1388
1389             if state == SEGMENTS:
1390                 file_segment = re.search(r'^(\d+):(\d+):(\S+)', tok)
1391                 if file_segment:
1392                     pos = long(file_segment.group(1))
1393                     size = long(file_segment.group(2))
1394                     name = file_segment.group(3).replace('\\040', ' ')
1395                     filepath = os.path.join(stream_name, name)
1396                     afile = self.find_or_create(filepath, FILE)
1397                     if isinstance(afile, ArvadosFile):
1398                         afile.add_segment(blocks, pos, size)
1399                     else:
1400                         raise errors.SyntaxError("File %s conflicts with stream of the same name.", filepath)
1401                 else:
1402                     # error!
1403                     raise errors.SyntaxError("Invalid manifest format")
1404
1405             if sep == "\n":
1406                 stream_name = None
1407                 state = STREAM_NAME
1408
1409         self.set_unmodified()
1410
1411
1412 class Subcollection(RichCollectionBase):
1413     """This is a subdirectory within a collection that doesn't have its own API
1414     server record.
1415
1416     It falls under the umbrella of the root collection.
1417
1418     """
1419
1420     def __init__(self, parent):
1421         super(Subcollection, self).__init__(parent)
1422         self.lock = self.root_collection().lock
1423         self._manifest_text = None
1424
1425     def root_collection(self):
1426         return self.parent.root_collection()
1427
1428     def writable(self):
1429         return self.root_collection().writable()
1430
1431     def _my_api(self):
1432         return self.root_collection()._my_api()
1433
1434     def _my_keep(self):
1435         return self.root_collection()._my_keep()
1436
1437     def _my_block_manager(self):
1438         return self.root_collection()._my_block_manager()
1439
1440     def notify(self, event, collection, name, item):
1441         return self.root_collection().notify(event, collection, name, item)
1442
1443     def stream_name(self):
1444         for k, v in self.parent.items():
1445             if v is self:
1446                 return os.path.join(self.parent.stream_name(), k)
1447         return '.'
1448
1449     @synchronized
1450     def clone(self, new_parent):
1451         c = Subcollection(new_parent)
1452         c._clonefrom(self)
1453         return c
1454
1455
1456 class CollectionReader(Collection):
1457     """A read-only collection object.
1458
1459     Initialize from an api collection record locator, a portable data hash of a
1460     manifest, or raw manifest text.  See `Collection` constructor for detailed
1461     options.
1462
1463     """
1464     def __init__(self, manifest_locator_or_text, *args, **kwargs):
1465         self._in_init = True
1466         super(CollectionReader, self).__init__(manifest_locator_or_text, *args, **kwargs)
1467         self._in_init = False
1468
1469         # Forego any locking since it should never change once initialized.
1470         self.lock = NoopLock()
1471
1472         # Backwards compatability with old CollectionReader
1473         # all_streams() and all_files()
1474         self._streams = None
1475
1476     def writable(self):
1477         return self._in_init
1478
1479     def _populate_streams(orig_func):
1480         @functools.wraps(orig_func)
1481         def populate_streams_wrapper(self, *args, **kwargs):
1482             # Defer populating self._streams until needed since it creates a copy of the manifest.
1483             if self._streams is None:
1484                 if self._manifest_text:
1485                     self._streams = [sline.split()
1486                                      for sline in self._manifest_text.split("\n")
1487                                      if sline]
1488                 else:
1489                     self._streams = []
1490             return orig_func(self, *args, **kwargs)
1491         return populate_streams_wrapper
1492
1493     @_populate_streams
1494     def normalize(self):
1495         """Normalize the streams returned by `all_streams`.
1496
1497         This method is kept for backwards compatability and only affects the
1498         behavior of `all_streams()` and `all_files()`
1499
1500         """
1501
1502         # Rearrange streams
1503         streams = {}
1504         for s in self.all_streams():
1505             for f in s.all_files():
1506                 streamname, filename = split(s.name() + "/" + f.name())
1507                 if streamname not in streams:
1508                     streams[streamname] = {}
1509                 if filename not in streams[streamname]:
1510                     streams[streamname][filename] = []
1511                 for r in f.segments:
1512                     streams[streamname][filename].extend(s.locators_and_ranges(r.locator, r.range_size))
1513
1514         self._streams = [normalize_stream(s, streams[s])
1515                          for s in sorted(streams)]
1516     @_populate_streams
1517     def all_streams(self):
1518         return [StreamReader(s, self._my_keep(), num_retries=self.num_retries)
1519                 for s in self._streams]
1520
1521     @_populate_streams
1522     def all_files(self):
1523         for s in self.all_streams():
1524             for f in s.all_files():
1525                 yield f