Merge branch '3147-py-collection-retries-wip2'
[arvados.git] / sdk/python/arvados/collection.py
import gflags
import httplib
import httplib2
import logging
import os
import pprint
import sys
import types
import subprocess
import json
import UserDict
import re
import hashlib
import string
import bz2
import zlib
import fcntl
import time
import threading

from collections import deque
from stat import *

from keep import *
from stream import *
import config
import errors
import util

_logger = logging.getLogger('arvados.collection')

def normalize_stream(s, stream):
    stream_tokens = [s]
    sortedfiles = list(stream.keys())
    sortedfiles.sort()

    blocks = {}
    streamoffset = 0L
    for f in sortedfiles:
        for b in stream[f]:
            if b[arvados.LOCATOR] not in blocks:
                stream_tokens.append(b[arvados.LOCATOR])
                blocks[b[arvados.LOCATOR]] = streamoffset
                streamoffset += b[arvados.BLOCKSIZE]

    if len(stream_tokens) == 1:
        stream_tokens.append(config.EMPTY_BLOCK_LOCATOR)

    for f in sortedfiles:
        current_span = None
        fout = f.replace(' ', '\\040')
        for segment in stream[f]:
            segmentoffset = blocks[segment[arvados.LOCATOR]] + segment[arvados.OFFSET]
            if current_span is None:
                current_span = [segmentoffset, segmentoffset + segment[arvados.SEGMENTSIZE]]
            else:
                if segmentoffset == current_span[1]:
                    current_span[1] += segment[arvados.SEGMENTSIZE]
                else:
                    stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))
                    current_span = [segmentoffset, segmentoffset + segment[arvados.SEGMENTSIZE]]

        if current_span is not None:
            stream_tokens.append("{0}:{1}:{2}".format(current_span[0], current_span[1] - current_span[0], fout))

        if len(stream[f]) == 0:
            stream_tokens.append("0:0:{0}".format(fout))

    return stream_tokens

def normalize(collection):
    streams = {}
    for s in collection.all_streams():
        for f in s.all_files():
            filestream = s.name() + "/" + f.name()
            r = filestream.rindex("/")
            streamname = filestream[:r]
            filename = filestream[r+1:]
            if streamname not in streams:
                streams[streamname] = {}
            if filename not in streams[streamname]:
                streams[streamname][filename] = []
            for r in f.segments:
                streams[streamname][filename].extend(s.locators_and_ranges(r[0], r[1]))

    normalized_streams = []
    sortedstreams = list(streams.keys())
    sortedstreams.sort()
    for s in sortedstreams:
        normalized_streams.append(normalize_stream(s, streams[s]))
    return normalized_streams

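# For illustration, assuming the segment field indexes defined in stream.py
# (LOCATOR=0, BLOCKSIZE=1, OFFSET=2, SEGMENTSIZE=3) and a hypothetical
# one-block, three-byte file:
#
#   normalize_stream('.', {'foo.txt': [['acbd18db4cc2f85cedef654fccc4a4d8+3', 3, 0, 3]]})
#   # -> ['.', 'acbd18db4cc2f85cedef654fccc4a4d8+3', '0:3:foo.txt']
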

class CollectionBase(object):
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        pass

    def _my_keep(self):
        if self._keep_client is None:
            self._keep_client = KeepClient(api_client=self._api_client,
                                           num_retries=self.num_retries)
        return self._keep_client


class CollectionReader(CollectionBase):
    def __init__(self, manifest_locator_or_text, api_client=None,
                 keep_client=None, num_retries=0):
        """Instantiate a CollectionReader.

        This class parses a Collection manifest to provide a simple interface
        for reading its underlying files.

        Arguments:
        * manifest_locator_or_text: One of a Collection UUID, portable data
          hash, or full manifest text.
        * api_client: The API client to use to look up Collections.  If not
          provided, CollectionReader will build one from available Arvados
          configuration.
        * keep_client: The KeepClient to use to download Collection data.
          If not provided, CollectionReader will build one from available
          Arvados configuration.
        * num_retries: The default number of times to retry failed
          service requests.  Default 0.  You may change this value
          after instantiation, but note those changes may not
          propagate to related objects like the Keep client.
        """
        self._api_client = api_client
        self._keep_client = keep_client
        self.num_retries = num_retries
        if re.match(r'[a-f0-9]{32}(\+\d+)?(\+\S+)*$', manifest_locator_or_text):
            self._manifest_locator = manifest_locator_or_text
            self._manifest_text = None
        elif re.match(r'[a-z0-9]{5}-[a-z0-9]{5}-[a-z0-9]{15}$', manifest_locator_or_text):
            self._manifest_locator = manifest_locator_or_text
            self._manifest_text = None
        elif re.match(r'((\S+)( +[a-f0-9]{32}(\+\d+)(\+\S+)*)+( +\d+:\d+:\S+)+$)+', manifest_locator_or_text, re.MULTILINE):
            self._manifest_text = manifest_locator_or_text
            self._manifest_locator = None
        else:
            raise errors.ArgumentError(
                "Argument to CollectionReader must be a manifest or a collection UUID")
        self._streams = None

    def _populate(self):
        if self._streams is not None:
            return
        if not self._manifest_text:
            try:
                # As in KeepClient itself, we must wait until the last possible
                # moment to instantiate an API client, in order to avoid
                # tripping up clients that don't have access to an API server.
                # If we do build one, make sure our Keep client uses it.
                # If instantiation fails, we'll fall back to the except clause,
                # just like any other Collection lookup failure.
                if self._api_client is None:
                    self._api_client = arvados.api('v1')
                    self._keep_client = None  # Make a new one with the new api.
                c = self._api_client.collections().get(
                    uuid=self._manifest_locator).execute(
                    num_retries=self.num_retries)
                self._manifest_text = c['manifest_text']
            except Exception as e:
                if not util.portable_data_hash_pattern.match(
                      self._manifest_locator):
                    raise
                _logger.warning(
                    "API server did not return Collection '%s'. " +
                    "Trying to fetch directly from Keep (deprecated).",
                    self._manifest_locator)
                self._manifest_text = self._my_keep().get(
                    self._manifest_locator, num_retries=self.num_retries)
        self._streams = [sline.split()
                         for sline in self._manifest_text.split("\n")
                         if sline]
        self._streams = normalize(self)

        # Now regenerate the manifest text based on the normalized streams.
        self._manifest_text = ''.join([StreamReader(stream, keep=self._my_keep()).manifest_text() for stream in self._streams])


    def all_streams(self):
        self._populate()
        return [StreamReader(s, self._my_keep(), num_retries=self.num_retries)
                for s in self._streams]

    def all_files(self):
        for s in self.all_streams():
            for f in s.all_files():
                yield f

    def manifest_text(self, strip=False):
        self._populate()
        if strip:
            m = ''.join([StreamReader(stream, keep=self._my_keep()).manifest_text(strip=True) for stream in self._streams])
            return m
        else:
            return self._manifest_text


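# A minimal CollectionReader usage sketch (illustrative only; the portable data
# hash below is hypothetical):
#
#   reader = CollectionReader('d41d8cd98f00b204e9800998ecf8427e+0')
#   print reader.manifest_text()
#   for f in reader.all_files():
#       print f.name()
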
class CollectionWriter(CollectionBase):
    KEEP_BLOCK_SIZE = 2**26
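    # 2**26 bytes = 64 MiB: data is buffered and written to Keep in blocks of
    # at most this size.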

    def __init__(self, api_client=None, num_retries=0):
        """Instantiate a CollectionWriter.

        CollectionWriter lets you build a new Arvados Collection from scratch.
        Write files to it.  The CollectionWriter will upload data to Keep as
        appropriate, and provide you with the Collection manifest text when
        you're finished.

        Arguments:
        * api_client: The API client to use to look up Collections.  If not
          provided, CollectionWriter will build one from available Arvados
          configuration.
        * num_retries: The default number of times to retry failed
          service requests.  Default 0.  You may change this value
          after instantiation, but note those changes may not
          propagate to related objects like the Keep client.
        """
        self._api_client = api_client
        self.num_retries = num_retries
        self._keep_client = None
        self._data_buffer = []
        self._data_buffer_len = 0
        self._current_stream_files = []
        self._current_stream_length = 0
        self._current_stream_locators = []
        self._current_stream_name = '.'
        self._current_file_name = None
        self._current_file_pos = 0
        self._finished_streams = []
        self._close_file = None
        self._queued_file = None
        self._queued_dirents = deque()
        self._queued_trees = deque()

    def __exit__(self, exc_type, exc_value, traceback):
        self.finish()

    def do_queued_work(self):
        # The work queue consists of three pieces:
        # * _queued_file: The file object we're currently writing to the
        #   Collection.
        # * _queued_dirents: Entries under the current directory
        #   (_queued_trees[0]) that we want to write or recurse through.
        #   This may contain files from subdirectories if
        #   max_manifest_depth == 0 for this directory.
        # * _queued_trees: Directories that should be written as separate
        #   streams to the Collection.
        # This function handles the smallest piece of work currently queued
        # (current file, then current directory, then next directory) until
        # no work remains.  The _work_THING methods each do a unit of work on
        # THING.  _queue_THING methods add a THING to the work queue.
        while True:
            if self._queued_file:
                self._work_file()
            elif self._queued_dirents:
                self._work_dirents()
            elif self._queued_trees:
                self._work_trees()
            else:
                break

    def _work_file(self):
        while True:
            buf = self._queued_file.read(self.KEEP_BLOCK_SIZE)
            if not buf:
                break
            self.write(buf)
        self.finish_current_file()
        if self._close_file:
            self._queued_file.close()
        self._close_file = None
        self._queued_file = None

    def _work_dirents(self):
        path, stream_name, max_manifest_depth = self._queued_trees[0]
        if stream_name != self.current_stream_name():
            self.start_new_stream(stream_name)
        while self._queued_dirents:
            dirent = self._queued_dirents.popleft()
            target = os.path.join(path, dirent)
            if os.path.isdir(target):
                self._queue_tree(target,
                                 os.path.join(stream_name, dirent),
                                 max_manifest_depth - 1)
            else:
                self._queue_file(target, dirent)
                break
        if not self._queued_dirents:
            self._queued_trees.popleft()

    def _work_trees(self):
        path, stream_name, max_manifest_depth = self._queued_trees[0]
        make_dirents = (util.listdir_recursive if (max_manifest_depth == 0)
                        else os.listdir)
        d = make_dirents(path)
        if len(d) > 0:
            self._queue_dirents(stream_name, d)
        else:
            self._queued_trees.popleft()

    def _queue_file(self, source, filename=None):
        assert (self._queued_file is None), "tried to queue more than one file"
        if not hasattr(source, 'read'):
            source = open(source, 'rb')
            self._close_file = True
        else:
            self._close_file = False
        if filename is None:
            filename = os.path.basename(source.name)
        self.start_new_file(filename)
        self._queued_file = source

    def _queue_dirents(self, stream_name, dirents):
        assert (not self._queued_dirents), "tried to queue more than one tree"
        self._queued_dirents = deque(sorted(dirents))

    def _queue_tree(self, path, stream_name, max_manifest_depth):
        self._queued_trees.append((path, stream_name, max_manifest_depth))

    def write_file(self, source, filename=None):
        self._queue_file(source, filename)
        self.do_queued_work()

    def write_directory_tree(self,
                             path, stream_name='.', max_manifest_depth=-1):
        self._queue_tree(path, stream_name, max_manifest_depth)
        self.do_queued_work()

    def write(self, newdata):
        if hasattr(newdata, '__iter__'):
            for s in newdata:
                self.write(s)
            return
        self._data_buffer.append(newdata)
        self._data_buffer_len += len(newdata)
        self._current_stream_length += len(newdata)
        while self._data_buffer_len >= self.KEEP_BLOCK_SIZE:
            self.flush_data()

    def flush_data(self):
        data_buffer = ''.join(self._data_buffer)
        if data_buffer:
            self._current_stream_locators.append(
                self._my_keep().put(data_buffer[0:self.KEEP_BLOCK_SIZE]))
            self._data_buffer = [data_buffer[self.KEEP_BLOCK_SIZE:]]
            self._data_buffer_len = len(self._data_buffer[0])

    def start_new_file(self, newfilename=None):
        self.finish_current_file()
        self.set_current_file_name(newfilename)

    def set_current_file_name(self, newfilename):
        if re.search(r'[\t\n]', newfilename):
            raise errors.AssertionError(
                "Manifest filenames cannot contain tab or newline characters: %s" %
                newfilename)
        self._current_file_name = newfilename

    def current_file_name(self):
        return self._current_file_name

    def finish_current_file(self):
        if self._current_file_name is None:
            if self._current_file_pos == self._current_stream_length:
                return
            raise errors.AssertionError(
                "Cannot finish an unnamed file " +
                "(%d bytes at offset %d in '%s' stream)" %
                (self._current_stream_length - self._current_file_pos,
                 self._current_file_pos,
                 self._current_stream_name))
        self._current_stream_files.append([
                self._current_file_pos,
                self._current_stream_length - self._current_file_pos,
                self._current_file_name])
        self._current_file_pos = self._current_stream_length

    def start_new_stream(self, newstreamname='.'):
        self.finish_current_stream()
        self.set_current_stream_name(newstreamname)

    def set_current_stream_name(self, newstreamname):
        if re.search(r'[\t\n]', newstreamname):
            raise errors.AssertionError(
                "Manifest stream names cannot contain tab or newline characters")
        self._current_stream_name = '.' if newstreamname == '' else newstreamname

    def current_stream_name(self):
        return self._current_stream_name

    def finish_current_stream(self):
        self.finish_current_file()
        self.flush_data()
        if not self._current_stream_files:
            pass
        elif self._current_stream_name is None:
            raise errors.AssertionError(
                "Cannot finish an unnamed stream (%d bytes in %d files)" %
                (self._current_stream_length, len(self._current_stream_files)))
        else:
            if not self._current_stream_locators:
                self._current_stream_locators.append(config.EMPTY_BLOCK_LOCATOR)
            self._finished_streams.append([self._current_stream_name,
                                           self._current_stream_locators,
                                           self._current_stream_files])
        self._current_stream_files = []
        self._current_stream_length = 0
        self._current_stream_locators = []
        self._current_stream_name = None
        self._current_file_pos = 0
        self._current_file_name = None

    def finish(self):
        # Store the manifest in Keep and return its locator.
        return self._my_keep().put(self.manifest_text())

    def stripped_manifest(self):
        """
        Return the manifest for the current collection with all permission
        hints removed from the locators in the manifest.
        """
        raw = self.manifest_text()
        clean = ''
        for line in raw.split("\n"):
            fields = line.split()
            if len(fields) > 0:
                locators = [ re.sub(r'\+A[a-z0-9@_-]+', '', x)
                             for x in fields[1:-1] ]
                clean += fields[0] + ' ' + ' '.join(locators) + ' ' + fields[-1] + "\n"
        return clean

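    # For illustration: a signed locator such as
    # "acbd18db4cc2f85cedef654fccc4a4d8+3+A1f2e3d@53bed294" (hypothetical +A
    # permission hint) is reduced to "acbd18db4cc2f85cedef654fccc4a4d8+3" by
    # the substitution above.
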
    def manifest_text(self):
        self.finish_current_stream()
        manifest = ''

        for stream in self._finished_streams:
            if not re.search(r'^\.(/.*)?$', stream[0]):
                manifest += './'
            manifest += stream[0].replace(' ', '\\040')
            manifest += ' ' + ' '.join(stream[1])
            manifest += ' ' + ' '.join("%d:%d:%s" % (sfile[0], sfile[1], sfile[2].replace(' ', '\\040')) for sfile in stream[2])
            manifest += "\n"

        if manifest:
            return CollectionReader(manifest, self._api_client).manifest_text()
        else:
            return ""

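    # Each manifest line built above has the form
    #   <stream name> <block locator> ... <offset:size:filename> ...
    # For example (hypothetical locator):
    #   . acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo.txt
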
    def data_locators(self):
        ret = []
        for name, locators, files in self._finished_streams:
            ret += locators
        return ret


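# A minimal CollectionWriter usage sketch (illustrative only; the file path and
# names are hypothetical):
#
#   cw = CollectionWriter()
#   cw.start_new_file('hello.txt')
#   cw.write('hello world\n')
#   cw.write_file('/tmp/example.txt', 'example.txt')
#   manifest_locator = cw.finish()    # stores the manifest text in Keep
#   print cw.manifest_text()
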
class ResumableCollectionWriter(CollectionWriter):
    STATE_PROPS = ['_current_stream_files', '_current_stream_length',
                   '_current_stream_locators', '_current_stream_name',
                   '_current_file_name', '_current_file_pos', '_close_file',
                   '_data_buffer', '_dependencies', '_finished_streams',
                   '_queued_dirents', '_queued_trees']

    def __init__(self, api_client=None, num_retries=0):
        self._dependencies = {}
        super(ResumableCollectionWriter, self).__init__(
            api_client, num_retries=num_retries)

    @classmethod
    def from_state(cls, state, *init_args, **init_kwargs):
        # Try to build a new writer from scratch with the given state.
        # If the state is not suitable to resume (because files have changed,
        # been deleted, aren't predictable, etc.), raise a
        # StaleWriterStateError.  Otherwise, return the initialized writer.
        # The caller is responsible for calling writer.do_queued_work()
        # appropriately after it's returned.
        writer = cls(*init_args, **init_kwargs)
        for attr_name in cls.STATE_PROPS:
            attr_value = state[attr_name]
            attr_class = getattr(writer, attr_name).__class__
            # Coerce the value into the same type as the initial value, if
            # needed.
            if attr_class not in (type(None), attr_value.__class__):
                attr_value = attr_class(attr_value)
            setattr(writer, attr_name, attr_value)
        # Check dependencies before we try to resume anything.
        if any(KeepLocator(ls).permission_expired()
               for ls in writer._current_stream_locators):
            raise errors.StaleWriterStateError(
                "locators include expired permission hint")
        writer.check_dependencies()
        if state['_current_file'] is not None:
            path, pos = state['_current_file']
            try:
                writer._queued_file = open(path, 'rb')
                writer._queued_file.seek(pos)
            except IOError as error:
                raise errors.StaleWriterStateError(
                    "failed to reopen active file {}: {}".format(path, error))
        return writer

    def check_dependencies(self):
        for path, orig_stat in self._dependencies.items():
            if not S_ISREG(orig_stat[ST_MODE]):
                raise errors.StaleWriterStateError("{} not file".format(path))
            try:
                now_stat = tuple(os.stat(path))
            except OSError as error:
                raise errors.StaleWriterStateError(
                    "failed to stat {}: {}".format(path, error))
            if ((not S_ISREG(now_stat[ST_MODE])) or
                (orig_stat[ST_MTIME] != now_stat[ST_MTIME]) or
                (orig_stat[ST_SIZE] != now_stat[ST_SIZE])):
                raise errors.StaleWriterStateError("{} changed".format(path))

    def dump_state(self, copy_func=lambda x: x):
        state = {attr: copy_func(getattr(self, attr))
                 for attr in self.STATE_PROPS}
        if self._queued_file is None:
            state['_current_file'] = None
        else:
            state['_current_file'] = (os.path.realpath(self._queued_file.name),
                                      self._queued_file.tell())
        return state

    def _queue_file(self, source, filename=None):
        try:
            src_path = os.path.realpath(source)
        except Exception:
            raise errors.AssertionError("{} not a file path".format(source))
        try:
            path_stat = os.stat(src_path)
        except OSError as stat_error:
            path_stat = None
        super(ResumableCollectionWriter, self)._queue_file(source, filename)
        fd_stat = os.fstat(self._queued_file.fileno())
        if not S_ISREG(fd_stat.st_mode):
            # We won't be able to resume from this cache anyway, so don't
            # worry about further checks.
            self._dependencies[source] = tuple(fd_stat)
        elif path_stat is None:
            raise errors.AssertionError(
                "could not stat {}: {}".format(source, stat_error))
        elif path_stat.st_ino != fd_stat.st_ino:
            raise errors.AssertionError(
                "{} changed between open and stat calls".format(source))
        else:
            self._dependencies[src_path] = tuple(fd_stat)

    def write(self, data):
        if self._queued_file is None:
            raise errors.AssertionError(
                "resumable writer can't accept unsourced data")
        return super(ResumableCollectionWriter, self).write(data)
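

# A resumable-upload sketch (illustrative only; real callers such as arv-put
# persist the state between runs):
#
#   writer = ResumableCollectionWriter()
#   writer.write_file('/tmp/example.txt', 'example.txt')
#   state = writer.dump_state()
#   # ...later, possibly after an interruption...
#   writer = ResumableCollectionWriter.from_state(state)
#   writer.do_queued_work()
#   print writer.manifest_text()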