From aa384c3f2d8b7d1782ea059e1eb56c15f542a40c Mon Sep 17 00:00:00 2001
From: Brett Smith
Date: Thu, 29 May 2014 13:02:14 -0400
Subject: [PATCH] 2752: Don't duplicate arv-put work after resume.

This change serializes the command-line arguments that we've actually
processed, vs. not. That allows us to safely iterate over them again
to upload any files that we hadn't started, while skipping the ones
we've already done.
---
 sdk/python/arvados/commands/put.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/sdk/python/arvados/commands/put.py b/sdk/python/arvados/commands/put.py
index 1ccf786114..44f911e60b 100644
--- a/sdk/python/arvados/commands/put.py
+++ b/sdk/python/arvados/commands/put.py
@@ -221,10 +221,11 @@ class ResumeCache(object):
 
 class ArvPutCollectionWriter(arvados.ResumableCollectionWriter):
     STATE_PROPS = (arvados.ResumableCollectionWriter.STATE_PROPS +
-                   ['bytes_written'])
+                   ['bytes_written', '_seen_inputs'])
 
     def __init__(self, cache=None, reporter=None, bytes_expected=None):
         self.bytes_written = 0
+        self._seen_inputs = []
         self.cache = cache
         if reporter is None:
             self.report_progress = lambda bytes_w, bytes_e: None
@@ -267,6 +268,25 @@ class ArvPutCollectionWriter(arvados.ResumableCollectionWriter):
         self.bytes_written += (bytes_buffered - self._data_buffer_len)
         self.report_progress(self.bytes_written, self.bytes_expected)
 
+    def _record_new_input(self, input_type, source_name, dest_name):
+        # The key needs to be a list because that's what we'll get back
+        # from JSON deserialization.
+        key = [input_type, source_name, dest_name]
+        if key in self._seen_inputs:
+            return False
+        self._seen_inputs.append(key)
+        return True
+
+    def write_file(self, source, filename=None):
+        if self._record_new_input('file', source, filename):
+            super(ArvPutCollectionWriter, self).write_file(source, filename)
+
+    def write_directory_tree(self,
+                             path, stream_name='.', max_manifest_depth=-1):
+        if self._record_new_input('directory', path, stream_name):
+            super(ArvPutCollectionWriter, self).write_directory_tree(
+                path, stream_name, max_manifest_depth)
+
 
 def expected_bytes_for(pathlist):
     # Walk the given directory trees and stat files, adding up file sizes,
-- 
2.30.2