From f248a1ad098f5fe877acf79db9b43665103a14de Mon Sep 17 00:00:00 2001
From: Lucas Di Pentima
Date: Fri, 6 Aug 2021 16:55:16 -0300
Subject: [PATCH] 17948: Optimizes tool to stop adding data to a big enough manifest.

Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima
---
Review note: create_substreams() now threads a running manifest size
(current_size, in characters of manifest text) through the recursion and
stops generating sibling streams once it reaches 128 MiB.  The loop's size
check originally read `current_size >= (128 * 1024 * 1024) == 0`, which
Python evaluates as a chained comparison
(`current_size >= X and X == 0`) and is therefore always false, so the
`break` could never fire.  The stray `== 0` has been removed.

 .../test-collection-create.py                 | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/tools/test-collection-create/test-collection-create.py b/tools/test-collection-create/test-collection-create.py
index ddd5d04ba3..12f9ebe025 100644
--- a/tools/test-collection-create/test-collection-create.py
+++ b/tools/test-collection-create/test-collection-create.py
@@ -60,15 +60,24 @@ def get_stream(name, max_filesize, data_loc, args):
     stream = "{} {} {}".format(name, data_loc, ' '.join(files))
     return stream
 
-def create_substreams(depth, base_stream_name, max_filesize, data_loc, args):
-    streams = [get_stream(base_stream_name, max_filesize, data_loc, args)]
-    if depth == 0:
-        logger.info("Finished stream {}".format(base_stream_name))
+def create_substreams(depth, base_stream_name, max_filesize, data_loc, args, current_size=0):
+    current_stream = get_stream(base_stream_name, max_filesize, data_loc, args)
+    current_size += len(current_stream)
+    streams = [current_stream]
+
+    if current_size >= (128 * 1024 * 1024):
+        logger.debug("Maximum manifest size reached -- finishing early at {}".format(base_stream_name))
+    elif depth == 0:
+        logger.debug("Finished stream {}".format(base_stream_name))
     else:
         for _ in range(random.randint(1, 10)):
             stream_name = base_stream_name+'/'+get_random_name(False)
-            streams.extend(
-                create_substreams(depth-1, stream_name, max_filesize, data_loc, args))
+            substreams = create_substreams(depth-1, stream_name, max_filesize,
+                data_loc, args, current_size)
+            current_size += sum([len(x) for x in substreams])
+            if current_size >= (128 * 1024 * 1024):
+                break
+            streams.extend(substreams)
     return streams
 
 def parse_arguments(arguments):
@@ -85,6 +94,7 @@ def parse_arguments(arguments):
 
 def main(arguments=None):
     args = parse_arguments(arguments)
+    logger.info("Creating test collection with (min={}, max={}) files per directory and a tree depth of (min={}, max={})...".format(args.min_files, args.max_files, args.min_depth, args.max_depth))
     api = arvados.api('v1', timeout=5*60)
     max_filesize = 1024*1024
     data_block = ''.join([random.choice(string.printable) for i in range(max_filesize)])
-- 
2.30.2