#!/usr/bin/env python

# collection-merge
#
# Merge two or more collections together.  Can also be used to extract specific
# files from a collection to produce a new collection.
#
# input:
# A collection or collection/file path, or an array of them, in
# script_parameters["input"]
#
# output:
# A manifest with the collections merged.  Duplicate file names will
# have their contents concatenated in the order that they appear in the input
# array.

import arvados
import md5
import crunchutil.subst as subst
import subprocess
import os
import hashlib


def _file_manifest(path):
    """Return the manifest text for a single "collection/[stream/]file" path.

    The text before the first '/' is handed to arvados.CollectionReader, the
    text after the last '/' is the file name, and whatever lies between the
    two is the stream name ("." when there is nothing in between).

    Every stream whose name() equals the requested stream name and which
    contains the file contributes its as_manifest() text, in iteration order.
    If nothing matches, an empty string is returned and no error is raised.
    """
    first = path.find('/')
    last = path.rfind('/')

    stream_name = path[first + 1:last]
    if stream_name == "":
        # "collection/file": the file is looked up in the "." stream.
        stream_name = "."
    file_name = path[last + 1:]

    reader = arvados.CollectionReader(path[0:first])
    pieces = []
    for stream in reader.all_streams():
        # NOTE(review): the comparison is an exact match against
        # stream.name(); confirm whether nested stream names carry a "./"
        # prefix, in which case callers must include it in the path.
        if stream.name() != stream_name:
            continue
        files = stream.files()
        if file_name in files:
            pieces.append(files[file_name].as_manifest())
    return "".join(pieces)


params = arvados.current_job()['script_parameters']

# Accept a bare value as well as an array: iterating a plain string would
# otherwise walk it one character at a time.
inputs = params["input"]
if not isinstance(inputs, (list, tuple)):
    inputs = [inputs]

manifest_parts = []
for item in inputs:
    # Expand substitutions against the job's script parameters before
    # interpreting the value as a collection or collection/file path.
    item = subst.do_substitution(params, item)
    if '/' in item:
        # A path into a collection: extract just that file.
        manifest_parts.append(_file_manifest(item))
    else:
        # A whole collection: take its manifest as-is.
        manifest_parts.append(arvados.CollectionReader(item).manifest_text())

# Parts are joined in input order, so duplicate file names end up
# concatenated in the order they appear in the input array.
arvados.current_task().set_output("".join(manifest_parts))