X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/20143aeb278799fa34d6d99aedcf1a519514aaf2..e6e3c231b08a8f9786ea04c948f79c45604da1cc:/crunch_scripts/collection-merge?ds=sidebyside

diff --git a/crunch_scripts/collection-merge b/crunch_scripts/collection-merge
old mode 100644
new mode 100755
index b377d01398..f16d62466a
--- a/crunch_scripts/collection-merge
+++ b/crunch_scripts/collection-merge
@@ -1,13 +1,57 @@
 #!/usr/bin/env python
 
 import arvados
+import md5
+import subst
+import subprocess
+import os
+import hashlib
 
-inputs = arvados.current_job()['script_parameters']['input']
-if not isinstance(inputs, (list,tuple)):
-    inputs = [inputs]
+p = arvados.current_job()['script_parameters']
 
-out_manifest = ''
-for locator in inputs:
-    out_manifest += arvados.CollectionReader(locator).manifest_text()
+merged = ""
+src = []
+for c in p["input"]:
+    c = subst.do_substitution(p, c)
+    i = c.find('/')
+    if i == -1:
+        src.append(c)
+        merged += arvados.CollectionReader(c).manifest_text()
+    else:
+        src.append(c[0:i])
+        cr = arvados.CollectionReader(c[0:i])
+        j = c.rfind('/')
+        stream = c[i+1:j]
+        if stream == "":
+            stream = "."
+        fn = c[(j+1):]
+        for s in cr.all_streams():
+            if s.name() == stream:
+                if fn in s.files():
+                    merged += s.files()[fn].as_manifest()
 
-arvados.current_task().set_output(Keep.put(out_manifest))
+crm = arvados.CollectionReader(merged)
+
+combined = crm.manifest_text(strip=True)
+
+m = hashlib.new('md5')
+m.update(combined)
+
+uuid = "{}+{}".format(m.hexdigest(), len(combined))
+
+collection = arvados.api().collections().create(
+    body={
+        'uuid': uuid,
+        'manifest_text': crm.manifest_text(),
+    }).execute()
+
+for s in src:
+    l = arvados.api().links().create(body={
+        "link": {
+            "tail_uuid": s,
+            "head_uuid": uuid,
+            "link_class": "provenance",
+            "name": "provided"
+        }}).execute()
+
+arvados.current_task().set_output(uuid)