Merge branch 'master' into 11840-unique-constraint-untrash-coll
[arvados.git] / crunch_scripts / collection-merge
old mode 100644 (file)
new mode 100755 (executable)
index 50e90f7..ca80a82
@@ -1,13 +1,46 @@
 #!/usr/bin/env python
 
+# collection-merge
+#
+# Merge two or more collections together.  Can also be used to extract specific
+# files from a collection to produce a new collection.
+#
+# input:
+# An array of collections or collection/file paths in script_parameters["input"]
+#
+# output:
+# A manifest with the collections merged.  Duplicate file names will
+# have their contents concatenated in the order that they appear in the input
+# array.
+
 import arvados
+import md5
+import crunchutil.subst as subst
+import subprocess
+import os
+import hashlib
 
-inputs = arvados.current_job()['script_parameters']['input']
-if not isinstance(inputs, (list,tuple)):
-    inputs = [inputs]
+p = arvados.current_job()['script_parameters']
 
-out_manifest = ''
-for locator in inputs:
-    out_manifest += arvados.CollectionReader(locator).manifest_text()
+merged = ""  # manifest text accumulated across all inputs; passed to set_output() at the end
+src = []  # collection part of each input entry, in input order (only appended to in the code shown)
+for c in p["input"]:  # each entry is either "<collection>" or "<collection>/<path>/<file>"
+    c = subst.do_substitution(p, c)  # presumably expands parameter references in the entry -- confirm in crunchutil.subst
+    i = c.find('/')  # index of the first '/', or -1 when the entry has no path part
+    if i == -1:
+        src.append(c)
+        merged += arvados.CollectionReader(c).manifest_text()  # no path given: take the collection's whole manifest
+    else:
+        src.append(c[0:i])  # everything before the first '/' identifies the collection
+        cr = arvados.CollectionReader(c[0:i])
+        j = c.rfind('/')  # last '/': splits the stream path from the file name
+        stream = c[i+1:j]  # text between the first and last '/'; empty when they are the same slash
+        if stream == "":
+            stream = "."  # no directory part: look in the stream named "." (presumably the top-level stream -- confirm)
+        fn = c[(j+1):]  # file name: everything after the last '/'
+        for s in cr.all_streams():  # linear scan; an entry that matches no stream/file adds nothing and raises no error
+            if s.name() == stream:  # NOTE(review): compared verbatim -- confirm whether s.name() carries a "./" prefix for subdirectories
+                if fn in s.files():
+                    merged += s.files()[fn].as_manifest()  # append only the selected file's manifest text
 
-arvados.current_task().set_output(arvados.Keep.put(out_manifest))
+arvados.current_task().set_output(merged)