X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/92f63fe18f3b6d8e4ee589e7a962d39ed4754e9e..d01477c59395e6d0895fffe0f60cce6bda9bb083:/crunch_scripts/decompress-all.py diff --git a/crunch_scripts/decompress-all.py b/crunch_scripts/decompress-all.py index a3858d279d..50d11f4d97 100755 --- a/crunch_scripts/decompress-all.py +++ b/crunch_scripts/decompress-all.py @@ -1,40 +1,61 @@ #!/usr/bin/env python +# +# decompress-all.py +# +# Decompress all compressed files in the collection using the "dtrx" tool and +# produce a new collection with the contents. Uncompressed files +# are passed through. +# +# input: +# A collection at script_parameters["input"] +# +# output: +# A manifest of the uncompressed contents of the input collection. + import arvados import re import subprocess +import os +import sys +import crunchutil.robust_put as robust_put arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True, input_as_path=True) task = arvados.current_task() -input_file = arvados.gettaskparam('input') +input_file = task['parameters']['input'] -result = re.match(r"(^[a-f0-9]{32}\+\d+)(\+\S+)*(/.*)(/.*)?$", input_file) +infile_parts = re.match(r"(^[a-f0-9]{32}\+\d+)(\+\S+)*(/.*)?(/[^/]+)$", input_file) outdir = os.path.join(task.tmpdir, "output") -os.mkdirs(outdir) +os.makedirs(outdir) os.chdir(outdir) -if result != None: - cr = arvados.CollectionReader(re.group(1)) - streamname = '.' - if re.group(3) != None: - streamname += re.group(2) - filename = re.group(3)[1:] - else: - filename = re.group(2)[1:] +if infile_parts is None: + print >>sys.stderr, "Failed to parse input filename '%s' as a Keep file\n" % input_file + sys.exit(1) + +cr = arvados.CollectionReader(infile_parts.group(1)) +streamname = infile_parts.group(3)[1:] +filename = infile_parts.group(4)[1:] - os.mkdirs(streamname) +if streamname is not None: + subprocess.call(["mkdir", "-p", streamname]) os.chdir(streamname) - streamreader = filter(lambda s: s.name() == streamname, cr.all_streams())[0] - filereader = stream.files()[filename] - rc = subprocess.call("dtrx", "-r", "-n", arvados.get_task_param_mount('input')) +else: + streamname = '.' + +m = re.match(r'.*\.(gz|Z|bz2|tgz|tbz|zip|rar|7z|cab|deb|rpm|cpio|gem)$', arvados.get_task_param_mount('input'), re.IGNORECASE) + +if m is not None: + rc = subprocess.call(["dtrx", "-r", "-n", "-q", arvados.get_task_param_mount('input')]) if rc == 0: - out.write_directory_tree(outdir, max_manifest_depth=0) - arvados.task_set_output(out.finish()) + task.set_output(robust_put.upload(outdir)) else: - arvados.task_set_output(streamname + filereader.as_manifest()[1:]) + sys.exit(rc) else: - sys.exit(1) + streamreader = filter(lambda s: s.name() == streamname, cr.all_streams())[0] + filereader = streamreader.files()[filename] + task.set_output(streamname + filereader.as_manifest()[1:])