Merge branch '3373-improve-gatk3-snv-pipeline' closes #3373
[arvados.git] / crunch_scripts / decompress-all.py
1 #!/usr/bin/env python
2
3 #
4 # decompress-all.py
5 #
6 # Decompress all compressed files in the collection using the "dtrx" tool and
7 # produce a new collection with the contents.  Uncompressed files
8 # are passed through.
9 #
10 # input:
11 # A collection at script_parameters["input"]
12 #
13 # output:
14 # A manifest of the uncompressed contents of the input collection.
15
16 import arvados
17 import re
18 import subprocess
19 import os
20 import sys
21
# Shape of a task input path:
#   <md5>+<size>[+hint...][/stream/path]/<filename>
# Hints stop at the first "/" so they cannot swallow part of the stream path.
KEEP_PATH_RE = re.compile(
    r"^([a-f0-9]{32}\+\d+)(\+[^/\s]+)*(/.*)?(/[^/]+)$")

# File name suffixes that are handed to dtrx for extraction.
ARCHIVE_SUFFIX_RE = re.compile(
    r'\.(gz|Z|bz2|tgz|tbz|zip|rar|7z|cab|deb|rpm|cpio|gem)$', re.IGNORECASE)


def parse_keep_path(input_file):
    """Split a task input path into (locator, stream name, file name).

    The locator is the leading "<md5>+<size>" part, without any hints.
    The stream name is the directory portion between the locator and the
    file name; it is "." when the file sits directly under the locator.

    Returns None when input_file does not have the expected shape.
    """
    parts = KEEP_PATH_RE.match(input_file)
    if parts is None:
        return None
    stream_part = parts.group(3)
    # The stream group is optional, so it must be checked before slicing
    # off its leading "/".
    streamname = '.' if stream_part is None else stream_part[1:]
    return parts.group(1), streamname, parts.group(4)[1:]


def is_archive(path):
    """Return True if path ends with one of the known archive suffixes."""
    # search(), not match(): the suffix sits at the end of the path, and
    # match() only ever looks at the beginning of the string.
    return ARCHIVE_SUFFIX_RE.search(path) is not None


def main():
    """Run one decompress task.

    Queues one task per input file, then, for the current task, either
    extracts the input with dtrx and stores the resulting directory tree
    as the task output, or passes an uncompressed file through by emitting
    its manifest text under the original stream name.

    Exits with status 1 if the input path cannot be parsed, or with the
    dtrx exit status if extraction fails.
    """
    arvados.job_setup.one_task_per_input_file(if_sequence=0,
                                              and_end_task=True,
                                              input_as_path=True)

    task = arvados.current_task()
    input_file = task['parameters']['input']

    # Validate the input before creating anything on disk.
    parsed = parse_keep_path(input_file)
    if parsed is None:
        message = ("Failed to parse input filename '%s' as a Keep file\n"
                   % input_file)
        # The extra "\n" reproduces the newline the print statement added.
        sys.stderr.write(message + "\n")
        sys.exit(1)
    locator, streamname, filename = parsed

    outdir = os.path.join(task.tmpdir, "output")
    os.makedirs(outdir)
    os.chdir(outdir)

    cr = arvados.CollectionReader(locator)

    # Recreate the stream's directory inside outdir and work from there, so
    # that (presumably) whatever dtrx extracts keeps its original location.
    if not os.path.isdir(streamname):
        os.makedirs(streamname)
    os.chdir(streamname)

    mount_path = arvados.get_task_param_mount('input')

    if is_archive(mount_path):
        rc = subprocess.call(["dtrx", "-r", "-n", "-q", mount_path])
        if rc != 0:
            # Propagate the extractor's failure as this task's exit status.
            sys.exit(rc)
        out = arvados.CollectionWriter()
        out.write_directory_tree(outdir, max_manifest_depth=0)
        task.set_output(out.finish())
    else:
        # Not an archive: pass the file through unchanged.
        matching = [s for s in cr.all_streams() if s.name() == streamname]
        filereader = matching[0].files()[filename]
        # Drop the first character of the file's own manifest text and put
        # this task's stream name in its place.
        task.set_output(streamname + filereader.as_manifest()[1:])


if __name__ == "__main__":
    main()