6218: add performance profiling and a sample test in python sdk.
[arvados.git] / crunch_scripts / rtg-fastq2sdf
1 #!/usr/bin/env python
2
3 import arvados
4 import os
5 import re
6 import sys
7 import pyrtg
8
# Matches the "left arm" file of a paired-end read set: a name ending in
# "_1.fq" or "_1.fastq", optionally followed by ".gz".  Group 1 captures the
# extension so it can be re-attached to the "_2" (right arm) name.  The dots
# are escaped so only a literal "." is accepted (an unescaped "." would also
# match names such as "x_1xfq").
LEFT_ARM_RE = re.compile(r'_1(\.f(?:ast)?q(?:\.gz)?)$')


def find_read_pairs(filenames):
    """Return the (left, right) FASTQ filename pairs found in *filenames*.

    A left-arm file is one whose name matches LEFT_ARM_RE; its right arm is
    the same name with "_1" replaced by "_2" in front of the extension.  A
    pair is reported only when both names are present in *filenames*.

    filenames -- any iterable of file names (consumed exactly once).

    Returns a list of (left, right) tuples ordered by left-arm name, so the
    result does not depend on the order in which the names were supplied.
    """
    # Materialise and sort once: the input may be a one-shot iterator, and
    # os.listdir() makes no promise about ordering.
    names = sorted(filenames)
    available = set(names)
    pairs = []
    for left in names:
        right, substitutions = LEFT_ARM_RE.subn(r'_2\g<1>', left)
        if substitutions and right in available:
            pairs.append((left, right))
    return pairs


def main():
    """Format paired FASTQ input as SDF, split it, and store the result.

    Reads the 'input' script parameter of the current job, extracts its
    'fastq' path, runs pyrtg.run_rtg('format', ...) once per read pair, runs
    pyrtg.run_rtg('sdfsplit', ...) over all formatted directories, and sets
    the resulting directory tree as the current task's output.
    """
    this_job = arvados.current_job()
    this_task = arvados.current_task()
    fastq_path = arvados.util.collection_extract(
        collection=this_job['script_parameters']['input'],
        path='fastq')
    # '.locator' is skipped explicitly; it is not a FASTQ file.
    fastq_files = [name for name in os.listdir(fastq_path)
                   if name != '.locator']
    tmp_dir_base = os.path.join(this_task.tmpdir, 'tmp')
    out_dir = os.path.join(this_task.tmpdir, 'reads')

    # Start from a clean slate: remove both working directories, then
    # recreate only the temporary one (out_dir is passed to sdfsplit below).
    arvados.util.run_command(['rm', '-rf', tmp_dir_base], stderr=sys.stderr)
    arvados.util.run_command(['rm', '-rf', out_dir], stderr=sys.stderr)
    os.mkdir(tmp_dir_base)

    # convert fastq to sdf: one numbered directory per read pair
    tmp_dirs = []
    for leftarm, rightarm in find_read_pairs(fastq_files):
        sdf_dir = '%s/%08d' % (tmp_dir_base, len(tmp_dirs))
        tmp_dirs.append(sdf_dir)
        pyrtg.run_rtg('format', sdf_dir,
                      ['-f', 'fastq',
                       '-q', 'sanger',
                       '-l', os.path.join(fastq_path, leftarm),
                       '-r', os.path.join(fastq_path, rightarm)])

    # split sdf
    # NOTE(review): if no read pairs were found, tmp_dirs is empty and
    # sdfsplit is invoked with no inputs -- confirm that is the intended
    # failure mode.  '-n 1500000' is presumably the per-chunk sequence
    # count; verify against the RTG documentation.
    pyrtg.run_rtg('sdfsplit', out_dir,
                  ['-n', '1500000'] + tmp_dirs)

    # store output
    out = arvados.CollectionWriter()
    out.write_directory_tree(out_dir, max_manifest_depth=1)
    this_task.set_output(out.finish())


if __name__ == '__main__':
    main()