13924: Add index for PDH lookups.
[arvados.git] / crunch_scripts / rtg-fastq2sdf
#!/usr/bin/env python
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# Crunch task script: convert paired-end FASTQ files into RTG SDF format.
#
# Steps:
#   1. Extract the 'fastq' path of the job's 'input' collection.
#   2. For every "<name>_1.<ext>" file that has a matching "<name>_2.<ext>"
#      sibling, run `rtg format` on the pair into its own temporary directory.
#   3. Run `rtg sdfsplit` over all of those directories into a single
#      'reads' directory.
#   4. Store the 'reads' directory tree as this task's output.

import arvados
import os
import re
import sys
import pyrtg

# Matches the left-arm file of a read pair: a name ending in "_1.fq" or
# "_1.fastq", optionally followed by ".gz".  The dots are escaped so that
# only a literal "." is accepted.  Group 1 captures the extension so it can
# be reused when deriving the right-arm ("_2") file name.
LEFT_ARM_RE = re.compile(r'_1(\.f(?:ast)?q(?:\.gz)?)$')

this_job = arvados.current_job()
this_task = arvados.current_task()
fastq_path = arvados.util.collection_extract(
    collection=this_job['script_parameters']['input'],
    path='fastq')

# A real list (not a lazy filter object) is required here: it is iterated
# once by the loop below and also probed with `in` on every iteration.
# The '.locator' entry is skipped; it is not one of the FASTQ inputs.
fastq_files = [f for f in os.listdir(fastq_path) if f != '.locator']

tmp_dir_base = os.path.join(this_task.tmpdir, 'tmp')
out_dir = os.path.join(this_task.tmpdir, 'reads')

# Start from a clean slate: remove leftovers from any earlier run in the
# same task tmpdir.  out_dir is removed but not recreated here; only the
# temporary base directory is created explicitly.
arvados.util.run_command(['rm', '-rf', tmp_dir_base], stderr=sys.stderr)
arvados.util.run_command(['rm', '-rf', out_dir], stderr=sys.stderr)
os.mkdir(tmp_dir_base)

# convert fastq to sdf
tmp_dirs = []
for leftarm in fastq_files:
    if not LEFT_ARM_RE.search(leftarm):
        continue
    rightarm = LEFT_ARM_RE.sub(r'_2\1', leftarm)
    if rightarm not in fastq_files:
        # Left arm without a matching right arm: skipped.
        continue
    # One zero-padded, sequentially numbered directory per read pair.
    sdf_dir = os.path.join(tmp_dir_base, '%08d' % len(tmp_dirs))
    tmp_dirs.append(sdf_dir)
    pyrtg.run_rtg('format', sdf_dir,
                  ['-f', 'fastq',
                   '-q', 'sanger',
                   '-l', os.path.join(fastq_path, leftarm),
                   '-r', os.path.join(fastq_path, rightarm)])

# split sdf
# NOTE(review): '-n 1500000' is presumably the number of reads per output
# chunk -- confirm against the rtg sdfsplit documentation.
pyrtg.run_rtg('sdfsplit', out_dir,
              ['-n', '1500000'] + tmp_dirs)

# store output
out = arvados.CollectionWriter()
out.write_directory_tree(out_dir, max_manifest_depth=1)
this_task.set_output(out.finish())