2 # Copyright (C) The Arvados Authors. All rights reserved.
4 # SPDX-License-Identifier: Apache-2.0
# Task script: pair up the left/right FASTQ files found in the job's input
# collection, run the RTG 'format' step on each pair (see "convert fastq to
# sdf" below), run 'sdfsplit' over the results, and store the resulting
# directory as this task's output.
#
# NOTE(review): this listing has gaps -- the closing of the
# collection_extract() call, the initialisation of tmp_dirs and the start of
# the argument list passed to run_rtg('format', ...) are not visible here.
# Confirm against the complete file before changing any code.
12 this_job = arvados.current_job()
13 this_task = arvados.current_task()
# Extract the collection named by the job's 'input' script parameter.
# fastq_path is used below as the directory that holds the FASTQ files.
14 fastq_path = arvados.util.collection_extract(
15 collection = this_job['script_parameters']['input'],
# Every directory entry except '.locator' is treated as a candidate file.
# NOTE(review): on Python 3, filter() returns a one-shot iterator, so the
# `rightarm in fastq_files` test inside the loop below would consume entries
# from the same iterator the loop is walking. Wrap in list() if this script
# is ever run on Python 3 -- confirm the target interpreter.
17 fastq_files = filter(lambda f: f != '.locator', os.listdir(fastq_path))
# Scratch directory for the per-pair output and the final output directory,
# both located under the task's tmpdir.
18 tmp_dir_base = os.path.join(arvados.current_task().tmpdir, 'tmp')
19 out_dir = os.path.join(arvados.current_task().tmpdir, 'reads')
# Remove anything already present at both paths, then recreate only the
# scratch directory. out_dir is not recreated here -- presumably 'sdfsplit'
# creates it itself; TODO confirm.
21 arvados.util.run_command(['rm', '-rf', tmp_dir_base], stderr=sys.stderr)
22 arvados.util.run_command(['rm', '-rf', out_dir], stderr=sys.stderr)
23 os.mkdir(tmp_dir_base)
25 # convert fastq to sdf
# A file whose name ends in _1.fq or _1.fastq (optionally followed by .gz) is
# taken as a left arm; the matching right arm has the same name with "_1"
# replaced by "_2". A left arm whose right arm is not listed is skipped.
# NOTE(review): the '.' characters in both patterns are unescaped, so they
# match any character rather than only a literal dot.
27 for leftarm in fastq_files:
28 if re.search(r'_1.f(ast)?q(.gz)?$', leftarm):
29 rightarm = re.sub(r'_1(.f(ast)?q(.gz)?)$', '_2\\1', leftarm)
30 if rightarm in fastq_files:
# Each pair gets its own directory under tmp_dir_base, named with a
# zero-padded 8-digit sequence number (the count of pairs seen so far).
31 tmp_dirs += ['%s/%08d' % (tmp_dir_base, len(tmp_dirs))]
32 pyrtg.run_rtg('format', tmp_dirs[-1],
35 '-l', os.path.join(fastq_path, leftarm),
36 '-r', os.path.join(fastq_path, rightarm)])
# Run 'sdfsplit' over all per-pair directories, writing into out_dir.
# '-n 1500000' presumably sets the number of reads per output chunk --
# TODO confirm against the RTG documentation.
39 pyrtg.run_rtg('sdfsplit', out_dir,
40 ['-n', '1500000'] + tmp_dirs)
# Write the out_dir tree into a new collection and record the finished
# collection as this task's output.
43 out = arvados.CollectionWriter()
44 out.write_directory_tree(out_dir, max_manifest_depth=1)
45 this_task.set_output(out.finish())