#!/usr/bin/env python
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Crunch script: convert paired FASTQ files to RTG SDF and split them.

Reads the collection named by the job's ``input`` script parameter,
runs ``rtg format`` on every left/right FASTQ pair found under its
``fastq`` path, runs ``rtg sdfsplit`` over the resulting SDF
directories, and stores the split output as this task's output.
"""

import os
import re
import sys

# Suffix that marks the left arm of a read pair: "_1" followed by
# ".fq" or ".fastq", optionally ".gz".  The dots are escaped so that
# only a literal "." is accepted.  Group 1 captures the extension so it
# can be carried over unchanged when deriving the right-arm name.
LEFT_ARM_RE = re.compile(r'_1(\.f(ast)?q(\.gz)?)$')


def find_read_pairs(filenames):
    """Return ``(leftarm, rightarm)`` tuples for every complete pair.

    A file is a left arm when its name ends in ``_1.fq``, ``_1.fastq``,
    ``_1.fq.gz`` or ``_1.fastq.gz``.  Its right arm is the same name
    with ``_1`` replaced by ``_2``.  Left arms whose right arm is not
    present in *filenames* are skipped.

    *filenames* may be any iterable (including a one-shot iterator);
    pairs are returned in the order the left arms appear in it.
    """
    # Materialize once: the input is both iterated and searched, which
    # would silently drop entries if it were a one-shot iterator.
    names = list(filenames)
    known = set(names)
    pairs = []
    for leftarm in names:
        if not LEFT_ARM_RE.search(leftarm):
            continue
        rightarm = LEFT_ARM_RE.sub(r'_2\1', leftarm)
        if rightarm in known:
            pairs.append((leftarm, rightarm))
    return pairs


def main():
    """Run the FASTQ -> SDF -> split pipeline for the current task."""
    # Imported here rather than at module level so that
    # find_read_pairs() stays importable where the Arvados SDK and
    # pyrtg are not installed.
    import arvados
    import pyrtg

    this_job = arvados.current_job()
    this_task = arvados.current_task()

    fastq_path = arvados.util.collection_extract(
        collection=this_job['script_parameters']['input'],
        path='fastq')
    # '.locator' is excluded from the listing; it is not an input file.
    fastq_files = [f for f in os.listdir(fastq_path) if f != '.locator']

    tmp_dir_base = os.path.join(this_task.tmpdir, 'tmp')
    out_dir = os.path.join(this_task.tmpdir, 'reads')

    # Start from a clean slate: remove both directories, then recreate
    # only the temporary base (out_dir is passed to rtg as-is).
    arvados.util.run_command(['rm', '-rf', tmp_dir_base], stderr=sys.stderr)
    arvados.util.run_command(['rm', '-rf', out_dir], stderr=sys.stderr)
    os.mkdir(tmp_dir_base)

    # convert fastq to sdf: one zero-padded, sequentially numbered SDF
    # directory per read pair
    tmp_dirs = []
    for leftarm, rightarm in find_read_pairs(fastq_files):
        sdf_dir = '%s/%08d' % (tmp_dir_base, len(tmp_dirs))
        tmp_dirs.append(sdf_dir)
        pyrtg.run_rtg('format', sdf_dir,
                      ['-f', 'fastq',
                       '-q', 'sanger',
                       '-l', os.path.join(fastq_path, leftarm),
                       '-r', os.path.join(fastq_path, rightarm)])

    # split sdf
    # NOTE(review): if no pairs were found, sdfsplit is invoked with no
    # input directories, exactly as before -- confirm whether an
    # explicit error would be preferable.
    # NOTE(review): '-n 1500000' is presumably the number of reads per
    # output SDF -- confirm against the RTG documentation.
    pyrtg.run_rtg('sdfsplit', out_dir,
                  ['-n', '1500000'] + tmp_dirs)

    # store output
    out = arvados.CollectionWriter()
    out.write_directory_tree(out_dir, max_manifest_depth=1)
    this_task.set_output(out.finish())


if __name__ == '__main__':
    main()