#!/usr/bin/env python # Copyright (C) The Arvados Authors. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 import arvados import os import re arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True) this_job = arvados.current_job() this_task = arvados.current_task() gatk_path = arvados.util.tarball_extract( tarball = this_job['script_parameters']['gatk_binary_tarball'], path = 'gatk') bundle_path = arvados.util.collection_extract( collection = this_job['script_parameters']['gatk_bundle'], path = 'gatk-bundle', files = ['human_g1k_v37.dict', 'human_g1k_v37.fasta', 'human_g1k_v37.fasta.fai']) this_task_input = this_task['parameters']['input'] input_file = list(arvados.CollectionReader(this_task_input).all_files())[0] # choose vcf temporary file names vcf_in = os.path.join(arvados.current_task().tmpdir, os.path.basename(input_file.name())) vcf_out = re.sub('(.*)\\.vcf', '\\1-filtered.vcf', vcf_in) # fetch the unfiltered data vcf_in_file = open(vcf_in, 'w') for buf in input_file.readall(): vcf_in_file.write(buf) vcf_in_file.close() stdoutdata, stderrdata = arvados.util.run_command( ['java', '-Xmx1g', '-jar', os.path.join(gatk_path,'GenomeAnalysisTK.jar'), '-T', 'VariantFiltration', '--variant', vcf_in, '--out', vcf_out, '--filterExpression', 'QD < 2.0', '--filterName', 'GATK_QD', '--filterExpression', 'MQ < 40.0', '--filterName', 'GATK_MQ', '--filterExpression', 'FS > 60.0', '--filterName', 'GATK_FS', '--filterExpression', 'MQRankSum < -12.5', '--filterName', 'GATK_MQRankSum', '--filterExpression', 'ReadPosRankSum < -8.0', '--filterName', 'GATK_ReadPosRankSum', '-R', os.path.join(bundle_path, 'human_g1k_v37.fasta')], cwd=arvados.current_task().tmpdir) # store the filtered data with open(vcf_out, 'rb') as f: out = arvados.CollectionWriter() while True: buf = f.read() if len(buf) == 0: break out.write(buf) out.set_current_file_name(os.path.basename(vcf_out)) this_task.set_output(out.finish())