# Flat job script: stages one VCF input file into the task's tmpdir, runs the
# GATK VariantFiltration tool on it, and records the filtered file as the
# task's output.
#
# NOTE(review): this is a partial listing. Every line starts with the number it
# had in the original file, and gaps in that numbering (e.g. 13, 16, 31-32, 34,
# 37, 54-58, 60) are lines that are not shown here. Several statements below
# are therefore visibly incomplete; comments that depend on the hidden lines
# are marked as assumptions to confirm against the full file.

# Presumably fans the job out into one task per input file and ends the
# current (parent) task -- confirm against the Arvados SDK documentation.
7 arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True)
# Job and task records; script parameters are read from the job, the input
# locator from the task.
9 this_job = arvados.current_job()
10 this_task = arvados.current_task()
# Extract the tarball named by the 'gatk_binary_tarball' script parameter.
# gatk_path is later joined with 'GenomeAnalysisTK.jar', so it is presumably
# the extraction directory. The closing argument(s) of this call (original
# line 13) are not shown.
11 gatk_path = arvados.util.tarball_extract(
12 tarball = this_job['script_parameters']['gatk_binary_tarball'],
# Extract the three reference files listed in `files` from the collection
# named by the 'gatk_bundle' script parameter. bundle_path is later joined
# with 'human_g1k_v37.fasta'. Original line 16 (between the two visible
# arguments) is not shown.
14 bundle_path = arvados.util.collection_extract(
15 collection = this_job['script_parameters']['gatk_bundle'],
17 files = ['human_g1k_v37.dict', 'human_g1k_v37.fasta', 'human_g1k_v37.fasta.fai'])
18 this_task_input = this_task['parameters']['input']
# Only the first file yielded by all_files() is used; any further files in the
# task's input collection are ignored by this script.
20 input_file = list(arvados.CollectionReader(this_task_input).all_files())[0]
22 # choose vcf temporary file names
# vcf_in: <task tmpdir>/<basename of the input file>.
23 vcf_in = os.path.join(arvados.current_task().tmpdir,
24 os.path.basename(input_file.name()))
# vcf_out: vcf_in with '-filtered' inserted before '.vcf'.
# NOTE(review): the pattern is unanchored and re.sub returns its input
# unchanged when nothing matches, so for an input name that does not contain
# '.vcf' vcf_out is identical to vcf_in.
25 vcf_out = re.sub('(.*)\\.vcf', '\\1-filtered.vcf', vcf_in)
27 # fetch the unfiltered data
# Copy the input into the local file vcf_in, one buffer at a time.
# NOTE(review): no close() for vcf_in_file is visible in this listing
# (original lines 31-32 are not shown); the file needs to be closed/flushed
# before the command below reads it -- confirm in the full file.
28 vcf_in_file = open(vcf_in, 'w')
29 for buf in input_file.readall():
30 vcf_in_file.write(buf)
# Run GATK VariantFiltration on vcf_in with the task tmpdir as working
# directory. Each '--filterExpression' is immediately followed by the
# '--filterName' it is paired with (five pairs: QD, MQ, FS, MQRankSum,
# ReadPosRankSum); '-R' points at the reference FASTA from the bundle.
# The start of the argument list (original line 34, presumably the opening
# '[' and the java invocation) and original line 37 are not shown.
# NOTE(review): no visible argument passes vcf_out to GATK, yet vcf_out is
# opened for reading below -- presumably hidden line 37 supplies it; confirm.
# stdoutdata and stderrdata are not used in any visible line.
33 stdoutdata, stderrdata = arvados.util.run_command(
35 '-jar', os.path.join(gatk_path,'GenomeAnalysisTK.jar'),
36 '-T', 'VariantFiltration', '--variant', vcf_in,
38 '--filterExpression', 'QD < 2.0',
39 '--filterName', 'GATK_QD',
40 '--filterExpression', 'MQ < 40.0',
41 '--filterName', 'GATK_MQ',
42 '--filterExpression', 'FS > 60.0',
43 '--filterName', 'GATK_FS',
44 '--filterExpression', 'MQRankSum < -12.5',
45 '--filterName', 'GATK_MQRankSum',
46 '--filterExpression', 'ReadPosRankSum < -8.0',
47 '--filterName', 'GATK_ReadPosRankSum',
48 '-R', os.path.join(bundle_path, 'human_g1k_v37.fasta')],
49 cwd=arvados.current_task().tmpdir)
51 # store the filtered data
# Open the filtered VCF in binary mode and create a CollectionWriter for it.
# Original lines 54-58 are not shown; presumably they copy the contents of `f`
# into `out` -- confirm in the full file.
52 with open(vcf_out, 'rb') as f:
53 out = arvados.CollectionWriter()
# Name the stored file after vcf_out's basename, then record the value
# returned by out.finish() as this task's output.
59 out.set_current_file_name(os.path.basename(vcf_out))
61 this_task.set_output(out.finish())