add GATK2 VariantFiltration example crunch script
authorTom Clegg <tom@clinicalfuture.com>
Thu, 11 Jul 2013 15:56:14 +0000 (15:56 +0000)
committerTom Clegg <tom@clinicalfuture.com>
Thu, 11 Jul 2013 15:58:07 +0000 (15:58 +0000)
crunch_scripts/GATK2-VariantFiltration [new file with mode: 0755]

diff --git a/crunch_scripts/GATK2-VariantFiltration b/crunch_scripts/GATK2-VariantFiltration
new file mode 100755 (executable)
index 0000000..75af612
--- /dev/null
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+import arvados
+import os
+import re
+
+arvados.job_setup.one_task_per_input_file(if_sequence=0, and_end_task=True)
+
+this_job = arvados.current_job()
+this_task = arvados.current_task()
+gatk_path = arvados.util.tarball_extract(
+    tarball = this_job['script_parameters']['gatk_binary_tarball'],
+    path = 'gatk')
+bundle_path = arvados.util.collection_extract(
+    collection = this_job['script_parameters']['gatk_bundle'],
+    path = 'gatk-bundle',
+    files = ['human_g1k_v37.dict', 'human_g1k_v37.fasta', 'human_g1k_v37.fasta.fai'])
+this_task_input = this_task['parameters']['input']
+
+input_file = list(arvados.CollectionReader(this_task_input).all_files())[0]
+
+# pick "before" and "after" vcf filenames
+vcf_in = os.path.join(arvados.current_task().tmpdir,
+                      os.path.basename(input_file.name()))
+vcf_out = re.sub('(.*)\\.vcf', '\\1-filtered.vcf', vcf_in)
+
+# fetch the "before" data
+vcf_in_file = open(vcf_in, 'w')
+for buf in input_file.readall():
+    vcf_in_file.write(buf)
+vcf_in_file.close()
+
+stdoutdata, stderrdata = arvados.util.run_command(
+    ['java', '-Xmx1g',
+     '-jar', os.path.join(gatk_path,'GenomeAnalysisTK.jar'),
+     '-T', 'VariantFiltration', '--variant', vcf_in,
+     '--out', vcf_out,
+     '--filterExpression', 'QD < 2.0',
+     '--filterName', 'GATK_QD',
+     '--filterExpression', 'MQ < 40.0',
+     '--filterName', 'GATK_MQ',
+     '--filterExpression', 'FS > 60.0',
+     '--filterName', 'GATK_FS',
+     '--filterExpression', 'MQRankSum < -12.5',
+     '--filterName', 'GATK_MQRankSum',
+     '--filterExpression', 'ReadPosRankSum < -8.0',
+     '--filterName', 'GATK_ReadPosRankSum',
+     '-R', os.path.join(bundle_path, 'human_g1k_v37.fasta')],
+    cwd=arvados.current_task().tmpdir)
+
+with open(vcf_out, 'rb') as f:
+    out = arvados.CollectionWriter()
+    while True:
+        buf = f.read()
+        if len(buf) == 0:
+            break
+        out.write(buf)
+out.set_current_file_name(os.path.basename(vcf_out))
+this_task.set_output(out.finish())