Merge branch '3373-improve-gatk3-snv-pipeline' closes #3373
[arvados.git] / apps / workbench / app / helpers / vcf_pipeline_helper.rb
1 module VcfPipelineHelper
2   require 'csv'
3
4   def reset_vcf_pipeline_instance(pi, input_manifest)
5     params = {
6       'PICARD_ZIP' => '7a4073e29bfa87154b7102e75668c454+83+K@van',
7       'GATK_BUNDLE' => '0a37aaf212464efa2a77ff9ba51c0148+10524+K@van',
8       'GATK_TAR_BZ2' => '482ebab0408e173370c499f0b7c00878+93+K@van',
9       'BWA' => '73be5598809c66f260fedd253c8608bd+67+K@van',
10       'SAM' => '55d2115faa608eb95dab4f875b7511b1+72+K@van',
11       'REGION_PADDING' => '10',
12       'REGIONS' => 'e52c086f41c2f089d88ec2bbd45355d3+87+K@van/SeqCap_EZ_Exome_v2.hg19.bed',
13       'STAND_CALL_CONF' => '4.0',
14       'STAND_EMIT_CONF' => '4.0',
15       "bwa/INPUT" => input_manifest
16     }
17     pi.components = PipelineTemplate.find(pi.pipeline_uuid).components
18     pi.update_job_parameters(params)
19     pi.active = true
20     pi.success = nil
21   end
22
23   def vcf_pipeline_summary(pi)
24     stats = {}
25     collection_link = Link.
26       where(head_uuid: pi.uuid,
27             link_class: 'client-defined',
28             name: 'vcffarm-pipeline-invocation').
29       last
30     if collection_link
31       stats[:collection_uuid] = collection_link.tail_uuid
32     else
33       pi.components[:steps].each do |step|
34         if step[:name] == 'bwa'
35           step[:params].each do |param|
36             if param[:name] == 'INPUT'
37               stats[:collection_uuid] = param[:data_locator] || param[:value]
38               break
39             end
40           end
41         end
42       end
43     end
44     if stats[:collection_uuid]
45       Link.where(tail_uuid: stats[:collection_uuid],
46                  head_kind: Group)[0..0].each do |c2p|
47         stats[:project_uuid] = c2p.head_uuid
48         group = Group.find stats[:project_uuid]
49         stats[:project_name] = group.name rescue nil
50       end
51       Link.where(tail_uuid: stats[:collection_uuid],
52                  head_kind: Specimen)[0..0].each do |c2s|
53         stats[:specimen_uuid] = c2s.head_uuid
54         specimen = Specimen.find stats[:specimen_uuid]
55         stats[:specimen_id] = specimen.properties[:specimen_id] rescue nil
56       end
57     end
58     stats[:runtime] = {}
59     stats[:alignment_for_step] = {}
60     stats[:alignment] = {}
61     stats[:coverage] = []
62     pi.components[:steps].each do |step|
63       if step[:warehousejob]
64         if step[:name] == 'bwa' and step[:warehousejob][:starttime]
65           stats[:runtime][:started_at] = step[:warehousejob][:starttime]
66         end
67         if step[:warehousejob][:finishtime]
68           stats[:runtime][:finished_at] =
69             [ step[:warehousejob][:finishtime],
70               stats[:runtime][:finished_at] ].compact.max
71         end
72       end
73       if step[:name] == 'picard-casm' and
74           step[:complete] and
75           step[:output_data_locator]
76         tsv = IO.
77           popen("whget -r #{step[:output_data_locator]}/ -").
78           readlines.
79           collect { |x| x.strip.split "\t" }
80         casm = {}
81         head = []
82         tsv.each do |data|
83           if data.size < 4 or data[0].match /^\#/
84             next
85           elsif data[0] == 'CATEGORY' or data[1].match /[^\d\.]/
86             head = data
87           elsif data[0] == 'PAIR'
88             head.each_with_index do |name, index|
89               x = data[index]
90               if x and x.match /^\d+$/
91                 x = x.to_i
92               elsif x and x.match /^\d+\.\d+$/
93                 x = x.to_f
94               end
95               name = name.downcase.to_sym
96               casm[name] ||= []
97               casm[name] << x
98             end
99           end
100         end
101         stats[:picard_alignment_summary] = casm
102       end
103       if step[:name] == 'gatk-stats' and
104           step[:complete] and
105           step[:output_data_locator]
106         csv = IO.
107           popen("whget #{step[:output_data_locator]}/mincoverage_nlocus.csv").
108           readlines.
109           collect { |x| x.strip.split ',' }
110         csv.each do |depth, nlocus, percent|
111           stats[:coverage][depth.to_i] = nlocus.to_i
112         end
113       end
114       if step[:name] == 'gatk-realign' and
115           step[:complete] and
116           step[:output_data_locator]
117         logs = IO.
118           popen("whget #{step[:warehousejob][:metakey]}").
119           readlines.
120           collect(&:strip)
121         logs.each do |logline|
122           if (re = logline.match /\s(\d+) stderr INFO .* (\d+) reads were filtered out.*of (\d+) total/)
123             stats[:alignment_for_step][re[1]] ||= {}
124             stats[:alignment_for_step][re[1]][:filtered_reads] = re[2].to_i
125             stats[:alignment_for_step][re[1]][:total_reads] = re[3].to_i
126           elsif (re = logline.match /(\d+) reads.* failing BadMate/)
127             stats[:alignment][:bad_mate_reads] = re[1].to_i
128           elsif (re = logline.match /(\d+) reads.* failing MappingQualityZero/)
129             stats[:alignment][:mapq0_reads] = re[1].to_i
130           end
131         end
132       end
133       if step[:name] == 'gatk-merge-call' and
134           step[:complete] and
135           step[:output_data_locator]
136         stats[:vcf_file_name] = "#{stats[:project_name]}-#{stats[:specimen_id]}-#{step[:output_data_locator][0..31]}.vcf"
137         logs = IO.
138           popen("whget #{step[:warehousejob][:metakey]}").
139           readlines.
140           collect(&:strip)
141         logs.each do |logline|
142           if (re = logline.match /(\d+) reads were filtered out.*of (\d+) total/)
143             stats[:alignment][:filtered_reads] = re[1].to_i
144             stats[:alignment][:total_realigned_reads] = re[2].to_i
145           elsif (re = logline.match /(\d+) reads.* failing BadMate/)
146             stats[:alignment][:bad_mate_reads] = re[1].to_i
147           elsif (re = logline.match /(\d+) reads.* failing UnmappedRead/)
148             stats[:alignment][:unmapped_reads] = re[1].to_i
149           end
150         end
151
152         stats[:chromosome_calls] = {}
153         tsv = IO.
154           popen("whget #{step[:output_data_locator]}/merged.vcf | egrep -v '^#' | cut -f1 | uniq -c").
155           readlines.
156           collect { |x| x.strip.split }
157         tsv.each do |n_variants, sequence_name|
158           stats[:chromosome_calls][sequence_name] = n_variants.to_i
159         end
160
161         stats[:inferred_sex] = false
162         calls = stats[:chromosome_calls]
163         if calls['X'] and calls['X'] > 200
164           if !calls['Y']
165             stats[:inferred_sex] = 'female'
166           elsif calls['Y'] * 60 < calls['X']
167             # if Y < X/60 they are presumed to be misalignments
168             stats[:inferred_sex] = 'female'
169           elsif calls['Y'] * 25 > calls['X']
170             # if Y > X/25 we presume a Y chromosome was present
171             stats[:inferred_sex] = 'male'
172           end
173         end
174       end
175     end
176     stats[:alignment][:total_reads] = 0
177     stats[:alignment][:filtered_reads] ||= 0
178     stats[:alignment][:bad_mate_reads] ||= 0
179     stats[:alignment][:mapq0_reads] ||= 0
180     stats[:alignment_for_step].values.each do |a4s|
181       stats[:alignment][:total_reads] += (a4s[:total_reads] || 0)
182       stats[:alignment][:filtered_reads] += (a4s[:filtered_reads] || 0)
183       stats[:alignment][:bad_mate_reads] += (a4s[:bad_mate_reads] || 0)
184       stats[:alignment][:mapq0_reads] += (a4s[:mapq0_reads] || 0)
185     end
186
187     if stats[:collection_uuid]
188       csv = CSV.parse IO.
189         popen("whget #{stats[:collection_uuid]}/SampleSheet.csv -").
190         read
191       if !csv.empty?
192         pivoted = []
193         csv[0].each_with_index do |head, col|
194           pivoted << csv.collect { |row| row[col] }
195         end
196         stats[:source_data_csv_columns] = pivoted
197       end
198     end
199
200     picardas = stats[:picard_alignment_summary]
201     stats[:summary_csv_columns] =
202       [['PROJECT', stats[:project_name]],
203        ['SPECIMEN', stats[:specimen_id]],
204        ['VCF_FILE_NAME', stats[:vcf_file_name]],
205        ['INFERRED_SEX', stats[:inferred_sex]],
206        ['SOURCE_DATA', stats[:collection_uuid]],
207        ['PIPELINE_UUID', pi.pipeline_uuid],
208        ['PIPELINE_RUN_UUID', pi.uuid],
209        ['PIPELINE_RUN_START', (stats[:runtime][:started_at] rescue nil)],
210        ['PIPELINE_RUN_FINISH', (stats[:runtime][:finished_at] rescue nil)],
211        ['N_READS_RAW',
212         (n_raw = picardas[:total_reads].inject(0,:+) rescue nil)],
213        ['N_READS_MAPPED',
214         (n_mapped = picardas[:reads_aligned_in_pairs].inject(0,:+) rescue nil)],
215        ['PERCENT_READS_MAPPED',
216         (100.0 * n_mapped / n_raw rescue nil)],
217        ['N_READS_ON_TARGET',
218         (n_on_target = stats[:alignment][:total_reads] - stats[:alignment][:filtered_reads] rescue nil)],
219        ['PERCENT_READS_ON_TARGET',
220         (100.0 * n_on_target / n_raw rescue nil)],
221        ['PERCENT_TARGET_COVERAGE_1X',
222         (100.0 * stats[:coverage][1] / stats[:coverage][0] rescue nil)],
223        ['PERCENT_TARGET_COVERAGE_10X',
224         (100.0 * stats[:coverage][10] / stats[:coverage][0] rescue nil)],
225        ['PERCENT_TARGET_COVERAGE_20X',
226         (100.0 * stats[:coverage][20] / stats[:coverage][0] rescue nil)],
227        ['PERCENT_TARGET_COVERAGE_50X',
228         (100.0 * stats[:coverage][50] / stats[:coverage][0] rescue nil)],
229        ['PERCENT_TARGET_COVERAGE_100X',
230         (100.0 * stats[:coverage][100] / stats[:coverage][0] rescue nil)]]
231
232     stats
233   end
234 end