apps/workbench/app/helpers/vcf_pipeline_helper.rb

   1 module VcfPipelineHelper
   2   require 'csv'
   3
   4   def reset_vcf_pipeline_instance(pi, input_manifest)
   5     params = {
   6       'PICARD_ZIP' => '7a4073e29bfa87154b7102e75668c454+83+K@van',
   7       'GATK_BUNDLE' => '0a37aaf212464efa2a77ff9ba51c0148+10524+K@van',
   8       'GATK_TAR_BZ2' => '482ebab0408e173370c499f0b7c00878+93+K@van',
   9       'BWA' => '73be5598809c66f260fedd253c8608bd+67+K@van',
  10       'SAM' => '55d2115faa608eb95dab4f875b7511b1+72+K@van',
  11       'REGION_PADDING' => '10',
  12       'REGIONS' => 'e52c086f41c2f089d88ec2bbd45355d3+87+K@van/SeqCap_EZ_Exome_v2.hg19.bed',
  13       'STAND_CALL_CONF' => '4.0',
  14       'STAND_EMIT_CONF' => '4.0',
  15       "bwa/INPUT" => input_manifest
  16     }
  17     pi.components = PipelineTemplate.find(pi.pipeline_uuid).components
  18     pi.update_job_parameters(params)
  19     pi.active = true
  20     pi.success = nil
  21   end
  22
  23   def vcf_pipeline_summary(pi)
  24     stats = {}
  25     collection_link = Link.
  26       where(head_uuid: pi.uuid,
  27             link_class: 'client-defined',
  28             name: 'vcffarm-pipeline-invocation').
  29       last
  30     if collection_link
  31       stats[:collection_uuid] = collection_link.tail_uuid
  32     else
  33       pi.components[:steps].each do |step|
  34         if step[:name] == 'bwa'
  35           step[:params].each do |param|
  36             if param[:name] == 'INPUT'
  37               stats[:collection_uuid] = param[:data_locator] || param[:value]
  38               break
  39             end
  40           end
  41         end
  42       end
  43     end
  44     if stats[:collection_uuid]
  45       Link.where(tail_uuid: stats[:collection_uuid],
  46                  head_kind: Group)[0..0].each do |c2p|
  47         stats[:project_uuid] = c2p.head_uuid
  48         group = Group.find stats[:project_uuid]
  49         stats[:project_name] = group.name rescue nil
  50       end
  51       Link.where(tail_uuid: stats[:collection_uuid],
  52                  head_kind: Specimen)[0..0].each do |c2s|
  53         stats[:specimen_uuid] = c2s.head_uuid
  54         specimen = Specimen.find stats[:specimen_uuid]
  55         stats[:specimen_id] = specimen.properties[:specimen_id] rescue nil
  56       end
  57     end
  58     stats[:runtime] = {}
  59     stats[:alignment_for_step] = {}
  60     stats[:alignment] = {}
  61     stats[:coverage] = []
  62     pi.components[:steps].each do |step|
  63       if step[:warehousejob]
  64         if step[:name] == 'bwa' and step[:warehousejob][:starttime]
  65           stats[:runtime][:started_at] = step[:warehousejob][:starttime]
  66         end
  67         if step[:warehousejob][:finishtime]
  68           stats[:runtime][:finished_at] =
  69             [ step[:warehousejob][:finishtime],
  70               stats[:runtime][:finished_at] ].compact.max
  71         end
  72       end
  73       if step[:name] == 'picard-casm' and
  74           step[:complete] and
  75           step[:output_data_locator]
  76         tsv = IO.
  77           popen("whget -r #{step[:output_data_locator]}/ -").
  78           readlines.
  79           collect { |x| x.strip.split "\t" }
  80         casm = {}
  81         head = []
  82         tsv.each do |data|
  83           if data.size < 4 or data[0].match /^\#/
  84             next
  85           elsif data[0] == 'CATEGORY' or data[1].match /[^\d\.]/
  86             head = data
  87           elsif data[0] == 'PAIR'
  88             head.each_with_index do |name, index|
  89               x = data[index]
  90               if x and x.match /^\d+$/
  91                 x = x.to_i
  92               elsif x and x.match /^\d+\.\d+$/
  93                 x = x.to_f
  94               end
  95               name = name.downcase.to_sym
  96               casm[name] ||= []
  97               casm[name] << x
  98             end
  99           end
 100         end
 101         stats[:picard_alignment_summary] = casm
 102       end
 103       if step[:name] == 'gatk-stats' and
 104           step[:complete] and
 105           step[:output_data_locator]
 106         csv = IO.
 107           popen("whget #{step[:output_data_locator]}/mincoverage_nlocus.csv").
 108           readlines.
 109           collect { |x| x.strip.split ',' }
 110         csv.each do |depth, nlocus, percent|
 111           stats[:coverage][depth.to_i] = nlocus.to_i
 112         end
 113       end
 114       if step[:name] == 'gatk-realign' and
 115           step[:complete] and
 116           step[:output_data_locator]
 117         logs = IO.
 118           popen("whget #{step[:warehousejob][:metakey]}").
 119           readlines.
 120           collect(&:strip)
 121         logs.each do |logline|
 122           if (re = logline.match /\s(\d+) stderr INFO .* (\d+) reads were filtered out.*of (\d+) total/)
 123             stats[:alignment_for_step][re[1]] ||= {}
 124             stats[:alignment_for_step][re[1]][:filtered_reads] = re[2].to_i
 125             stats[:alignment_for_step][re[1]][:total_reads] = re[3].to_i
 126           elsif (re = logline.match /(\d+) reads.* failing BadMate/)
 127             stats[:alignment][:bad_mate_reads] = re[1].to_i
 128           elsif (re = logline.match /(\d+) reads.* failing MappingQualityZero/)
 129             stats[:alignment][:mapq0_reads] = re[1].to_i
 130           end
 131         end
 132       end
 133       if step[:name] == 'gatk-merge-call' and
 134           step[:complete] and
 135           step[:output_data_locator]
 136         stats[:vcf_file_name] = "#{stats[:project_name]}-#{stats[:specimen_id]}-#{step[:output_data_locator][0..31]}.vcf"
 137         logs = IO.
 138           popen("whget #{step[:warehousejob][:metakey]}").
 139           readlines.
 140           collect(&:strip)
 141         logs.each do |logline|
 142           if (re = logline.match /(\d+) reads were filtered out.*of (\d+) total/)
 143             stats[:alignment][:filtered_reads] = re[1].to_i
 144             stats[:alignment][:total_realigned_reads] = re[2].to_i
 145           elsif (re = logline.match /(\d+) reads.* failing BadMate/)
 146             stats[:alignment][:bad_mate_reads] = re[1].to_i
 147           elsif (re = logline.match /(\d+) reads.* failing UnmappedRead/)
 148             stats[:alignment][:unmapped_reads] = re[1].to_i
 149           end
 150         end
 151
 152         stats[:chromosome_calls] = {}
 153         tsv = IO.
 154           popen("whget #{step[:output_data_locator]}/merged.vcf | egrep -v '^#' | cut -f1 | uniq -c").
 155           readlines.
 156           collect { |x| x.strip.split }
 157         tsv.each do |n_variants, sequence_name|
 158           stats[:chromosome_calls][sequence_name] = n_variants.to_i
 159         end
 160
 161         stats[:inferred_sex] = false
 162         calls = stats[:chromosome_calls]
 163         if calls['X'] and calls['X'] > 200
 164           if !calls['Y']
 165             stats[:inferred_sex] = 'female'
 166           elsif calls['Y'] * 60 < calls['X']
 167             # if Y < X/60 they are presumed to be misalignments
 168             stats[:inferred_sex] = 'female'
 169           elsif calls['Y'] * 25 > calls['X']
 170             # if Y > X/25 we presume a Y chromosome was present
 171             stats[:inferred_sex] = 'male'
 172           end
 173         end
 174       end
 175     end
 176     stats[:alignment][:total_reads] = 0
 177     stats[:alignment][:filtered_reads] ||= 0
 178     stats[:alignment][:bad_mate_reads] ||= 0
 179     stats[:alignment][:mapq0_reads] ||= 0
 180     stats[:alignment_for_step].values.each do |a4s|
 181       stats[:alignment][:total_reads] += (a4s[:total_reads] || 0)
 182       stats[:alignment][:filtered_reads] += (a4s[:filtered_reads] || 0)
 183       stats[:alignment][:bad_mate_reads] += (a4s[:bad_mate_reads] || 0)
 184       stats[:alignment][:mapq0_reads] += (a4s[:mapq0_reads] || 0)
 185     end
 186
 187     if stats[:collection_uuid]
 188       csv = CSV.parse IO.
 189         popen("whget #{stats[:collection_uuid]}/SampleSheet.csv -").
 190         read
 191       if !csv.empty?
 192         pivoted = []
 193         csv[0].each_with_index do |head, col|
 194           pivoted << csv.collect { |row| row[col] }
 195         end
 196         stats[:source_data_csv_columns] = pivoted
 197       end
 198     end
 199
 200     picardas = stats[:picard_alignment_summary]
 201     stats[:summary_csv_columns] =
 202       [['PROJECT', stats[:project_name]],
 203        ['SPECIMEN', stats[:specimen_id]],
 204        ['VCF_FILE_NAME', stats[:vcf_file_name]],
 205        ['INFERRED_SEX', stats[:inferred_sex]],
 206        ['SOURCE_DATA', stats[:collection_uuid]],
 207        ['PIPELINE_UUID', pi.pipeline_uuid],
 208        ['PIPELINE_RUN_UUID', pi.uuid],
 209        ['PIPELINE_RUN_START', (stats[:runtime][:started_at] rescue nil)],
 210        ['PIPELINE_RUN_FINISH', (stats[:runtime][:finished_at] rescue nil)],
 211        ['N_READS_RAW',
 212         (n_raw = picardas[:total_reads].inject(0,:+) rescue nil)],
 213        ['N_READS_MAPPED',
 214         (n_mapped = picardas[:reads_aligned_in_pairs].inject(0,:+) rescue nil)],
 215        ['PERCENT_READS_MAPPED',
 216         (100.0 * n_mapped / n_raw rescue nil)],
 217        ['N_READS_ON_TARGET',
 218         (n_on_target = stats[:alignment][:total_reads] - stats[:alignment][:filtered_reads] rescue nil)],
 219        ['PERCENT_READS_ON_TARGET',
 220         (100.0 * n_on_target / n_raw rescue nil)],
 221        ['PERCENT_TARGET_COVERAGE_1X',
 222         (100.0 * stats[:coverage][1] / stats[:coverage][0] rescue nil)],
 223        ['PERCENT_TARGET_COVERAGE_10X',
 224         (100.0 * stats[:coverage][10] / stats[:coverage][0] rescue nil)],
 225        ['PERCENT_TARGET_COVERAGE_20X',
 226         (100.0 * stats[:coverage][20] / stats[:coverage][0] rescue nil)],
 227        ['PERCENT_TARGET_COVERAGE_50X',
 228         (100.0 * stats[:coverage][50] / stats[:coverage][0] rescue nil)],
 229        ['PERCENT_TARGET_COVERAGE_100X',
 230         (100.0 * stats[:coverage][100] / stats[:coverage][0] rescue nil)]]
 231
 232     stats
 233   end
 234 end