Add splitDir to take Directory input
[arvados-tutorial.git] / WGS-processing / src / annotation / generatereport.py
1 import numpy as np
2 import scipy as scipy
3 import pandas as pd
4 import io
5 import argparse
6
def tablegeneration(reportdata,sectionlabel):
    """Render one report section: an ``<h2>`` heading followed by an HTML table.

    Args:
        reportdata: DataFrame holding at least the columns "Variant ID",
            "Allele ID", "Clinical Significance", "Disease Name",
            "Frequency EXAC", "Frequency 1000 Genomes Project", "Zygosity"
            and "URL". Only these columns are rendered. The frame itself is
            not modified.
        sectionlabel: text placed inside the ``<h2>`` heading.

    Returns:
        The heading plus table markup as a native ``str`` (UTF-8 encoded
        bytes on Python 2, text on Python 3).
    """
    labelhtml = '<h2>' + sectionlabel + '</h2>'

    columns = [
        "Variant ID",
        "Allele ID",
        "Clinical Significance",
        "Disease Name",
        "Frequency EXAC",
        "Frequency 1000 Genomes Project",
        "Zygosity",
        "URL",
    ]
    # Work on an independent copy: callers pass a filtered slice, and assigning
    # a column on a slice triggers pandas' chained-assignment warning and may
    # or may not write through to the caller's frame.
    reportdatasub = reportdata[columns].copy()

    # '|' separates several disease names in one cell; show one per line.
    # regex=False because '|' is a regex metacharacter and must be taken literally.
    reportdatasub['Disease Name'] = reportdatasub['Disease Name'].str.replace('|', '<br/>', regex=False)

    html_str = reportdatasub.to_html(classes='table table-bordered', index_names=False, index=False)
    if not isinstance(html_str, str):
        # Python 2: to_html yields unicode; keep returning UTF-8 encoded bytes there.
        html_str = html_str.encode('utf8')

    # to_html escapes the markup stored in the cells (the <br/> inserted above
    # and the anchor tags in "URL"); restore the angle brackets so it renders.
    html_str = html_str.replace('&lt;', '<')
    html_str = html_str.replace('&gt;', '>')
    # Applied to the whole table markup, so every underscore becomes a space
    # (e.g. "Likely_pathogenic" is displayed as "Likely pathogenic").
    html_str = html_str.replace('_', ' ')
    return labelhtml + html_str
22
def generatereport():
    """Build an HTML ClinVar report for one sample from a tab-separated file.

    Command-line arguments (parsed with argparse):
        txtfilename: tab-separated text file, one variant per row. Its header
            row is discarded and the columns are renamed positionally.
        samplename: sample name; inserted into the report title and used as
            the output file name (``<samplename>.html``).
        headfile: HTML placed before the tables.
        tailfile: HTML placed after the tables.

    Side effects:
        Writes ``test.json`` (records-oriented dump of the annotated table)
        and ``<samplename>.html`` to the current working directory, and
        raises pandas' ``display.max_colwidth`` option for the process.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('txtfilename', metavar='VCF2TXTFILENAME', help='text file of info to annotate')
    parser.add_argument('samplename', metavar='SAMPLENAME', help='name of sample to use on report')
    parser.add_argument('headfile', metavar='REPORTHEADHTML', help='head html for report')
    parser.add_argument('tailfile', metavar='REPORTTAILHTML', help='tail html for report')
    args = parser.parse_args()

    # NOTE(review): presumably here so that long cells (the link markup) are
    # not truncated when the tables are rendered -- confirm before removing.
    pd.set_option("display.max_colwidth", 10000)

    samplename = args.samplename

    # reading data into dataframe
    headerlist = ["Variant ID", "Chromosome", "Position", "Ref","Alt","Allele ID", "Clinical Significance","Disease Name","Frequency GO-ESP", "Frequency EXAC", "Frequency 1000 Genomes Project","GT"]
    reportdata = pd.read_csv(args.txtfilename, header=0, names=headerlist, sep='\t')

    # defining zygosity (shown as-is from the GT column)
    reportdata['Zygosity'] = reportdata.GT

    # creating url from variant ID
    clinvarURL = "https://www.ncbi.nlm.nih.gov/clinvar/variation/"
    reportdata['URL'] = '<a href=' + clinvarURL + reportdata['Variant ID'].apply(str) + '> Link to ClinVar</a>'
    # NOTE(review): looks like a debugging artifact; kept because something
    # downstream may collect this file -- confirm before removing.
    reportdata.to_json('test.json', orient='records')

    # Missing significance values would put NaN into the masks below, which
    # breaks both boolean indexing and the "~" used for the Other section;
    # treat them as empty text so such rows land in "Other".
    significance = reportdata['Clinical Significance'].fillna('').astype(str)

    # (section heading, case-sensitive substring looked up in the significance).
    # A row may match several substrings and then appears in several sections.
    sections_before_other = [
        ('Pathogenic', 'Pathogenic'),
        ('Likely Pathogenic', 'Likely_pathogenic'),
        ('Drug Response', 'drug_response'),
        ('Protective', 'protective'),
        ('Risk Factor', 'risk_factor'),
        ('Affects', 'Affects'),
        ('Association', 'association'),
    ]
    sections_after_other = [
        ('Benign', 'Benign'),
        ('Likely Benign', 'Likely_benign'),
    ]

    masks = [
        (label, significance.str.contains(keyword, regex=False))
        for label, keyword in sections_before_other + sections_after_other
    ]

    # "Other" collects every row that matched none of the substrings above.
    matched_any = masks[0][1]
    for _, mask in masks[1:]:
        matched_any = matched_any | mask
    other_mask = ~matched_any

    split = len(sections_before_other)
    ordered_sections = masks[:split] + [('Other', other_mask)] + masks[split:]

    with open(args.headfile, 'r') as html_file:
        source_code_head = html_file.read()
    source_code_head = source_code_head.replace('ClinVar Report', 'ClinVar Report For ' + samplename)

    with open(args.tailfile, 'r') as html_file:
        source_code_tail = html_file.read()

    # combine html tables with head and tail html for total report
    tables_html = ''.join(
        tablegeneration(reportdata[mask], label) for label, mask in ordered_sections
    )
    total_html = source_code_head + tables_html + source_code_tail

    # The output file is opened in binary mode, so text must be encoded first;
    # on Python 2 the native str is already bytes and is written unchanged.
    if not isinstance(total_html, bytes):
        total_html = total_html.encode('utf-8')

    # write out report html
    with open(samplename + '.html', 'wb') as report_file:
        report_file.write(total_html)
96
# Script entry point: build the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    generatereport()