Adding modified code to select variants and gather arrays
author Sarah Wait Zaranek <swz@curii.com>
Thu, 11 Jun 2020 19:58:06 +0000 (19:58 +0000)
committer Ward Vandewege <ward@jhvc.com>
Thu, 18 Jun 2020 15:16:40 +0000 (11:16 -0400)
Arvados-DCO-1.1-Signed-off-by: Sarah Wait Zaranek <swz@curii.com>
no issue #

cwl/helper/bwa-gatk-report-wf.cwl [new file with mode: 0644]
cwl/helper/gather-array-vcf.cwl [new file with mode: 0644]
cwl/helper/gather-vcf-try2.cwl
cwl/helper/gather-vcf.cwl
cwl/helper/gatk-haplotypecaller-with-interval.cwl
cwl/helper/gatk-selectvariants.cwl
cwl/helper/gatk-wf-with-interval.cwl
cwl/helper/getgvcfs.cwl [new file with mode: 0644]
cwl/helper/samtools-index.cwl
cwl/helper/samtools-sort.cwl
cwl/helper/scatter-gatk-wf-with-interval.cwl

diff --git a/cwl/helper/bwa-gatk-report-wf.cwl b/cwl/helper/bwa-gatk-report-wf.cwl
new file mode 100644 (file)
index 0000000..0a4b9d2
--- /dev/null
@@ -0,0 +1,108 @@
+# Paired-end FASTQ -> gVCF -> report pipeline.
+# Step order: fastqc, bwamem-samtools-view, samtools-sort, mark-duplicates,
+# samtools-index, haplotypecaller, generate-report.
+class: Workflow
+cwlVersion: v1.1
+
+requirements:
+  # Required because steps such as haplotypecaller run a nested workflow.
+  SubworkflowFeatureRequirement: {}
+
+inputs:
+  fastq1:
+    type: File
+  fastq2:
+    type: File
+  # Reference file; the companion files listed here must sit next to it.
+  reference:
+    type: File
+    secondaryFiles: [".amb", ".ann", ".bwt", ".pac", ".sa", ".fai", "^.dict"]
+  sample:
+    type: string
+  knownsites:
+    type: File
+    secondaryFiles: [".tbi"]
+  # Forwarded unchanged to the haplotypecaller step.
+  scattercount:
+    type: string
+  # The four files below are forwarded unchanged to the generate-report step.
+  clinvarvcf:
+    type: File
+  reportfunc:
+    type: File
+  headhtml:
+    type: File
+  tailhtml:
+    type: File
+
+outputs:
+  qc-html:
+    outputSource: fastqc/out-html
+    type: File[]
+  qc-zip:
+    outputSource: fastqc/out-zip
+    type: File[]
+  gvcf:
+    outputSource: haplotypecaller/gatheredgvcf
+    type: File
+  report:
+    outputSource: generate-report/report
+    type: File
+
+steps:
+  fastqc:
+    run: fastqc.cwl
+    in: {fastq1: fastq1, fastq2: fastq2}
+    out:
+      - out-html
+      - out-zip
+
+  bwamem-samtools-view:
+    run: bwamem-samtools-view.cwl
+    in: {fastq1: fastq1, fastq2: fastq2, reference: reference, sample: sample}
+    out:
+      - bam
+
+  samtools-sort:
+    run: samtools-sort.cwl
+    in: {bam: bwamem-samtools-view/bam, sample: sample}
+    out:
+      - sortedbam
+
+  mark-duplicates:
+    run: mark-duplicates.cwl
+    in: {bam: samtools-sort/sortedbam}
+    out:
+      - dupbam
+      - dupmetrics
+
+  samtools-index:
+    run: samtools-index.cwl
+    in: {bam: mark-duplicates/dupbam}
+    out:
+      - indexedbam
+
+  haplotypecaller:
+    run: scatter-gatk-wf-with-interval.cwl
+    in:
+      bam: samtools-index/indexedbam
+      reference: reference
+      sample: sample
+      scattercount: scattercount
+      # The sub-workflow names this input knownsites1.
+      knownsites1: knownsites
+    out:
+      - gatheredgvcf
+
+  generate-report:
+    run: report-wf.cwl
+    in:
+      gvcf: haplotypecaller/gatheredgvcf
+      # report-wf.cwl names this input samplename.
+      samplename: sample
+      clinvarvcf: clinvarvcf
+      reportfunc: reportfunc
+      headhtml: headhtml
+      tailhtml: tailhtml
+    out:
+      - report
diff --git a/cwl/helper/gather-array-vcf.cwl b/cwl/helper/gather-array-vcf.cwl
new file mode 100644 (file)
index 0000000..d566841
--- /dev/null
@@ -0,0 +1,70 @@
+# CommandLineTool: run GATK MergeVcfs over the .gz files of a File[] input,
+# ordered by basename, writing <sample>.g.vcf.gz.
+class: CommandLineTool
+cwlVersion: v1.1
+label: Gathering vcf using Picard
+$namespaces:
+  arv: "http://arvados.org/cwl#"
+  cwltool: "http://commonwl.org/cwltool#"
+
+requirements:
+  InlineJavascriptRequirement: {}
+  ShellCommandRequirement: {}
+  DockerRequirement:
+    dockerPull: broadinstitute/gatk:4.1.7.0
+
+hints:
+  arv:RuntimeConstraints:
+    outputDirType: keep_output_dir
+  ResourceRequirement:
+    coresMin: 4
+    ramMin: 20000
+
+inputs:
+  # Files to merge; only entries whose nameext is .gz reach MergeVcfs.
+  gvcfarray: File[]
+  # Used only to name the output: <sample>.g.vcf.gz
+  sample:
+    type: string
+  # Not referenced by the command line below.
+  reference:
+    type: File
+    secondaryFiles: [".amb", ".ann", ".bwt", ".pac", ".sa", ".fai", "^.dict"]
+
+outputs:
+  gatheredgvcf:
+    type: File
+    secondaryFiles: [".tbi"]
+    outputBinding:
+      glob: "*.g.vcf.gz"
+
+baseCommand: /gatk/gatk
+
+arguments:
+  - '--java-options'
+  - '-Xmx8G'
+  - MergeVcfs
+  # Emits "-I <path> " once per .gz file, in basename order. shellQuote is
+  # off so the shell splits the single string into separate arguments.
+  - shellQuote: false
+    valueFrom: |
+      ${
+        // Order File objects by their basename.
+        var byBasename = function (left, right) {
+          if (left.basename > right.basename) { return 1; }
+          if (left.basename < right.basename) { return -1; }
+          return 0;
+        };
+
+        // Sort in place first, then keep only the .gz entries.
+        var ordered = inputs.gvcfarray.sort(byBasename);
+        var flags = "";
+        ordered.forEach(function (entry) {
+          if (entry.nameext === '.gz') {
+            flags += "-I " + entry.path + " ";
+          }
+        });
+        return flags;
+      }
+  - prefix: "-O"
+    valueFrom: $(inputs.sample).g.vcf.gz
index c573ffb95ed97e5450a3f78bfcbec0c4e42ce5ca..40a5c9dadd98e50ea9fd1c0a67402bf342170658 100644 (file)
@@ -19,7 +19,10 @@ hints:
     outputDirType: keep_output_dir
 
 inputs:
-  gvcfarray: File[]
+  gvcfdir: 
+    type: Directory
+    label: Input directory of gvcfs
+    loadListing: 'shallow_listing'
   sample: string
   reference:
     type: File
@@ -45,12 +48,22 @@ arguments:
   - GatherVcfs
   - shellQuote: false
     valueFrom: >
-      ${
-        var cmd "";
-        for( var i = 0; i < inputs.gvcfarray.length; i++){
-           cmd += "\s echo " + "-I" + "\s" + inputs.gvcfsarray[i]
-        }
-        return cmd;
-       } 
+      ${
+        /* Emit one -I flag per .gz entry of gvcfdir, sorted by basename. */
+        var samples = [];
+        for (var i = 0; i < inputs.gvcfdir.listing.length; i++) {
+          var name = inputs.gvcfdir.listing[i];
+          if (name.nameext ==='.gz' ) {
+            samples.push(name.basename);
+          }
+        }
+        samples = samples.sort();
+        /* Start from an empty string so an empty listing still yields a string. */
+        var sampleinput = "";
+        for (var j = 0; j < samples.length; j++) {
+          sampleinput = sampleinput + "-I " + samples[j] + " ";
+        }
+        return sampleinput;
+      }
   - prefix: "-O"
     valueFrom: $(inputs.sample).g.vcf.gz
index 8e5b29a8c6063496f152e5533b49721f449f9a26..cd1532c7deb91bcc6d6b5e7fd69a9f512f64484c 100644 (file)
@@ -9,10 +9,7 @@ requirements:
   DockerRequirement:
     dockerPull: broadinstitute/gatk:4.1.7.0
   ShellCommandRequirement: {}
-  InitialWorkDirRequirement:
-    listing:
-      - $(inputs.gvcf1)
-      - $(inputs.gvcf2)
+  InlineJavascriptRequirement: {}
 
 hints:
   ResourceRequirement:
@@ -22,10 +19,10 @@ hints:
     outputDirType: keep_output_dir
 
 inputs:
-  gvcf1:
-    type: File
-  gvcf2:
-    type: File
+  gvcfdir: 
+    type: Directory
+    label: Input directory of gvcfs
+    loadListing: 'shallow_listing'
   sample: string
   reference:
     type: File
@@ -40,6 +37,8 @@ inputs:
 outputs:
   gatheredgvcf:
     type: File
+    secondaryFiles:
+      - .tbi
     outputBinding:
       glob: "*.g.vcf.gz"
 
@@ -48,10 +47,38 @@ baseCommand: /gatk/gatk
 arguments:
   - "--java-options"
   - "-Xmx8G" 
-  - GatherVcfs
-  - "-I"
-  - $(inputs.gvcf1.basename) 
-  - "-I"
-  - $(inputs.gvcf2.basename)
+  - MergeVcfs
+  # Emits "-I <path> " for every .gz file in gvcfdir, ordered by basename.
+  - shellQuote: false
+    valueFrom: |
+      ${
+        // Comparator over File objects, keyed on basename.
+        function compare(a, b) {
+          var baseA = a.basename;
+          var baseB = b.basename;
+          var comparison = 0;
+          if (baseA > baseB) {
+            comparison = 1;
+          } else if (baseA < baseB) {
+            comparison = -1;
+          }
+          return comparison;
+        }
+        // Collect File objects, not path strings: compare reads .basename,
+        // which is undefined on a string and would leave the list unsorted.
+        var samples = [];
+        for (var i = 0; i < inputs.gvcfdir.listing.length; i++) {
+          var entry = inputs.gvcfdir.listing[i];
+          if (entry.nameext === '.gz') {
+            samples.push(entry);
+          }
+        }
+        samples.sort(compare);
+        var sampleinput = "";
+        for (var j = 0; j < samples.length; j++) {
+          sampleinput = sampleinput + "-I " + samples[j].path + " ";
+        }
+        return sampleinput;
+      }
   - prefix: "-O"
     valueFrom: $(inputs.sample).g.vcf.gz
index 3e0b53d3e53a2a22fb0f7e616b0a78c7316a2aca..88c78e04654581896209c162a7c0dc7101519887 100644 (file)
@@ -40,6 +40,8 @@ inputs:
 outputs:
   gvcf:
     type: File
+    secondaryFiles:
+      - .tbi
     outputBinding:
       glob: "*vcf.gz"
 
index 2a65e5340e933f908c951bdf29672da6b6bd4f54..66b5d5c0bb552cb89237881dbd54a41dff198e76 100644 (file)
@@ -35,10 +35,10 @@ inputs:
   sample: string
 
 outputs:
-  genotypegvcf:
+  filteredgvcf:
     type: File
     outputBinding:
-      glob: "*selected.g.vcf.gz"
+      glob: "*g.vcf.gz"
 
 baseCommand: /gatk/gatk
 
@@ -51,6 +51,6 @@ arguments:
   - prefix: "--remove-unused-alternates"
     valueFrom: "true"
   - prefix: "-V"
-    valueFrom: $(inputs.gvcf)
+    valueFrom: $(inputs.gvcf.path)
   - prefix: "-O"
-    valueFrom: $(inputs.sample)selected.g.vcf.gz
+    valueFrom: selected$(inputs.gvcf.basename)
index 037accdc06db269ae3bc885d2897f58a42671d6b..4e9f456b24fec167cc3098a71560395254dd74da 100644 (file)
@@ -30,7 +30,7 @@ inputs:
 outputs:
   gvcf:
     type: File
-    outputSource: haplotypecaller/gvcf
+    outputSource: selectvariants/filteredgvcf
 
 steps:
   basecalibrator:
@@ -59,3 +59,10 @@ steps:
       sample: sample
       intervallist: intervallist
     out: [gvcf]
+  selectvariants:
+    run: gatk-selectvariants.cwl
+    in: 
+      gvcf: haplotypecaller/gvcf
+      reference: reference
+      sample: sample
+    out: [filteredgvcf]
diff --git a/cwl/helper/getgvcfs.cwl b/cwl/helper/getgvcfs.cwl
new file mode 100644 (file)
index 0000000..e3a7022
--- /dev/null
@@ -0,0 +1,40 @@
+# ExpressionTool: turn the .gz entries of a directory into one string of
+# "-I <basename> " flags, sorted by basename.
+$namespaces:
+  arv: "http://arvados.org/cwl#"
+  cwltool: "http://commonwl.org/cwltool#"
+class: ExpressionTool
+cwlVersion: v1.1
+label: Create array of gvcfs to process
+requirements:
+  InlineJavascriptRequirement: {}
+inputs:
+  gvcfdir:
+    type: Directory
+    label: Input directory of gvcfs
+    # shallow_listing populates gvcfdir.listing with the top-level entries.
+    loadListing: 'shallow_listing'
+outputs:
+  sampleinput:
+    type: string
+expression: |
+  ${
+    // Keep the basenames of entries whose extension is .gz.
+    var samples = [];
+    for (var i = 0; i < inputs.gvcfdir.listing.length; i++) {
+      var name = inputs.gvcfdir.listing[i];
+      if (name.nameext === '.gz') {
+        samples.push(name.basename);
+      }
+    }
+    samples = samples.sort();
+
+    // Start from a string, not an array: the output is declared as string,
+    // and an empty listing must not return an array.
+    var sampleinput = "";
+    for (var j = 0; j < samples.length; j++) {
+      sampleinput = sampleinput + "-I " + samples[j] + " ";
+    }
+
+    return {"sampleinput": sampleinput};
+  }
index 605390a9c4032972439f287e53346521b57959fe..d479fbd5d5a45f857103729c8e9942469b2b46a8 100644 (file)
@@ -21,7 +21,7 @@ inputs:
   bam: File
 
 outputs:
-  bam:
+  indexedbam:
     type: File
     outputBinding:
       glob: "*bam"
index b583f66be91d2897efb58f3da911793e9cd3c4c7..02072cdfbf863e0e6f982cd7cb21bb33603a21e2 100644 (file)
@@ -10,9 +10,6 @@ requirements:
   DockerRequirement:
     dockerPull: curii/bwa-samtools-picard
   ShellCommandRequirement: {}
-  InitialWorkDirRequirement:
-    listing:
-      - $(inputs.bam)
   ResourceRequirement:
     ramMin: 20000
     coresMin: 4
@@ -27,7 +24,7 @@ inputs:
   sample: string
 
 outputs:
-  bam:
+  sortedbam:
     type: File
     outputBinding:
       glob: "*sorted.bam"
@@ -38,7 +35,7 @@ arguments:
   - sort
   - -@
   - $(runtime.cores)
-  - $(inputs.bam.basename)
+  - $(inputs.bam.path)
   - -m
   - '2G'
   - -o
index 80d0456055a17e5b3d274df9906968cf5e32248c..6e9f415f691c54fc41b5d497f28d9abb30f74855 100644 (file)
@@ -31,10 +31,12 @@ inputs:
   scattercount: string
 
 outputs:
-  gvcf:
-    type: File[]
-    outputSource: recal-haplotypecaller/gvcf
-
+  gatheredgvcf:
+    type: File
+    secondaryFiles: 
+      - .tbi
+    outputSource: merge-GVCFs/gatheredgvcf
+    
 steps:
   splitintervals:
     run: gatk-splitintervals.cwl
@@ -54,3 +56,11 @@ steps:
       knownsites1: knownsites1
       intervallist: splitintervals/intervalfiles
     out: [gvcf]
+
+  merge-GVCFs:
+    run: gather-array-vcf.cwl
+    in:
+      gvcfarray: recal-haplotypecaller/gvcf
+      sample: sample
+      reference: reference
+    out: [gatheredgvcf]