Updating Dockerfile for vcfutil to point to my gvcf repo with updated changes for...
author acoleman2000 <alexc401101@gmail.com>
Fri, 16 Jun 2023 16:58:06 +0000 (10:58 -0600)
committer Alex Coleman <alex.coleman@curii.com>
Thu, 20 Jul 2023 17:59:23 +0000 (11:59 -0600)
Updating workflow to accept EITHER a list of VCF files OR a directory of VCF files as input. Adding more conditional logic to the workflows.
Arvados-DCO-1.1-Signed-off-by: Alex Coleman <alex.coleman@curii.com>.

cwl/gvcf2fasta/maingvcf2fasta.cwl
cwl/gvcf2fasta/subworkflows/scatter/gvcf2fasta/gvcf2fasta_splitvcf-imputation-wf.cwl
cwl/gvcf2fasta/subworkflows/scatter/helpers/bcftools-consensus.cwl
cwl/gvcf2fasta/subworkflows/scatter/helpers/concat-get_bed_varonlyvcf.cwl
cwl/gvcf2fasta/subworkflows/scatter/helpers/get_sample_ids.cwl [new file with mode: 0644]
cwl/gvcf2fasta/subworkflows/scatter/helpers/get_vcfs.cwl [new file with mode: 0644]
cwl/gvcf2fasta/subworkflows/scatter/helpers/getfiles.cwl
cwl/gvcf2fasta/subworkflows/scatter/scatter-gvcf2fasta-wf.cwl
docker/vcfutil/Dockerfile

index 40a7a89602db53251d7e9488b75b9208e675cedf..f8b490ad19cbc47e858b7b5a0fb502fea40ed98b 100644 (file)
@@ -12,6 +12,7 @@ requirements:
   ScatterFeatureRequirement: {}
   MultipleInputFeatureRequirement: {}
   InlineJavascriptRequirement: {}
+  StepInputExpressionRequirement: {}
 hints:
   DockerRequirement:
     dockerPull: vcfutil
@@ -22,32 +23,39 @@ inputs:
   splitvcfdirs:
     type: Directory[]?
     label: Input directory of split gVCFs
+    default: null
   vcfsdir:
     type: Directory?
     label: Input directory of VCFs
+    default: null
   vcfs:
     type: File[]?
     label: Input VCFs in array of files 
-  vcftars:
-    type: File[]?
-    label: Input VCF tars
+    default: null
   genomebed:
     type: File?
     label: Whole genome BED
+    default: null
   ref:
     type: File?
     label: Reference FASTA
+    default: null
   gqcutoff:
     type: int?
     label: GQ (Genotype Quality) cutoff for filtering
+    default: null
   sampleids:
     type: string[]?
     label: Sample IDs
+    default: null
   chrs: string[]?
   refsdir: Directory?
   mapsdir: Directory?
   panelnocallbed: File?
   panelcallbed: File?
+  nonref: boolean?
+  split: boolean?
+  tar: boolean?
 
 
 outputs:
@@ -67,26 +75,53 @@ outputs:
     pickValue: first_non_null
 
 steps: 
+  getfiles:
+    run: subworkflows/scatter/helpers/getfiles.cwl
+    when: $(inputs.dir !== null)
+    in:
+      dir: vcfsdir
+    out: [vcfs]
+
+  vcf_throttle:
+    in:
+      vcf_files: 
+        source: vcfs
+        default: null
+      transformed_vcfs: 
+        source: getfiles/vcfs
+        default: null
+    run: subworkflows/scatter/helpers/get_vcfs.cwl
+    out: [vcfs]
+
+  get_sample_ids:
+    run: subworkflows/scatter/helpers/get_sample_ids.cwl
+    when: $(inputs.sampleids === null)
+    in:
+      vcfs: vcf_throttle/vcfs
+      sampleids: sampleids
+    out: [sampleids]
+
   gvcf2fasta_nonrefvcf-wf:
     run:  subworkflows/scatter/gvcf2fasta/gvcf2fasta_nonrefvcf-wf.cwl
-    when: $(inputs.sampleid && inputs.vcf)
+    when: $(inputs.vcf !== null && inputs.genomebed !== null && inputs.ref !== null && inputs.gqcutoff !== null && inputs.nonref === true)
     scatter: [sampleid, vcf]
     scatterMethod: dotproduct
     in:
       sampleid: 
-        source: sampleids
+        source: get_sample_ids/sampleids
         default: []
       vcf: 
-        source: vcfs
+        source: vcf_throttle/vcfs
         default: []
       gqcutoff: gqcutoff
       genomebed: genomebed
       ref: ref
+      nonref: nonref
     out: [fas]
 
   gvcf2fasta_splitvcf-imputation-wf:
     run: subworkflows/scatter/gvcf2fasta/gvcf2fasta_splitvcf-imputation-wf.cwl
-    when: $(inputs.sampleids !== null  && inputs.splitvcfdirs  !== null && inputs.chrs !== null && inputs.refsdir !== null && inputs.mapsdir !== null && inputs.panelcallbed  !== null && inputs.panelnocallbed !== null)
+    when: $(inputs.splitvcfdir !== null && inputs.chrs !== null && inputs.refsdir !== null && inputs.mapsdir !== null && inputs.panelcallbed !== null && inputs.panelnocallbed !== null)
     scatter: [sampleid, splitvcfdir]
     scatterMethod: dotproduct
     in:
@@ -108,12 +143,12 @@ steps:
 
   gvcf2fasta_splitvcf-wf:
     run: subworkflows/scatter/gvcf2fasta/gvcf2fasta_splitvcf-wf.cwl
-    when: $(inputs.sampleid !== null && inputs.splitvcfdir !== null && inputs.chrs == null)
+    when: $(inputs.split && inputs.chrs === null)
     scatter: [sampleid, splitvcfdir]
     scatterMethod: dotproduct
     in:
       sampleid: 
-        source: sampleids
+        source: get_sample_ids/sampleids
         default: []
       splitvcfdir: 
         source: splitvcfdirs
@@ -121,43 +156,45 @@ steps:
       gqcutoff: gqcutoff
       genomebed: genomebed
       ref: ref
+      split: split
+      chrs: chrs
     out: [fas]
+
   gvcf2fasta_splitvcftar-wf:
     run: subworkflows/scatter/gvcf2fasta/gvcf2fasta_splitvcftar-wf.cwl
-    when: $(inputs.sampleids !== null  && inputs.vcftars !== null )
+    when: $(inputs.tar === true && inputs.split === true)
     scatter: [sampleid, vcftar]
     scatterMethod: dotproduct
     in:
       sampleid: 
-        source: sampleids
+        source: get_sample_ids/sampleids
         default: []
       vcftar: 
-        source: vcftars
+        source: vcf_throttle/vcfs
         default: []
       gqcutoff: gqcutoff
       genomebed: genomebed
       ref: ref
+      tar: tar
+      split: split
     out: [fas]
-  getfiles:
-    run: subworkflows/scatter/helpers/getfiles.cwl
-    when: $(inputs.vcfsdir !== null) #  && inputs.sampleid === null && inputs.sampleids === null
-    in:
-      dir: vcfsdir
-    out: [vcfs, samples]
   gvcf2fasta-wf:
     run: subworkflows/scatter/gvcf2fasta/gvcf2fasta-wf.cwl
     scatter: [sampleid, vcf]
-    when: $(inputs.vcfsdir !== null)
+    when: $(inputs.tar !== true && inputs.split !== true && inputs.nonref !== true)
     scatterMethod: dotproduct
     in:
       sampleid: 
-        source: getfiles/samples
+        source: get_sample_ids/sampleids
         default: []
       vcf: 
-        source: getfiles/vcfs
+        source: vcf_throttle/vcfs
         default: []
       genomebed: genomebed
       ref: ref
       gqcutoff: gqcutoff
+      tar: tar
+      split: split
+      nonref: nonref
     out: [fas]
 
index 7dfa7c0593d77f2a9d54b8bc25b2b3b3fec76d7d..f9b1d5cc97dd6d98a1bacdf8b47a02d5a1b05b60 100644 (file)
@@ -13,9 +13,6 @@ requirements:
 hints:
   DockerRequirement:
     dockerPull: vcfutil
-  arv:UsePreemptible:
-    usePreemptible: true
-
 inputs:
   sampleid:
     type: string
index 63d20fbc3802f166ff33a888285bd237fa144fac..f4df93126ab9bcdc10e165cef7d19b3721c8a37f 100644 (file)
@@ -29,7 +29,7 @@ inputs:
     label: Script to run bcftools consensus
     default:
       class: File
-      location: src/bcftools-consensus.sh
+      location: ../../../src/bcftools-consensus.sh
 outputs:
   fas:
     type: File[]
index 1a4369d698beb2e9cb54e44d03d49ce43869a1db..a3fbca71d97c310b8d5a34e5620281f51f3c5daf 100644 (file)
@@ -31,7 +31,7 @@ inputs:
     label: Script to untar and concatenate vcf tar ball
     default:
       class: File
-      location: src/concat-get_bed_varonlyvcf.sh
+      location: ../../../src/concat-get_bed_varonlyvcf.sh
 outputs:
   nocallbed:
     type: File
diff --git a/cwl/gvcf2fasta/subworkflows/scatter/helpers/get_sample_ids.cwl b/cwl/gvcf2fasta/subworkflows/scatter/helpers/get_sample_ids.cwl
new file mode 100644 (file)
index 0000000..aea9e9f
--- /dev/null
@@ -0,0 +1,36 @@
+# Copyright (C) The Lightning Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+cwlVersion: v1.2
+class: ExpressionTool
+label: Create list of VCFs and sample names
+hints:
+  LoadListingRequirement:
+    loadListing: shallow_listing
+inputs:
+  vcfs:
+    type: File[]
+    label: Input VCFs
+outputs:
+  sampleids:
+    type: string[]
+    label: Sample names of VCFs
+requirements:
+  InlineJavascriptRequirement: {}
+expression: |
+  ${
+    var samples = [];
+    for (var i = 0; i < inputs.vcfs.length; i++) {
+      var file = inputs.vcfs[i];
+      if (file.nameext == ".vcf") {
+        var sample = file.basename.split(".").slice(0, -1).join(".");
+        samples.push(sample);
+      }
+      if (file.nameext == ".gz") {
+        var sample = file.basename.split(".").slice(0, -2).join(".");
+        samples.push(sample);
+      }
+    }
+    return {"sampleids": samples};
+  }
\ No newline at end of file
diff --git a/cwl/gvcf2fasta/subworkflows/scatter/helpers/get_vcfs.cwl b/cwl/gvcf2fasta/subworkflows/scatter/helpers/get_vcfs.cwl
new file mode 100644 (file)
index 0000000..5ee4ee9
--- /dev/null
@@ -0,0 +1,19 @@
+# Copyright (C) The Lightning Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+class: Workflow
+cwlVersion: v1.2
+
+inputs:
+    vcf_files: 
+        type: File[]?
+    transformed_vcfs: 
+        type: File[]?
+steps: []
+outputs:
+    vcfs: 
+        type: File[]
+        outputSource:
+            - vcf_files
+            - transformed_vcfs
+        pickValue: first_non_null
\ No newline at end of file
index 3680713bcfee24f14ff11e7fba2bd0138cb3d506..8e76046982146ae825ca5d1529436385216e4747 100644 (file)
@@ -16,22 +16,16 @@ outputs:
   vcfs:
     type: File[]
     label: Output VCFs
-  samples:
-    type: string[]
-    label: Sample names of VCFs
 requirements:
   InlineJavascriptRequirement: {}
 expression: |
   ${
     var vcfs = [];
-    var samples = [];
     for (var i = 0; i < inputs.dir.listing.length; i++) {
       var file = inputs.dir.listing[i];
       if (file.nameext == ".gz") {
         vcfs.push(file);
-        var sample = file.basename.split(".").slice(0, -2).join(".");
-        samples.push(sample);
       }
     }
-    return {"vcfs": vcfs, "samples": samples};
+    return {"vcfs": vcfs};
   }
index a4fb5589a81ed448a869822d43189760b1b8010f..19490d807181272d412dca41bb0be1e27b51d0f6 100644 (file)
@@ -17,15 +17,21 @@ hints:
     outputTTL: 604800
 
 inputs:
-  vcfsdir:
-    type: Directory
-    label: Input directory of VCFs
+  vcfs:
+    type: File[]
+    label: Input files of VCFs
+  sampleids:
+    type: string[]
+    label: Sample IDs
   genomebed:
     type: File
     label: Whole genome BED
   ref:
     type: File
     label: Reference FASTA
+  gqcutoff:
+    type: int
+    label: GQ (Genotype Quality) cutoff for filtering
 
 outputs:
   fas:
@@ -38,18 +44,14 @@ outputs:
     outputSource: gvcf2fasta-wf/fas
 
 steps:
-  getfiles:
-    run: helpers/getfiles.cwl
-    in:
-      dir: vcfsdir
-    out: [vcfs, samples]
   gvcf2fasta-wf:
     run: gvcf2fasta/gvcf2fasta-wf.cwl
     scatter: [sampleid, vcf]
     scatterMethod: dotproduct
     in:
-      sampleid: getfiles/samples
-      vcf: getfiles/vcfs
+      sampleid: sampleids
+      vcf: vcfs
       genomebed: genomebed
       ref: ref
+      gqcutoff: gqcutoff
     out: [fas]
index d3427b74ecd15f65bbada347b6491580f94c7684..d209e81291e88de3117e5a93d4ca786dc54ac016 100644 (file)
@@ -2,12 +2,11 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-FROM arvados/jobs
-MAINTAINER Jiayong Li <jli@curii.com>
+FROM python:3.9-buster
 
 USER root
 
-RUN apt-get update -q
+RUN apt-get update 
 
 RUN apt-get install -qy build-essential wget cmake zlib1g-dev \
     libbz2-dev liblzma-dev libncurses5-dev libncursesw5-dev git vcftools
@@ -63,4 +62,4 @@ WORKDIR /
 
 # Installing gvcf_regions
 
-RUN git clone https://github.com/lijiayong/gvcf_regions
+RUN git clone https://github.com/acoleman2000/gvcf_regions