From 28a7def16942dc2381429a88b16b3a2c2f589443 Mon Sep 17 00:00:00 2001
From: Alex Coleman <alex.coleman@curii.com>
Date: Fri, 4 Aug 2023 10:32:45 -0600
Subject: [PATCH] 20765: Adding gvcf regions subdir.

Additionally updating Dockerfiles that previously cloned repo.

Arvados-DCO-1.1-Signed-off-by: Alex Coleman <alex.coleman@curii.com>
---
 docker/cgivar2vcfbed/Dockerfile |  11 +-
 docker/vcfutil/Dockerfile       |  16 +-
 gvcf_regions/README.md          |  46 +++++
 gvcf_regions/gvcf_regions.py    | 301 ++++++++++++++++++++++++++++++++
 4 files changed, 362 insertions(+), 12 deletions(-)
 create mode 100644 gvcf_regions/README.md
 create mode 100755 gvcf_regions/gvcf_regions.py

diff --git a/docker/cgivar2vcfbed/Dockerfile b/docker/cgivar2vcfbed/Dockerfile
index eb8ee227ab..f6aa205c86 100644
--- a/docker/cgivar2vcfbed/Dockerfile
+++ b/docker/cgivar2vcfbed/Dockerfile
@@ -2,8 +2,10 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-FROM arvados/jobs
-MAINTAINER Jiayong Li <jli@curii.com>
+# build instruction:
+# docker build -t dockername --file=/path/to/lightning/docker/lightning/Dockerfile /path/to/lightning
+
+FROM python:3.11-buster
 
 USER root
 
@@ -11,6 +13,9 @@ RUN apt-get update -q
 
 RUN apt-get install -qy build-essential wget cmake zlib1g-dev git
 
+# Seting up gvcf_regions
+COPY ./gvcf_regions /gvcf_regions
+
 # Installing cgatools 1.8.0
 
 RUN wget https://sourceforge.net/projects/cgatools/files/1.8.0/cgatools-1.8.0.1-linux_binary-x86_64.tar.gz && \
@@ -35,6 +40,4 @@ RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.25.0/bedtools-2
 
 WORKDIR /
 
-# Installing gvcf_regions
 
-RUN git clone https://github.com/lijiayong/gvcf_regions
diff --git a/docker/vcfutil/Dockerfile b/docker/vcfutil/Dockerfile
index d3427b74ec..38cdedb809 100644
--- a/docker/vcfutil/Dockerfile
+++ b/docker/vcfutil/Dockerfile
@@ -2,8 +2,11 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
-FROM arvados/jobs
-MAINTAINER Jiayong Li <jli@curii.com>
+# build instruction:
+# docker build -t dockername --file=/path/to/lightning/docker/vcfutil/Dockerfile /path/to/lightning
+
+
+FROM python:3.11-buster
 
 USER root
 
@@ -12,6 +15,9 @@ RUN apt-get update -q
 RUN apt-get install -qy build-essential wget cmake zlib1g-dev \
     libbz2-dev liblzma-dev libncurses5-dev libncursesw5-dev git vcftools
 
+# Setting up gvcf_region
+COPY ./gvcf_regions /gvcf_regions
+
 # Getting HTSLIB 1.9 for tabix/bgzip
 
 RUN wget https://github.com/samtools/htslib/releases/download/1.9/htslib-1.9.tar.bz2 && tar -xjvf htslib-1.9.tar.bz2 && \
@@ -58,9 +64,3 @@ WORKDIR /
 RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.27.1/bedtools-2.27.1.tar.gz && \
     tar -xzvf bedtools-2.27.1.tar.gz && \
     cd bedtools2 && make && cp bin/* /usr/local/bin
-
-WORKDIR /
-
-# Installing gvcf_regions
-
-RUN git clone https://github.com/lijiayong/gvcf_regions
diff --git a/gvcf_regions/README.md b/gvcf_regions/README.md
new file mode 100644
index 0000000000..1a44f23c3a
--- /dev/null
+++ b/gvcf_regions/README.md
@@ -0,0 +1,46 @@
+[comment]: # Copyright (C) The Lightning Authors. All rights reserved.
+[comment]: # ()
+[comment]: # SPDX-License-Identifier: AGPL-3.0
+gVCF Regions
+========
+
+gVCF Regions is a command line tool that output the called regions of a gVCF file in BED format. 
+It handles four main types of gVCFs (Complete Genomics, Freebayes, GATK), with the capability 
+to customize the settings of 'called regions'.
+
+### gVCF Regions Command
+```
+gvcf_regions.py [-h] [--unreported_is_called]
+                       [--ignore_phrases IGNORE_PHRASES [IGNORE_PHRASES ...]]
+                       [--min_GQ MIN_GQ] [--min_QUAL MIN_QUAL]
+                       [--pass_phrases PASS_PHRASES [PASS_PHRASES ...]]
+                       [--gvcf_type {complete_genomics,freebayes,gatk}]
+                       GVCF
+
+Output the called regions of a gvcf file to stdout in bed format.
+
+positional arguments:
+  GVCF                  input gvcf file, accept gzipped and unzipped files, or
+                        "-" for stream
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --unreported_is_called
+                        use this flag to treat unreported sites as called
+  --ignore_phrases IGNORE_PHRASES [IGNORE_PHRASES ...]
+                        list of phrases considered as discarded, e.g., CNV,
+                        ME. A line that contains any of the ignore phrases is
+                        discarded.
+  --min_GQ MIN_GQ       minimum GQ (Genotype Quality) considered as called
+  --min_QUAL MIN_QUAL   minimum QUAL considered as called
+  --pass_phrases PASS_PHRASES [PASS_PHRASES ...]
+                        list of phrases considered as called, e.g., PASS,
+                        REFCALL. A line must contain any of the pass phrases
+                        to be considered as called.
+  --gvcf_type {complete_genomics,freebayes,gatk}
+                        type of gvcf output. [unreported_is_called,
+                        ignore_phrases, min_GQ, min_QUAL, pass_phrases]
+                        presets: complete_genomics: [True, ['CNV', 'INS:ME'],
+                        None, None, ['PASS']]. freebayes: [False, None, None,
+                        None, ['PASS']]. gatk: [False, None, 5, None, None].
+```
diff --git a/gvcf_regions/gvcf_regions.py b/gvcf_regions/gvcf_regions.py
new file mode 100755
index 0000000000..81ce9c6704
--- /dev/null
+++ b/gvcf_regions/gvcf_regions.py
@@ -0,0 +1,301 @@
+# Copyright (C) The Lightning Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+#!/usr/bin/env python
+
+import argparse
+import gzip
+import sys
+
+def is_header(line):
+    """Check if a line is header."""
+
+    return line.startswith('#')
+
+def has_END(line):
+    """Check if a line has the 'END=' tag."""
+
+    return 'END=' in line
+
+# FIELD index
+# CHROM 0, POS 1, REF 3, QUAL 5, INFO 7, FORMAT 8, sample 9
+
+def get_END(line):
+    """Extract the END position of a line from the 'END=' tag under INFO."""
+
+    INFO = line.strip().split('\t')[7]
+    INFO_fields = INFO.split(';')
+    for INFO_field in INFO_fields:
+        if INFO_field.split('=')[0] == 'END':
+            END = int(INFO_field.split('=')[1])
+            return END
+
+def get_bed_region(line):
+    """Extract the bed region of a line.
+
+    Return:
+        tuple: (line_start, line_end)"""
+
+    fields = line.strip().split('\t')
+    POS = int(fields[1])
+    REF = fields[3]
+
+    # vcf: 1 index, inclusive start, inclusive end
+    # bed: 0 index, inclusive start, exlusive end
+
+    # starting postion of the line in bed index
+    if POS == 0:
+        line_start = 0
+    else:
+        line_start = POS - 1
+    # ending position of the line in bed index
+    if has_END(line):
+        line_end = get_END(line)
+    else:
+        line_end = POS - 1 + len(REF)
+    return (line_start, line_end)
+
+def get_GQ(line):
+    """Extract the GQ (Genotype Quality) of a line."""
+
+    fields = line.strip().split('\t')
+    FORMAT, sample = fields[8], fields[9]
+    FORMAT_fields = FORMAT.split(':')
+    sample_fields = sample.split(':')
+
+    # No genotype quality present in gVCF. Happens in some lines of GATK gVCF output.
+    try:
+        GQ_index = FORMAT_fields.index('GQ')
+        GQ = int(sample_fields[GQ_index])
+    except ValueError:
+        GQ = 0
+    return GQ
+
+def get_GT(line):
+    """Extract the GT (Genotype) of a line."""
+
+    fields = line.strip().split('\t')
+    FORMAT, sample = fields[8], fields[9]
+    FORMAT_fields = FORMAT.split(':')
+    sample_fields = sample.split(':')
+
+    try:
+        GT_index = FORMAT_fields.index('GT')
+        GT = sample_fields[GT_index]
+    except ValueError:
+        GT = None
+    return GT
+
+
+def is_considered(line, ignore_phrases):
+    """Check if a line should be considered. A line that contains any of the
+    ignore phrases is discarded. E.g., to discard CNV and ME lines of Complete Genomics
+    gvcf, set ignore_phrases = ['CNV', 'INS:ME'].
+
+    Args:
+       ignore_phrases (None or non-empty list)"""
+
+    if ignore_phrases != None:
+        for ignore_phrase in ignore_phrases:
+            if ignore_phrase in line:
+                return False
+    return True
+
+def is_called(line, min_GQ, min_QUAL, pass_phrases):
+    """Check if a line is considered as 'called', i.e., if its GQ is at least
+    min_GQ, if its QUAL is at least min_QUAL, if it contains any of the
+    pass phrases, E.g., for freebayes gvcf, set pass_phrases = ['PASS'],
+    and if the GT field has no '.' (no call).
+
+    Args:
+        min_GQ (None or int)
+        min_QUAL (None or float)
+        pass_phrases (None or non-empty list)"""
+
+    fields = line.strip().split('\t')
+    GQ_passes = (min_GQ == None or get_GQ(line) >= min_GQ)
+    QUAL_passes = (min_QUAL == None or float(fields[5]) >= min_QUAL)
+
+    line_has_pass_phrases = True
+    if pass_phrases != None:
+        for pass_phrase in pass_phrases:
+            if pass_phrase in line:
+                line_has_pass_phrases = True
+                break
+            else:
+                line_has_pass_phrases = False
+
+    GT = get_GT(line)
+    if GT:
+        GT_without_nocall = not ('.' in GT)
+    else:
+        GT_without_nocall = True
+
+    return GQ_passes and QUAL_passes and line_has_pass_phrases and GT_without_nocall
+
+def gvcf_regions(gvcf, unreported_is_called, ignore_phrases,
+                        min_GQ, min_QUAL, pass_phrases):
+    """Generate the called regions of a gvcf as a bed file.
+
+    Args:
+        unreported_is_called (bool): whether an unreported site is considered
+            as called. E.g., in Complete Genomics gvcf, an unreported site
+            is hom-ref (called), whereas in freebayes/gatk gvcf, it is no-call.
+        ignore_phrases (None or non-empty list)
+        min_GQ (None or int)
+        min_QUAL (None or float)
+        pass_phrases (None or non-empty list)"""
+
+    if gvcf == "-":
+        g = sys.stdin
+    elif gvcf.endswith(".gz"):
+        g = gzip.open(gvcf)
+    else:
+        g = open(gvcf)
+
+    region_CHROM = ''
+    for line in g:
+        if isinstance(line, bytes):
+            line = line.decode("utf-8") 
+        # filter out header and lines with ignore phrases
+        if not is_header(line) and is_considered(line, ignore_phrases):
+            fields = line.strip().split('\t')
+            CHROM = fields[0]
+            is_line_called = is_called(line, min_GQ, min_QUAL, pass_phrases)
+            (line_start, line_end) = get_bed_region(line)
+            assert line_end is not None, line
+
+            # handle cases depending on if the previous block is called,
+            # if the current line is called, and if the previous block
+            # and the current line have a gap
+
+            # when chromosome changes
+            if region_CHROM != CHROM:
+                # when chromosome changes from one to another,
+                # write the previous region
+                if region_CHROM != '':
+                    # we assume a chromosome ends with a string of 'N's
+                    if is_previous_block_called:
+                        region_end = previous_block_end
+                        print('\t'.join([region_CHROM,
+                            str(region_start), str(region_end)]))
+                region_CHROM = CHROM
+                if is_line_called:
+                    region_start = line_start
+                    is_previous_block_called = True
+                else:
+                    # we assume a chromosome starts with a string of 'N's
+                    is_previous_block_called = False
+                # update previous_block_end
+                previous_block_end = line_end
+            # when line and previous block are on the same chromosome and
+            # they have a gap
+            elif line_start > previous_block_end:
+                if is_previous_block_called:
+                    if unreported_is_called:
+                        if not is_line_called:
+                            region_end = line_start
+                            print('\t'.join([region_CHROM,
+                                str(region_start), str(region_end)]))
+                            is_previous_block_called = False
+                    else:
+                        region_end = previous_block_end
+                        print('\t'.join([region_CHROM,
+                            str(region_start), str(region_end)]))
+                        if is_line_called:
+                            region_start = line_start
+                        else:
+                            is_previous_block_called = False
+                else:
+                    if unreported_is_called:
+                        region_start = previous_block_end
+                        if is_line_called:
+                            is_previous_block_called = True
+                        else:
+                            region_end = line_start
+                            print('\t'.join([region_CHROM,
+                                str(region_start), str(region_end)]))
+                    else:
+                        if is_line_called:
+                            region_start = line_start
+                            is_previous_block_called = True
+                # update previous_block_end
+                previous_block_end = line_end
+            # when line and previous block are on the same chromosome and
+            # they are back to back without gap
+            elif line_start == previous_block_end:
+                if is_previous_block_called:
+                    if not is_line_called:
+                        region_end = line_start
+                        print('\t'.join([region_CHROM,
+                            str(region_start), str(region_end)]))
+                        is_previous_block_called = False
+                else:
+                    if is_line_called:
+                        region_start = line_start
+                        is_previous_block_called = True
+                # update previous_block_end
+                previous_block_end = line_end
+            # when line and previous block overlap
+            else:
+                if (is_previous_block_called and is_line_called and 
+                    line_end > previous_block_end):
+                    previous_block_end = line_end
+    # at the end of file, write the previous region
+    else:
+        # we assume a chromosome ends with a string of 'N's
+        if is_previous_block_called:
+            region_end = previous_block_end
+            print('\t'.join([region_CHROM,
+                str(region_start), str(region_end)]))
+
+    if g != "-":
+        g.close()
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Output the called regions \
+        of a gvcf file to stdout in bed format.')
+    parser.add_argument('gvcf', metavar='GVCF', help='input gvcf file, \
+        accept gzipped and unzipped files, or "-" for stream')
+
+    parser.add_argument('--unreported_is_called', action='store_true',
+        help='use this flag to treat unreported sites as called')
+    parser.add_argument('--ignore_phrases', nargs='+',
+        help='list of phrases considered as discarded, e.g., CNV, ME. \
+        A line that contains any of the ignore phrases is discarded.')
+    parser.add_argument('--min_GQ', type=int,
+        help='minimum GQ (Genotype Quality) considered as called')
+    parser.add_argument('--min_QUAL', type=float,
+        help='minimum QUAL considered as called')
+    parser.add_argument('--pass_phrases', nargs='+',
+        help='list of phrases considered as called, e.g., PASS, REFCALL. \
+        A line must contain any of the pass phrases to be considered as called.')
+
+    # presets of unreported_is_called, ignore_phrases, min_GQ, min_QUAL, pass_phrases
+    complete_genomics_preset = [True, ['CNV', 'INS:ME'], None, None, ['PASS']]
+    freebayes_preset = [False, None, None, None, ['PASS']]
+    gatk_preset = [False, None, 5, None, None]
+
+    gvcf_type_help = '''type of gvcf output.
+            [unreported_is_called, ignore_phrases, min_GQ, min_QUAL, pass_phrases] presets:
+            complete_genomics: %s.
+            freebayes: %s.
+            gatk: %s.''' % (complete_genomics_preset,
+                freebayes_preset, gatk_preset)
+
+    parser.add_argument("--gvcf_type",
+        choices=['complete_genomics', 'freebayes', 'gatk'], help=gvcf_type_help)
+    args = parser.parse_args()
+
+    par = [args.unreported_is_called, args.ignore_phrases,
+                args.min_GQ, args.min_QUAL, args.pass_phrases]
+
+    if args.gvcf_type == 'complete_genomics':
+        par = complete_genomics_preset
+    elif args.gvcf_type == 'freebayes':
+        par = freebayes_preset
+    elif args.gvcf_type == 'gatk':
+        par = gatk_preset
+
+    all_par = [args.gvcf] + par
+    gvcf_regions(*all_par)
-- 
2.39.5