X-Git-Url: https://git.arvados.org/lightning.git/blobdiff_plain/76fbc75a359348e2a91546a70b8d2c738865cce2..d98453405db2aa86de956d47226af77f3c28c55f:/vcf2fasta.go diff --git a/vcf2fasta.go b/vcf2fasta.go index 08eedf9446..e1a3a3944e 100644 --- a/vcf2fasta.go +++ b/vcf2fasta.go @@ -1,4 +1,8 @@ -package main +// Copyright (C) The Lightning Authors. All rights reserved. +// +// SPDX-License-Identifier: AGPL-3.0 + +package lightning import ( "bufio" @@ -22,7 +26,7 @@ import ( "sync" "syscall" - "git.arvados.org/arvados.git/sdk/go/arvados" + "github.com/klauspost/pgzip" log "github.com/sirupsen/logrus" ) @@ -102,18 +106,20 @@ func (cmd *vcf2fasta) RunCommand(prog string, args []string, stdin io.Reader, st if err != nil { return 1 } - if cmd.vcpus = len(cmd.batchArgs.Slice(infiles)) * 2; cmd.vcpus > 32 { + batchsize := (len(infiles) + cmd.batchArgs.batches - 1) / cmd.batchArgs.batches + if cmd.vcpus = batchsize * 2; cmd.vcpus > 32 { cmd.vcpus = 32 } } - client := arvados.NewClientFromEnv() runner := arvadosContainerRunner{ Name: "lightning vcf2fasta", - Client: client, + Client: arvadosClientFromEnv, ProjectUUID: cmd.projectUUID, RAM: 2<<30 + int64(cmd.vcpus)<<28, VCPUs: cmd.vcpus, Priority: *priority, + KeepCache: 2, + APIAccess: true, Mounts: map[string]map[string]interface{}{ "/gvcf_regions.py": map[string]interface{}{ "kind": "text", @@ -237,7 +243,7 @@ func (cmd *vcf2fasta) vcf2fasta(infile string, phase int) error { } defer outf.Close() bufw := bufio.NewWriterSize(outf, 8*1024*1024) - gzipw := gzip.NewWriter(bufw) + gzipw := pgzip.NewWriter(bufw) defer gzipw.Close() var maskfifo string // filename of mask fifo if we're running bedtools, otherwise "" @@ -247,12 +253,13 @@ func (cmd *vcf2fasta) vcf2fasta(infile string, phase int) error { if cmd.mask { chrSize := map[string]int{} - vcffile, err := os.Open(infile) + vcffile, err := open(infile) if err != nil { return err } defer vcffile.Close() var rdr io.Reader = vcffile + rdr = bufio.NewReaderSize(rdr, 8*1024*1024) if 
strings.HasSuffix(infile, ".gz") { rdr, err = gzip.NewReader(vcffile) if err != nil { @@ -279,8 +286,18 @@ func (cmd *vcf2fasta) vcf2fasta(infile string, phase int) error { if err = scanner.Err(); err != nil { return fmt.Errorf("error scanning input file %q: %s", infile, err) } + var regions bytes.Buffer - bedargs := []string{"python2", "-", "--gvcf_type", cmd.gvcfType, infile} + bedargs := []string{"python2", "-"} + if cmd.gvcfType == "complete_genomics_pass_all" { + bedargs = append(bedargs, + "--ignore_phrases", "CNV", "INS:ME", + "--unreported_is_called", + ) + } else if cmd.gvcfType != "" { + bedargs = append(bedargs, "--gvcf_type", cmd.gvcfType) + } + bedargs = append(bedargs, infile) bed := exec.CommandContext(ctx, bedargs[0], bedargs[1:]...) bed.Stdin = bytes.NewBuffer(cmd.gvcfRegionsPyData) bed.Stdout = &regions @@ -296,7 +313,7 @@ func (cmd *vcf2fasta) vcf2fasta(infile string, phase int) error { // Read chromosome sizes from genome file in // case any weren't specified in the VCF // header. - genomeFile, err := os.Open(cmd.genomeFile) + genomeFile, err := open(cmd.genomeFile) if err != nil { return fmt.Errorf("error opening genome file %q: %s", cmd.genomeFile, err) }