Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>
bufw := bufio.NewWriter(output)
tilelib := &tileLibrary{
bufw := bufio.NewWriter(output)
tilelib := &tileLibrary{
retainTileSequences: true,
}
err = tilelib.LoadGob(context.Background(), input, nil)
retainTileSequences: true,
}
err = tilelib.LoadGob(context.Background(), input, nil)
for variant := 1; variant <= len(tvs); variant++ {
variant, hash := tileVariantID(variant), tvs[variant-1]
tileseq := tilelib.TileVariantSequence(tileLibRef{Tag: tag, Variant: variant})
for variant := 1; variant <= len(tvs); variant++ {
variant, hash := tileVariantID(variant), tvs[variant-1]
tileseq := tilelib.TileVariantSequence(tileLibRef{Tag: tag, Variant: variant})
- if len(tileseq) < taglen {
+ if len(tileseq) == 0 {
+ continue
+ } else if len(tileseq) < taglen {
return fmt.Errorf("tilevar %d,%d has sequence len %d < taglen %d", tag, variant, len(tileseq), taglen)
}
var refpart []byte
return fmt.Errorf("tilevar %d,%d has sequence len %d < taglen %d", tag, variant, len(tileseq), taglen)
}
var refpart []byte
fasta=$(lightning vcf2fasta -project ${project} -priority ${priority} -ref ${ref_fa} -genome ${genome} -mask=true ${gvcf}) ; echo fasta=${fasta}
# fasta=su92l-4zz18-9nq05jifgz7iult
fasta=$(lightning vcf2fasta -project ${project} -priority ${priority} -ref ${ref_fa} -genome ${genome} -mask=true ${gvcf}) ; echo fasta=${fasta}
# fasta=su92l-4zz18-9nq05jifgz7iult
-ref37_lib=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -include-no-calls ${ref37_fa}) ; echo ref37_lib=${ref37_lib}
+ref37_lib=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -save-incomplete-tiles ${ref37_fa}) ; echo ref37_lib=${ref37_lib}
# ref37_lib=su92l-4zz18-vnhlv3g6yp1azls/library.gob
# 539s
# ref37_lib=su92l-4zz18-vnhlv3g6yp1azls/library.gob
# 539s
-ref38_lib=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -include-no-calls ${ref_fa}) ; echo ref38_lib=${ref38_lib}
+ref38_lib=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true -save-incomplete-tiles ${ref_fa}) ; echo ref38_lib=${ref38_lib}
# ref38_lib=su92l-4zz18-swebknshfwsvys6/library.gob
unfiltered=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true ${fasta}) ; echo unfiltered=${unfiltered}
# ref38_lib=su92l-4zz18-swebknshfwsvys6/library.gob
unfiltered=$(lightning import -project ${project} -priority ${priority} -tag-library ${tagset} -skip-ooo=true -output-tiles=true ${fasta}) ; echo unfiltered=${unfiltered}
# exportvcf=su92l-4zz18-gz4svr6zyvipueu/export.csv
# 5506s
# exportvcf=su92l-4zz18-gz4svr6zyvipueu/export.csv
# 5506s
+exporthgvs=$(lightning export -project ${project} -priority ${priority} -i ${merged38} -output-format hgvs -ref /mnt/su92l-4zz18-u77iyyy7cb05xqv/hg38.fa.gz -output-bed hg38.bed) ; echo exporthgvs=${exporthgvs}
+#
+#
+
stats=$(lightning stats -project ${project} -priority ${priority} -i ${merged}) ; echo stats=${stats}
filtered=$(lightning filter -project ${project} -priority ${priority} -i ${merged} -min-coverage "0.9" -max-variants "30") ; echo filtered=${filtered}
stats=$(lightning stats -project ${project} -priority ${priority} -i ${merged}) ; echo stats=${stats}
filtered=$(lightning filter -project ${project} -priority ${priority} -i ${merged} -min-coverage "0.9" -max-variants "30") ; echo filtered=${filtered}
# 6155s
# pcapy=$(lightning pca -project ${project} -priority ${priority} -i ${numpy}) ; echo pcapy=${pcapy}
comvar=$(lightning numpy-comvar -project ${project} -priority ${priority} -i ${numpy} -annotations ${numpy%/matrix.npy}/annotations.tsv) ; echo comvar=${comvar}
# 6155s
# pcapy=$(lightning pca -project ${project} -priority ${priority} -i ${numpy}) ; echo pcapy=${pcapy}
comvar=$(lightning numpy-comvar -project ${project} -priority ${priority} -i ${numpy} -annotations ${numpy%/matrix.npy}/annotations.tsv) ; echo comvar=${comvar}
+# comvar=su92l-4zz18-s1yhngobdvcoc2e/commonvariants.csv
var mtx sync.Mutex
var cgs []CompactGenome
tilelib := tileLibrary{
var mtx sync.Mutex
var cgs []CompactGenome
tilelib := tileLibrary{
}
err = tilelib.LoadGob(context.Background(), input, func(cg CompactGenome) {
if *pick != "" && *pick != cg.Name {
}
err = tilelib.LoadGob(context.Background(), input, func(cg CompactGenome) {
if *pick != "" && *pick != cg.Name {
continue
}
genometile := tileVariant[tileLibRef{Tag: libref.Tag, Variant: variant}]
continue
}
genometile := tileVariant[tileLibRef{Tag: libref.Tag, Variant: variant}]
+ if len(genometile.Sequence) == 0 {
+ // Hash is known but sequence
+ // is not, e.g., retainNoCalls
+ // was false during import
+ continue
+ }
refSequence := reftile.Sequence
// If needed, extend the reference
// sequence up to the tag at the end
refSequence := reftile.Sequence
// If needed, extend the reference
// sequence up to the tag at the end
defer input.Close()
}
tilelib := &tileLibrary{
defer input.Close()
}
tilelib := &tileLibrary{
retainTileSequences: true,
compactGenomes: map[string][]tileVariantID{},
}
retainTileSequences: true,
compactGenomes: map[string][]tileVariantID{},
}
- tagLibraryFile string
- refFile string
- outputFile string
- projectUUID string
- runLocal bool
- skipOOO bool
- outputTiles bool
- includeNoCalls bool
- outputStats string
- encoder *gob.Encoder
+ tagLibraryFile string
+ refFile string
+ outputFile string
+ projectUUID string
+ runLocal bool
+ skipOOO bool
+ outputTiles bool
+ saveIncompleteTiles bool
+ outputStats string
+ encoder *gob.Encoder
}
func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
}
func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
flags.BoolVar(&cmd.runLocal, "local", false, "run on local host (default: run in an arvados container)")
flags.BoolVar(&cmd.skipOOO, "skip-ooo", false, "skip out-of-order tags")
flags.BoolVar(&cmd.outputTiles, "output-tiles", false, "include tile variant sequences in output file")
flags.BoolVar(&cmd.runLocal, "local", false, "run on local host (default: run in an arvados container)")
flags.BoolVar(&cmd.skipOOO, "skip-ooo", false, "skip out-of-order tags")
flags.BoolVar(&cmd.outputTiles, "output-tiles", false, "include tile variant sequences in output file")
- flags.BoolVar(&cmd.includeNoCalls, "include-no-calls", false, "treat tiles with no-calls as regular tiles")
+ flags.BoolVar(&cmd.saveIncompleteTiles, "save-incomplete-tiles", false, "treat tiles with no-calls as regular tiles")
flags.StringVar(&cmd.outputStats, "output-stats", "", "output stats to `file` (json)")
priority := flags.Int("priority", 500, "container request priority")
pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
flags.StringVar(&cmd.outputStats, "output-stats", "", "output stats to `file` (json)")
priority := flags.Int("priority", 500, "container request priority")
pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
Name: "lightning import",
Client: arvados.NewClientFromEnv(),
ProjectUUID: cmd.projectUUID,
Name: "lightning import",
Client: arvados.NewClientFromEnv(),
ProjectUUID: cmd.projectUUID,
- RAM: 60000000000,
- VCPUs: 16,
+ RAM: 80000000000,
+ VCPUs: 32,
Priority: *priority,
}
err = runner.TranslatePaths(&cmd.tagLibraryFile, &cmd.refFile, &cmd.outputFile)
Priority: *priority,
}
err = runner.TranslatePaths(&cmd.tagLibraryFile, &cmd.refFile, &cmd.outputFile)
"-loglevel=" + *loglevel,
fmt.Sprintf("-skip-ooo=%v", cmd.skipOOO),
fmt.Sprintf("-output-tiles=%v", cmd.outputTiles),
"-loglevel=" + *loglevel,
fmt.Sprintf("-skip-ooo=%v", cmd.skipOOO),
fmt.Sprintf("-output-tiles=%v", cmd.outputTiles),
- fmt.Sprintf("-include-no-calls=%v", cmd.includeNoCalls),
+ fmt.Sprintf("-save-incomplete-tiles=%v", cmd.saveIncompleteTiles),
"-output-stats", "/mnt/output/stats.json",
"-tag-library", cmd.tagLibraryFile,
"-ref", cmd.refFile,
"-output-stats", "/mnt/output/stats.json",
"-tag-library", cmd.tagLibraryFile,
"-ref", cmd.refFile,
bufw := bufio.NewWriter(output)
cmd.encoder = gob.NewEncoder(bufw)
bufw := bufio.NewWriter(output)
cmd.encoder = gob.NewEncoder(bufw)
- tilelib := &tileLibrary{taglib: taglib, includeNoCalls: cmd.includeNoCalls, skipOOO: cmd.skipOOO}
+ tilelib := &tileLibrary{taglib: taglib, retainNoCalls: cmd.saveIncompleteTiles, skipOOO: cmd.skipOOO}
if cmd.outputTiles {
cmd.encoder.Encode(LibraryEntry{TagSet: taglib.Tags()})
tilelib.encoder = cmd.encoder
if cmd.outputTiles {
cmd.encoder.Encode(LibraryEntry{TagSet: taglib.Tags()})
tilelib.encoder = cmd.encoder
cmd.errs = make(chan error, 1)
cmd.tilelib = &tileLibrary{
cmd.errs = make(chan error, 1)
cmd.tilelib = &tileLibrary{
- encoder: encoder,
- includeNoCalls: true,
+ encoder: encoder,
+ retainNoCalls: true,
}
cmd.mapped = map[string]map[tileLibRef]tileVariantID{}
}
cmd.mapped = map[string]map[tileLibRef]tileVariantID{}
}
log.Print("reading")
tilelib := tileLibrary{
}
log.Print("reading")
tilelib := tileLibrary{
compactGenomes: map[string][]tileVariantID{},
}
err = tilelib.LoadGob(context.Background(), input, nil)
compactGenomes: map[string][]tileVariantID{},
}
err = tilelib.LoadGob(context.Background(), input, nil)
args := []string{"-local=true", "-o=" + libfile[i], "-skip-ooo=true", "-output-tiles", "-tag-library", "testdata/tags"}
if i == 0 {
// ref only
args := []string{"-local=true", "-o=" + libfile[i], "-skip-ooo=true", "-output-tiles", "-tag-library", "testdata/tags"}
if i == 0 {
// ref only
- args = append(args, "-include-no-calls")
+ args = append(args, "-save-incomplete-tiles")
}
args = append(args, infile)
code := (&importer{}).RunCommand("lightning import", args, bytes.NewReader(nil), &bytes.Buffer{}, os.Stderr)
}
args = append(args, infile)
code := (&importer{}).RunCommand("lightning import", args, bytes.NewReader(nil), &bytes.Buffer{}, os.Stderr)
bedout, err := ioutil.ReadFile(tmpdir + "/export.bed")
c.Check(err, check.IsNil)
c.Logf("%s", string(bedout))
bedout, err := ioutil.ReadFile(tmpdir + "/export.bed")
c.Check(err, check.IsNil)
c.Logf("%s", string(bedout))
- c.Check(string(bedout), check.Equals, `chr1 0 248 0 500 . 0 224
+ c.Check(string(bedout), check.Equals, `chr1 0 248 0 1000 . 0 224
chr1 224 372 1 1000 . 248 348
chr1 348 496 2 1000 . 372 472
chr1 472 572 3 1000 . 496 572
chr1 224 372 1 1000 . 248 348
chr1 348 496 2 1000 . 372 472
chr1 472 572 3 1000 . 496 572
}
type tileLibrary struct {
}
type tileLibrary struct {
skipOOO bool
retainTileSequences bool
skipOOO bool
retainTileSequences bool
basesOut += countBases(job.fasta[last.pos+last.taglen : f.pos+f.taglen])
} else {
// If we dropped this tile
basesOut += countBases(job.fasta[last.pos+last.taglen : f.pos+f.taglen])
} else {
// If we dropped this tile
- // (because !includeNoCalls),
+ // (because !retainNoCalls),
// set taglen=0 so the
// overlapping tag is counted
// toward coverage on the
// set taglen=0 so the
// overlapping tag is counted
// toward coverage on the
// Return a tileLibRef for a tile with the given tag and sequence,
// adding the sequence to the library if needed.
func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
// Return a tileLibRef for a tile with the given tag and sequence,
// adding the sequence to the library if needed.
func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef {
- if !tilelib.includeNoCalls {
+ dropSeq := false
+ if !tilelib.retainNoCalls {
for _, b := range seq {
if b != 'a' && b != 'c' && b != 'g' && b != 't' {
for _, b := range seq {
if b != 'a' && b != 'c' && b != 'g' && b != 't' {
- // return "tile not found" if seq has any
- // no-calls
- return tileLibRef{Tag: tag}
}
tilelib.variants++
tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
}
tilelib.variants++
tilelib.variant[tag] = append(tilelib.variant[tag], seqhash)
- if tilelib.retainTileSequences {
+ if tilelib.retainTileSequences && !dropSeq {
if tilelib.seq == nil {
tilelib.seq = map[[blake2b.Size256]byte][]byte{}
}
if tilelib.seq == nil {
tilelib.seq = map[[blake2b.Size256]byte][]byte{}
}
tilelib.mtx.Unlock()
if tilelib.encoder != nil {
tilelib.mtx.Unlock()
if tilelib.encoder != nil {
+ saveSeq := seq
+ if dropSeq {
+ // Save the hash, but not the sequence
+ saveSeq = nil
+ }
tilelib.encoder.Encode(LibraryEntry{
TileVariants: []TileVariant{{
Tag: tag,
Variant: variant,
Blake2b: seqhash,
tilelib.encoder.Encode(LibraryEntry{
TileVariants: []TileVariant{{
Tag: tag,
Variant: variant,
Blake2b: seqhash,