From: Tom Clegg Date: Fri, 25 Sep 2020 19:56:42 +0000 (-0400) Subject: Option to output tile library when importing. X-Git-Url: https://git.arvados.org/lightning.git/commitdiff_plain/8a9c807d4b80f61a8485c0fc07abfe23f9d50202 Option to output tile library when importing. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- diff --git a/gob.go b/gob.go index d7ac24e48f..9426af9871 100644 --- a/gob.go +++ b/gob.go @@ -13,14 +13,16 @@ type CompactGenome struct { Variants []tileVariantID } +type TileVariant struct { + Tag tagID + Blake2b [blake2b.Size256]byte + Sequence []byte +} + type LibraryEntry struct { TagSet [][]byte CompactGenomes []CompactGenome - TileVariants []struct { - Tag tagID - Blake2b [blake2b.Size]byte - Sequence []byte - } + TileVariants []TileVariant } func ReadCompactGenomes(rdr io.Reader) ([]CompactGenome, error) { diff --git a/import.go b/import.go index a396636fcb..3ce00ce19c 100644 --- a/import.go +++ b/import.go @@ -32,6 +32,7 @@ type importer struct { projectUUID string runLocal bool skipOOO bool + outputTiles bool encoder *gob.Encoder } @@ -50,6 +51,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std flags.StringVar(&cmd.projectUUID, "project", "", "project `UUID` for output data") flags.BoolVar(&cmd.runLocal, "local", false, "run on local host (default: run in an arvados container)") flags.BoolVar(&cmd.skipOOO, "skip-ooo", false, "skip out-of-order tags") + flags.BoolVar(&cmd.outputTiles, "output-tiles", false, "include tile variant sequences in output file") priority := flags.Int("priority", 500, "container request priority") pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`") loglevel := flags.String("loglevel", "info", "logging threshold (trace, debug, info, warn, error, fatal, or panic)") @@ -108,7 +110,7 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std err = errors.New("cannot specify output file in container mode: not implemented") return 1 } - runner.Args = append([]string{"import", "-local=true", "-loglevel=" + *loglevel, fmt.Sprintf("-skip-ooo=%v", cmd.skipOOO), "-tag-library", cmd.tagLibraryFile, "-ref", cmd.refFile, "-o", cmd.outputFile}, inputs...) + runner.Args = append([]string{"import", "-local=true", "-loglevel=" + *loglevel, fmt.Sprintf("-skip-ooo=%v", cmd.skipOOO), "-tag-library", cmd.tagLibraryFile, "-ref", cmd.refFile, fmt.Sprintf("-output-tiles=%v", cmd.outputTiles), "-o", cmd.outputFile}, inputs...) var output string output, err = runner.Run() if err != nil { @@ -123,15 +125,10 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std return 1 } - tilelib, err := cmd.loadTileLibrary() + taglib, err := cmd.loadTagLibrary() if err != nil { return 1 } - go func() { - for range time.Tick(10 * time.Minute) { - log.Printf("tilelib.Len() == %d", tilelib.Len()) - } - }() var output io.WriteCloser if cmd.outputFile == "-" { @@ -146,6 +143,16 @@ func (cmd *importer) RunCommand(prog string, args []string, stdin io.Reader, std bufw := bufio.NewWriter(output) cmd.encoder = gob.NewEncoder(bufw) + tilelib := &tileLibrary{taglib: taglib, skipOOO: cmd.skipOOO} + if cmd.outputTiles { + tilelib.encoder = cmd.encoder + } + go func() { + for range time.Tick(10 * time.Minute) { + log.Printf("tilelib.Len() == %d", tilelib.Len()) + } + }() + err = cmd.tileInputs(tilelib, infiles) if err != nil { return 1 @@ -178,7 +185,7 @@ func (cmd *importer) tileFasta(tilelib *tileLibrary, infile string) (tileSeq, er return tilelib.TileFasta(infile, input) } -func (cmd *importer) loadTileLibrary() (*tileLibrary, error) { +func (cmd *importer) loadTagLibrary() (*tagLibrary, error) { log.Printf("tag library %s load starting", cmd.tagLibraryFile) f, err := os.Open(cmd.tagLibraryFile) if err != nil { @@ -202,7 +209,7 @@ func (cmd *importer) loadTileLibrary() (*tileLibrary, error) { return nil, fmt.Errorf("cannot tile: tag library is empty") } log.Printf("tag library %s load done", cmd.tagLibraryFile) - return &tileLibrary{taglib: &taglib, skipOOO: cmd.skipOOO}, nil + return &taglib, nil } func listInputFiles(paths []string) (files []string, err error) { diff --git a/tilelib.go b/tilelib.go index 49f5169665..ca3857566f 100644 --- a/tilelib.go +++ b/tilelib.go @@ -3,6 +3,7 @@ package main import ( "bufio" "bytes" + "encoding/gob" "io" "strings" "sync" @@ -51,6 +52,8 @@ type tileLibrary struct { // count [][]int // seq map[[blake2b.Size]byte][]byte variants int + // if non-nil, write out any tile variants added while tiling + encoder *gob.Encoder mtx sync.Mutex } @@ -153,7 +156,6 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef { } } tilelib.mtx.Lock() - defer tilelib.mtx.Unlock() // if tilelib.seq == nil { // tilelib.seq = map[[blake2b.Size]byte][]byte{} // } @@ -163,11 +165,24 @@ func (tilelib *tileLibrary) getRef(tag tagID, seq []byte) tileLibRef { seqhash := blake2b.Sum256(seq) for i, varhash := range tilelib.variant[tag] { if varhash == seqhash { + tilelib.mtx.Unlock() return tileLibRef{tag: tag, variant: tileVariantID(i + 1)} } } tilelib.variants++ tilelib.variant[tag] = append(tilelib.variant[tag], seqhash) // tilelib.seq[seqhash] = append([]byte(nil), seq...) - return tileLibRef{tag: tag, variant: tileVariantID(len(tilelib.variant[tag]))} + ret := tileLibRef{tag: tag, variant: tileVariantID(len(tilelib.variant[tag]))} + tilelib.mtx.Unlock() + + if tilelib.encoder != nil { + tilelib.encoder.Encode(LibraryEntry{ + TileVariants: []TileVariant{{ + Tag: tag, + Blake2b: seqhash, + Sequence: seq, + }}, + }) + } + return ret }