+
+ log.Print("makeMask: building mask")
+ var mask mask
+ for _, line := range bytes.Split(regions, []byte{'\n'}) {
+ if bytes.HasPrefix(line, []byte{'#'}) {
+ continue
+ }
+ fields := bytes.Split(line, []byte{'\t'})
+ if len(fields) < 3 {
+ continue
+ }
+ refseqname := string(fields[0])
+ if strings.HasPrefix(refseqname, "chr") {
+ refseqname = refseqname[3:]
+ }
+ start, err1 := strconv.Atoi(string(fields[1]))
+ end, err2 := strconv.Atoi(string(fields[2]))
+ if err1 == nil && err2 == nil {
+ // BED
+ } else {
+ start, err1 = strconv.Atoi(string(fields[3]))
+ end, err2 = strconv.Atoi(string(fields[4]))
+ if err1 == nil && err2 == nil {
+ // GFF/GTF
+ end++
+ } else {
+ return nil, fmt.Errorf("cannot parse input line as BED or GFF/GTF: %q", line)
+ }
+ }
+ mask.Add(refseqname, start-expandRegions, end+expandRegions)
+ }
+ log.Print("makeMask: mask.Freeze")
+ mask.Freeze()
+ return &mask, nil
+}
+
+func chooseTiles(tilelib *tileLibrary, regionsFilename string, expandRegions int) (drop []bool, err error) {
+ if regionsFilename == "" {
+ return
+ }
+ mask, err := makeMask(regionsFilename, expandRegions)
+ if err != nil {
+ return
+ }
+
+ tagset := tilelib.taglib.Tags()
+ if len(tagset) == 0 {
+ err = errors.New("cannot choose tiles by region in a library without tags")
+ return
+ }
+ taglen := len(tagset[0])
+
+ log.Print("chooseTiles: check ref tiles")
+ // Find position+size of each reference tile, and if it
+ // intersects any of the desired regions, set drop[tag]=false.
+ //
+ // (Note it doesn't quite work to do the more obvious thing --
+ // start with drop=false and change to true when ref tiles
+ // intersect target regions -- because that would give us
+ // drop=false for tiles that don't appear at all in the
+ // reference.)
+ //
+ // TODO: (optionally?) don't drop tags for which some tile
+ // variants are spanning tiles, i.e., where the reference tile
+ // does not intersect the desired regions, but a spanning tile
+ // from a genome does.
+ drop = make([]bool, len(tilelib.variant))
+ for i := range drop {
+ drop[i] = true
+ }
+ for refname, refseqs := range tilelib.refseqs {
+ for refseqname, reftiles := range refseqs {
+ if strings.HasPrefix(refseqname, "chr") {
+ refseqname = refseqname[3:]
+ }
+ tileend := 0
+ for _, libref := range reftiles {
+ if libref.Variant < 1 {
+ err = fmt.Errorf("reference %q seq %q uses variant zero at tag %d", refname, refseqname, libref.Tag)
+ return
+ }
+ seq := tilelib.TileVariantSequence(libref)
+ if len(seq) < taglen {
+ err = fmt.Errorf("reference %q seq %q uses tile %d variant %d with sequence len %d < taglen %d", refname, refseqname, libref.Tag, libref.Variant, len(seq), taglen)
+ return
+ }
+ tilestart := tileend
+ tileend = tilestart + len(seq) - taglen
+ if mask.Check(refseqname, tilestart, tileend) {
+ drop[libref.Tag] = false
+ }
+ }
+ }
+ }
+
+ log.Print("chooseTiles: done")
+ return
+}
+
+func recodeOnehot(in []int16, incols int) (out []int16, librefs []tileLibRef, outcols int) {
+ rows := len(in) / incols
+ maxvalue := make([]int16, incols)
+ for row := 0; row < rows; row++ {
+ for col := 0; col < incols; col++ {
+ if v := in[row*incols+col]; maxvalue[col] < v {
+ maxvalue[col] = v
+ }
+ }
+ }
+ outcol := make([]int, incols)
+ dropped := 0
+ for incol, maxv := range maxvalue {
+ outcol[incol] = outcols
+ if maxv == 0 {
+ dropped++
+ }
+ for v := 1; v <= int(maxv); v++ {
+ librefs = append(librefs, tileLibRef{Tag: tagID(incol), Variant: tileVariantID(v)})
+ outcols++
+ }
+ }
+ log.Printf("recodeOnehot: dropped %d input cols with zero maxvalue", dropped)
+
+ out = make([]int16, rows*outcols)
+ for inidx, row := 0, 0; row < rows; row++ {
+ outrow := out[row*outcols:]
+ for col := 0; col < incols; col++ {
+ if v := in[inidx]; v > 0 {
+ outrow[outcol[col]+int(v)-1] = 1
+ }
+ inidx++
+ }
+ }
+ return