-// Read training set file(s) from path (may be dir or file) and set up
-// cmd.trainingSet.
-//
-// cmd.trainingSet[i] == n >= 0 if cmd.cgnames[i] is the nth training
-// set sample.
-//
-// cmd.trainingSet[i] == -1 if cmd.cgnames[i] is not in the training
-// set.
-func (cmd *sliceNumpy) loadTrainingSet(path string) error {
- cmd.trainingSet = make([]int, len(cmd.cgnames))
- if path == "" {
- cmd.trainingSetSize = len(cmd.cgnames)
- for i := range cmd.trainingSet {
- cmd.trainingSet[i] = i
- }
- return nil
- }
- for i := range cmd.trainingSet {
- cmd.trainingSet[i] = -1
- }
- infiles, err := allFiles(path, nil)
- if err != nil {
- return err
- }
- for _, infile := range infiles {
- f, err := open(infile)
- if err != nil {
- return err
- }
- buf, err := io.ReadAll(f)
- f.Close()
- if err != nil {
- return err
- }
- for _, tsv := range bytes.Split(buf, []byte{'\n'}) {
- if len(tsv) == 0 {
- continue
- }
- split := strings.Split(string(tsv), "\t")
- pattern := split[0]
- found := -1
- for i, name := range cmd.cgnames {
- if strings.Contains(name, pattern) {
- if found >= 0 {
- log.Warnf("pattern %q in %s already matched sample ID %q -- not using %q", pattern, infile, cmd.cgnames[found], name)
- } else {
- found = i
- cmd.trainingSet[found] = 1
- }
- }
- }
- if found < 0 {
- log.Warnf("pattern %q in %s does not match any genome IDs", pattern, infile)
- continue
- }
- }
- }
- tsi := 0
- for i, x := range cmd.trainingSet {
- if x == 1 {
- cmd.trainingSet[i] = tsi
- tsi++
- }
- }
- cmd.trainingSetSize = tsi + 1
- return nil
-}
-