15 "git.arvados.org/arvados.git/sdk/go/arvados"
16 log "github.com/sirupsen/logrus"
19 type filterer struct {
23 func (cmd *filterer) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
27 fmt.Fprintf(stderr, "%s\n", err)
30 flags := flag.NewFlagSet("", flag.ContinueOnError)
31 flags.SetOutput(stderr)
32 pprof := flags.String("pprof", "", "serve Go profile data at http://`[addr]:port`")
33 runlocal := flags.Bool("local", false, "run on local host (default: run in an arvados container)")
34 projectUUID := flags.String("project", "", "project `UUID` for output data")
35 priority := flags.Int("priority", 500, "container request priority")
36 inputFilename := flags.String("i", "-", "input `file`")
37 outputFilename := flags.String("o", "-", "output `file`")
38 maxvariants := flags.Int("max-variants", -1, "drop tiles with more than `N` variants")
39 mincoverage := flags.Float64("min-coverage", 1, "drop tiles with coverage less than `P` across all haplotypes (0 < P ≤ 1)")
40 maxtag := flags.Int("max-tag", -1, "drop tiles with tag ID > `N`")
41 err = flags.Parse(args)
42 if err == flag.ErrHelp {
45 } else if err != nil {
52 log.Println(http.ListenAndServe(*pprof, nil))
57 if *outputFilename != "-" {
58 err = errors.New("cannot specify output file in container mode: not implemented")
61 runner := arvadosContainerRunner{
62 Name: "lightning filter",
63 Client: arvados.NewClientFromEnv(),
64 ProjectUUID: *projectUUID,
69 err = runner.TranslatePaths(inputFilename)
73 runner.Args = []string{"filter", "-local=true",
75 "-o", "/mnt/output/library.gob",
76 "-max-variants", fmt.Sprintf("%d", *maxvariants),
77 "-min-coverage", fmt.Sprintf("%f", *mincoverage),
78 "-max-tag", fmt.Sprintf("%d", *maxtag),
81 output, err = runner.Run()
85 fmt.Fprintln(stdout, output+"/library.gob")
89 var infile io.ReadCloser
90 if *inputFilename == "-" {
91 infile = ioutil.NopCloser(stdin)
93 infile, err = os.Open(*inputFilename)
100 cgs, err := ReadCompactGenomes(infile)
108 log.Printf("reading done, %d genomes", len(cgs))
110 log.Print("filtering")
112 for _, cg := range cgs {
113 if ntags < len(cg.Variants)/2 {
114 ntags = len(cg.Variants) / 2
116 if *maxvariants < 0 {
119 maxVariantID := tileVariantID(*maxvariants)
120 for idx, variant := range cg.Variants {
121 if variant > maxVariantID {
122 for _, cg := range cgs {
123 if len(cg.Variants) > idx {
124 cg.Variants[idx & ^1] = 0
125 cg.Variants[idx|1] = 0
132 if *maxtag >= 0 && ntags > *maxtag {
134 for i, cg := range cgs {
135 if len(cg.Variants) > *maxtag*2 {
136 cgs[i].Variants = cg.Variants[:*maxtag*2]
141 if *mincoverage < 1 {
142 mincov := int(*mincoverage * float64(len(cgs)*2))
143 cov := make([]int, ntags)
144 for _, cg := range cgs {
145 for idx, variant := range cg.Variants {
151 for tag, c := range cov {
153 for _, cg := range cgs {
154 if len(cg.Variants) > tag*2 {
155 cg.Variants[tag*2] = 0
156 cg.Variants[tag*2+1] = 0
163 log.Print("filtering done")
165 var outfile io.WriteCloser
166 if *outputFilename == "-" {
167 outfile = nopCloser{cmd.output}
169 outfile, err = os.OpenFile(*outputFilename, os.O_CREATE|os.O_WRONLY, 0777)
173 defer outfile.Close()
175 w := bufio.NewWriter(outfile)
176 enc := gob.NewEncoder(w)
178 err = enc.Encode(LibraryEntry{
184 log.Print("writing done")
189 err = outfile.Close()