18870: Need to declare NODES as array
[arvados.git] / services / crunchstat / crunchstat.go
index 8c05069dbf91acc92f6d0fccb1827203a61d14b1..6383eae5452dd1d145420e7da41ce773878b5cef 100644 (file)
@@ -1,8 +1,13 @@
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
 package main
 
 import (
        "bufio"
        "flag"
+       "fmt"
        "io"
        "log"
        "os"
@@ -11,30 +16,53 @@ import (
        "syscall"
        "time"
 
-       "git.curoverse.com/arvados.git/lib/crunchstat"
+       "git.arvados.org/arvados.git/lib/cmd"
+       "git.arvados.org/arvados.git/lib/crunchstat"
 )
 
 const MaxLogLine = 1 << 14 // Child stderr lines >16KiB will be split
 
+var (
+       signalOnDeadPPID  int = 15 // default for -signal-on-dead-ppid: signal sent to the child if our parent process disappears (0 disables)
+       ppidCheckInterval     = time.Second // default for -ppid-check-interval: time between checks for parent process disappearance
+       version               = "dev" // printed by -version and in the startup log line
+)
+
 func main() {
        reporter := crunchstat.Reporter{
                Logger: log.New(os.Stderr, "crunchstat: ", 0),
        }
 
-       flag.StringVar(&reporter.CgroupRoot, "cgroup-root", "", "Root of cgroup tree")
-       flag.StringVar(&reporter.CgroupParent, "cgroup-parent", "", "Name of container parent under cgroup")
-       flag.StringVar(&reporter.CIDFile, "cgroup-cid", "", "Path to container id file")
-       pollMsec := flag.Int64("poll", 1000, "Reporting interval, in milliseconds")
+       flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError)
+       flags.StringVar(&reporter.CgroupRoot, "cgroup-root", "", "Root of cgroup tree")
+       flags.StringVar(&reporter.CgroupParent, "cgroup-parent", "", "Name of container parent under cgroup")
+       flags.StringVar(&reporter.CIDFile, "cgroup-cid", "", "Path to container id file")
+       flags.IntVar(&signalOnDeadPPID, "signal-on-dead-ppid", signalOnDeadPPID, "Signal to send child if crunchstat's parent process disappears (0 to disable)")
+       flags.DurationVar(&ppidCheckInterval, "ppid-check-interval", ppidCheckInterval, "Time between checks for parent process disappearance")
+       pollMsec := flags.Int64("poll", 1000, "Reporting interval, in milliseconds")
+       getVersion := flags.Bool("version", false, "Print version information and exit.")
+
+       if ok, code := cmd.ParseFlags(flags, os.Args[0], os.Args[1:], "program [args ...]", os.Stderr); !ok {
+               os.Exit(code)
+       } else if *getVersion {
+               fmt.Printf("crunchstat %s\n", version)
+               return
+       } else if flags.NArg() == 0 {
+               fmt.Fprintf(os.Stderr, "missing required argument: program (try -help)\n")
+               os.Exit(2)
+       }
 
-       flag.Parse()
+       reporter.Logger.Printf("crunchstat %s started", version)
 
        if reporter.CgroupRoot == "" {
                reporter.Logger.Fatal("error: must provide -cgroup-root")
+       } else if signalOnDeadPPID < 0 {
+               reporter.Logger.Fatalf("-signal-on-dead-ppid=%d is invalid (use a positive signal number, or 0 to disable)", signalOnDeadPPID)
        }
-       reporter.Poll = time.Duration(*pollMsec) * time.Millisecond
+       reporter.PollPeriod = time.Duration(*pollMsec) * time.Millisecond
 
        reporter.Start()
-       err := runCommand(flag.Args(), reporter.Logger)
+       err := runCommand(flags.Args(), reporter.Logger)
        reporter.Stop()
 
        if err, ok := err.(*exec.ExitError); ok {
@@ -77,8 +105,13 @@ func runCommand(argv []string, logger *log.Logger) error {
        signal.Notify(sigChan, syscall.SIGTERM)
        signal.Notify(sigChan, syscall.SIGINT)
 
+       // Kill our child proc if our parent process disappears
+       if signalOnDeadPPID != 0 {
+               go sendSignalOnDeadPPID(ppidCheckInterval, signalOnDeadPPID, os.Getppid(), cmd, logger)
+       }
+
        // Funnel stderr through our channel
-       stderr_pipe, err := cmd.StderrPipe()
+       stderrPipe, err := cmd.StderrPipe()
        if err != nil {
                logger.Fatalln("error in StderrPipe:", err)
        }
@@ -92,11 +125,33 @@ func runCommand(argv []string, logger *log.Logger) error {
        os.Stdin.Close()
        os.Stdout.Close()
 
-       copyPipeToChildLog(stderr_pipe, log.New(os.Stderr, "", 0))
+       copyPipeToChildLog(stderrPipe, log.New(os.Stderr, "", 0))
 
        return cmd.Wait()
 }
 
+func sendSignalOnDeadPPID(intvl time.Duration, signum, ppidOrig int, cmd *exec.Cmd, logger *log.Logger) {
+       ticker := time.NewTicker(intvl)
+       for range ticker.C {
+               ppid := os.Getppid()
+               if ppid == ppidOrig {
+                       continue
+               }
+               if cmd.Process == nil {
+                       // Child process isn't running yet
+                       continue
+               }
+               logger.Printf("notice: crunchstat ppid changed from %d to %d -- killing child pid %d with signal %d", ppidOrig, ppid, cmd.Process.Pid, signum)
+               err := cmd.Process.Signal(syscall.Signal(signum))
+               if err != nil {
+                       logger.Printf("error: sending signal: %s", err)
+                       continue
+               }
+               ticker.Stop()
+               break
+       }
+}
+
 func copyPipeToChildLog(in io.ReadCloser, logger *log.Logger) {
        reader := bufio.NewReaderSize(in, MaxLogLine)
        var prefix string