10585: Add crunchstat -signal-on-dead-ppid option.
author Tom Clegg <tom@curoverse.com>
Fri, 16 Dec 2016 21:02:24 +0000 (16:02 -0500)
committer Tom Clegg <tom@curoverse.com>
Fri, 16 Dec 2016 21:02:24 +0000 (16:02 -0500)
services/crunchstat/crunchstat.go
services/crunchstat/crunchstat_test.go

index cae95fdd9d6cfd30110764e4ea7c87188c0ed6aa..510df3bac676afbc56589e5343b0a20e002baea7 100644 (file)
@@ -16,6 +16,9 @@ import (
 
 const MaxLogLine = 1 << 14 // Child stderr lines >16KiB will be split
 
+// signalOnDeadPPID is the signal number to send to the child process
+// when crunchstat's parent process disappears. It is set by the
+// -signal-on-dead-ppid flag; a value of 0 disables the check.
+var signalOnDeadPPID int
+// ppidCheckInterval is the time between checks for parent process
+// disappearance. It is set by the -ppid-check-interval flag.
+var ppidCheckInterval = time.Second
+
 func main() {
        reporter := crunchstat.Reporter{
                Logger: log.New(os.Stderr, "crunchstat: ", 0),
@@ -24,6 +27,8 @@ func main() {
        flag.StringVar(&reporter.CgroupRoot, "cgroup-root", "", "Root of cgroup tree")
        flag.StringVar(&reporter.CgroupParent, "cgroup-parent", "", "Name of container parent under cgroup")
        flag.StringVar(&reporter.CIDFile, "cgroup-cid", "", "Path to container id file")
+       flag.IntVar(&signalOnDeadPPID, "signal-on-dead-ppid", 15, "Signal to send child if crunchstat's parent process disappears")
+       flag.DurationVar(&ppidCheckInterval, "ppid-check-interval", ppidCheckInterval, "Time between checks for parent process disappearance")
        pollMsec := flag.Int64("poll", 1000, "Reporting interval, in milliseconds")
 
        flag.Parse()
@@ -77,6 +82,11 @@ func runCommand(argv []string, logger *log.Logger) error {
        signal.Notify(sigChan, syscall.SIGTERM)
        signal.Notify(sigChan, syscall.SIGINT)
 
+       // Kill our child proc if our parent process disappears
+       if signalOnDeadPPID != 0 {
+               go sendSignalOnDeadPPID(signalOnDeadPPID, os.Getppid(), cmd, logger)
+       }
+
        // Funnel stderr through our channel
        stderr_pipe, err := cmd.StderrPipe()
        if err != nil {
@@ -97,6 +107,26 @@ func runCommand(argv []string, logger *log.Logger) error {
        return cmd.Wait()
 }
 
+func sendSignalOnDeadPPID(signum, ppidOrig int, cmd *exec.Cmd, logger *log.Logger) {
+       for _ = range time.NewTicker(ppidCheckInterval).C {
+               ppid := os.Getppid()
+               if ppid == ppidOrig {
+                       continue
+               }
+               if cmd.Process == nil {
+                       // Child process isn't running yet
+                       continue
+               }
+               logger.Printf("notice: crunchstat ppid changed from %d to %d -- killing child pid %d with signal %d", ppidOrig, ppid, cmd.Process.Pid, signum)
+               err := cmd.Process.Signal(syscall.Signal(signum))
+               if err != nil {
+                       logger.Printf("error: sending signal: %d", err)
+                       continue
+               }
+               break
+       }
+}
+
 func copyPipeToChildLog(in io.ReadCloser, logger *log.Logger) {
        reader := bufio.NewReaderSize(in, MaxLogLine)
        var prefix string
index fe3b56d25876fd832d3596abe3db8e40852ebbf7..759b3aa073c11df927923b703141587b8dbbabd9 100644 (file)
@@ -3,9 +3,15 @@ package main
 import (
        "bufio"
        "bytes"
+       "fmt"
        "io"
+       "io/ioutil"
        "log"
        "math/rand"
+       "os"
+       "os/exec"
+       "sync"
+       "syscall"
        "testing"
        "time"
 )
@@ -82,3 +88,147 @@ func bufLogger() (*log.Logger, *bufio.Reader) {
        logger := log.New(w, "", 0)
        return logger, bufio.NewReader(r)
 }
+
+func TestSignalOnDeadPPID(t *testing.T) {
+       if !testDeadParent(t, 0) {
+               t.Fatal("child should still be alive after parent dies")
+       }
+       if testDeadParent(t, 15) {
+               t.Fatal("child should have been killed when parent died")
+       }
+}
+
+// testDeadParent returns true if crunchstat's child proc is still
+// alive after its parent dies.
+//
+// It builds the crunchstat binary into a temp file, starts it from
+// an outer bash process with -signal-on-dead-ppid=signum, and lets
+// the outer bash exit once the inner bash (crunchstat's child) has
+// started. Two lock files coordinate the processes: parentlockfile
+// is released by the inner bash once it has started, and
+// childlockfile is held by the inner bash until it receives TERM or
+// its sleep ends. Any setup failure aborts the test via t.Fatal.
+func testDeadParent(t *testing.T, signum int) bool {
+       var err error
+       var bin, childlockfile, parentlockfile *os.File
+       for _, f := range []**os.File{&bin, &childlockfile, &parentlockfile} {
+               *f, err = ioutil.TempFile("", "crunchstat_")
+               if err != nil {
+                       t.Fatal(err)
+               }
+               defer (*f).Close()
+               defer os.Remove((*f).Name())
+       }
+
+       // Close the temp file so "go build" can overwrite it.
+       bin.Close()
+       err = exec.Command("go", "build", "-o", bin.Name()).Run()
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       // Hold parentlockfile until the inner bash releases it via
+       // the inherited file descriptor.
+       err = syscall.Flock(int(parentlockfile.Fd()), syscall.LOCK_EX)
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       cmd := exec.Command("bash", "-c", `
+set -e
+"$BINFILE" -cgroup-root=/none -ppid-check-interval=10ms -signal-on-dead-ppid="$SIGNUM" bash -c '
+    set -e
+    unlock() {
+        flock --unlock "$CHILDLOCKFD"
+        kill %1
+    }
+    trap unlock TERM
+    flock --exclusive "$CHILDLOCKFD"
+    echo -n "$$" > "$CHILDLOCKFILE"
+    flock --unlock "$PARENTLOCKFD"
+    sleep 20 </dev/null >/dev/null 2>/dev/null &
+    wait %1
+    unlock
+' &
+
+# wait for inner bash to start, to ensure $BINFILE has seen this bash proc as its initial PPID
+flock --exclusive "$PARENTLOCKFILE" true
+`)
+       cmd.Env = append(os.Environ(),
+               "SIGNUM="+fmt.Sprintf("%d", signum),
+               "PARENTLOCKFD=3",
+               "PARENTLOCKFILE="+parentlockfile.Name(),
+               "CHILDLOCKFD=4",
+               "CHILDLOCKFILE="+childlockfile.Name(),
+               "BINFILE="+bin.Name())
+       // ExtraFiles entries become fds 3 and 4 in the child, matching
+       // PARENTLOCKFD and CHILDLOCKFD above.
+       cmd.ExtraFiles = []*os.File{parentlockfile, childlockfile}
+       stderr, err := cmd.StderrPipe()
+       if err != nil {
+               t.Fatal(err)
+       }
+       stdout, err := cmd.StdoutPipe()
+       if err != nil {
+               t.Fatal(err)
+       }
+       err = cmd.Start()
+       if err != nil {
+               t.Fatal(err)
+       }
+       defer cmd.Wait()
+
+       // Copy the child's stdout and stderr to the test log. The
+       // deferred wg.Wait runs before the deferred cmd.Wait, so all
+       // pipe reads finish before Wait closes the pipes.
+       var wg sync.WaitGroup
+       wg.Add(2)
+       defer wg.Wait()
+       for _, rdr := range []io.ReadCloser{stderr, stdout} {
+               go func(rdr io.ReadCloser) {
+                       defer wg.Done()
+                       buf := make([]byte, 1024)
+                       for {
+                               n, err := rdr.Read(buf)
+                               if n > 0 {
+                                       t.Logf("%s", buf[:n])
+                               }
+                               if err != nil {
+                                       return
+                               }
+                       }
+               }(rdr)
+       }
+
+       // Wait until inner bash process releases parentlockfile
+       // (which means it has locked childlockfile and written its
+       // PID)
+       err = exec.Command("flock", "--exclusive", parentlockfile.Name(), "true").Run()
+       if err != nil {
+               t.Fatal(err)
+       }
+
+       // childDone receives the result of waiting for childlockfile.
+       // It is buffered so the goroutine can finish even if the main
+       // goroutine has already taken the timeout branch.
+       childDone := make(chan error, 1)
+       go func() {
+               // Notify the main thread when the inner bash process
+               // releases its lock on childlockfile (which means
+               // either its sleep process ended or it received a
+               // TERM signal).
+               //
+               // Errors are reported through the channel instead of
+               // calling t.Fatal here, because t.Fatal must only be
+               // called from the goroutine running the test.
+               t0 := time.Now()
+               err := exec.Command("flock", "--exclusive", childlockfile.Name(), "true").Run()
+               if err == nil {
+                       t.Logf("child done after %s", time.Since(t0))
+               }
+               childDone <- err
+       }()
+
+       select {
+       case <-time.After(500 * time.Millisecond):
+               // Inner bash process is still alive after the timeout
+               // period. Kill it now, so our stdout and stderr pipes
+               // can finish and we don't leave a mess of child procs
+               // behind.
+               buf, err := ioutil.ReadFile(childlockfile.Name())
+               if err != nil {
+                       t.Fatal(err)
+               }
+               var childPID int
+               _, err = fmt.Sscanf(string(buf), "%d", &childPID)
+               if err != nil {
+                       t.Fatal(err)
+               }
+               child, err := os.FindProcess(childPID)
+               if err != nil {
+                       t.Fatal(err)
+               }
+               err = child.Signal(syscall.Signal(15))
+               if err != nil {
+                       // Best-effort cleanup: report but don't fail.
+                       t.Logf("error sending signal to child pid %d: %s", childPID, err)
+               }
+               return true
+
+       case err := <-childDone:
+               // Inner bash process ended soon after its grandparent
+               // ended.
+               if err != nil {
+                       t.Fatal(err)
+               }
+               return false
+       }
+}