16347: Use health check endpoint to check keepstore readiness.
author Tom Clegg <tom@curii.com>
Thu, 7 Oct 2021 20:21:48 +0000 (16:21 -0400)
committer Tom Clegg <tom@curii.com>
Fri, 8 Oct 2021 15:21:03 +0000 (11:21 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

lib/crunchrun/crunchrun.go

index 7a2afeacca3272b07df5a1bad4ea1cad8b9b8755..d70dd1c42855e65d779a48fa66e6ef4711050d59 100644 (file)
@@ -1906,26 +1906,34 @@ func startLocalKeepstore(configData ConfigData, logbuf io.Writer) (*exec.Cmd, er
        if err != nil {
                return nil, fmt.Errorf("error starting keepstore process: %w", err)
        }
+       cmdExited := false
+       go func() {
+               cmd.Wait()
+               cmdExited = true
+       }()
        ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(time.Second*10))
        defer cancel()
        poll := time.NewTicker(time.Second / 10)
        defer poll.Stop()
        client := http.Client{}
        for range poll.C {
-               testReq, err := http.NewRequestWithContext(ctx, "GET", url, nil)
+               testReq, err := http.NewRequestWithContext(ctx, "GET", url+"/_health/ping", nil)
+               testReq.Header.Set("Authorization", "Bearer "+configData.Cluster.ManagementToken)
                if err != nil {
                        return nil, err
                }
                resp, err := client.Do(testReq)
                if err == nil {
-                       // Success -- don't need to check the
-                       // response, we just need to know it's
-                       // accepting requests.
                        resp.Body.Close()
-                       break
+                       if resp.StatusCode == http.StatusOK {
+                               break
+                       }
+               }
+               if cmdExited {
+                       return nil, fmt.Errorf("keepstore child process exited")
                }
                if ctx.Err() != nil {
-                       return nil, fmt.Errorf("timed out waiting for new keepstore process to accept a request")
+                       return nil, fmt.Errorf("timed out waiting for new keepstore process to report healthy")
                }
        }
        os.Setenv("ARVADOS_KEEP_SERVICES", url)