1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: Apache-2.0
20 "git.arvados.org/arvados.git/lib/cmd"
21 "git.arvados.org/arvados.git/lib/config"
22 "git.arvados.org/arvados.git/sdk/go/arvados"
23 "git.arvados.org/arvados.git/sdk/go/auth"
24 "git.arvados.org/arvados.git/sdk/go/ctxlog"
25 "github.com/ghodss/yaml"
26 "github.com/sirupsen/logrus"
// defaultTimeout is the two-second timeout that setup assigns to
// Aggregator.timeout and that the check command uses as the default
// value of its -timeout flag.
29 const defaultTimeout = arvados.Duration(2 * time.Second)
31 // Aggregator implements service.Handler. It handles "GET /_health/all"
32 // by checking the health of all configured services on the cluster
33 // and responding 200 if everything is healthy.
34 type Aggregator struct {
// httpClient sends the health check requests issued by ping. It is
// created by setup.
36 httpClient *http.Client
// timeout bounds each individual ping request; ping converts it to
// a time.Duration for context.WithTimeout.
37 timeout arvados.Duration
// Cluster is the cluster configuration. Its Services are the ones
// checked by ClusterHealth, its ManagementToken is used both to
// authorize incoming requests and to authenticate outgoing pings,
// and its TLS.Insecure setting controls certificate verification.
39 Cluster *arvados.Cluster
41 // If non-nil, Log is called after handling each request.
42 Log func(*http.Request, error)
// setup initializes the Aggregator's private state: it builds the
// HTTP client used for health checks and assigns the default timeout.
// ServeHTTP and ClusterHealth invoke it through agg.setupOnce.
//
// NOTE(review): the condition guarding the timeout assignment is not
// visible in this excerpt; the existing comment below suggests the
// assignment is conditional — confirm.
45 func (agg *Aggregator) setup() {
46 agg.httpClient = &http.Client{
47 Transport: &http.Transport{
48 TLSClientConfig: &tls.Config{
// Certificate verification is skipped only when the cluster
// configuration sets TLS.Insecure.
49 InsecureSkipVerify: agg.Cluster.TLS.Insecure,
54 // this is always the case, except in the test suite
55 agg.timeout = defaultTimeout
// CheckHealth reports the Aggregator's own health as an error value.
// NOTE(review): the body is not visible in this excerpt; presumably
// this is part of the service.Handler interface — confirm.
59 func (agg *Aggregator) CheckHealth() error {
// Done returns a receive-only channel of struct{}.
// NOTE(review): the body is not visible in this excerpt; presumably
// this is part of the service.Handler interface — confirm what (if
// anything) ever closes the returned channel.
63 func (agg *Aggregator) Done() <-chan struct{} {
// ServeHTTP handles health requests. After running the one-time
// setup it marks the response as JSON and rejects requests that fail
// checkAuth with 401 Unauthorized. Authorized requests for
// "/_health/all" receive the JSON-encoded result of ClusterHealth,
// and requests for "/_health/ping" receive healthyBody. The 404 Not
// Found response below appears to cover every other path.
//
// NOTE(review): some lines of this function (including the branch
// around the 404 response) are not visible in this excerpt.
67 func (agg *Aggregator) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
68 agg.setupOnce.Do(agg.setup)
// sendErr writes statusCode followed by a JSON object whose "error"
// key holds err's message.
69 sendErr := func(statusCode int, err error) {
70 resp.WriteHeader(statusCode)
71 json.NewEncoder(resp).Encode(map[string]string{"error": err.Error()})
// Set before any body is written, so it applies to success and error
// responses alike.
77 resp.Header().Set("Content-Type", "application/json")
79 if !agg.checkAuth(req) {
80 sendErr(http.StatusUnauthorized, errUnauthorized)
83 if req.URL.Path == "/_health/all" {
84 json.NewEncoder(resp).Encode(agg.ClusterHealth())
85 } else if req.URL.Path == "/_health/ping" {
86 resp.Write(healthyBody)
88 sendErr(http.StatusNotFound, errNotFound)
// ClusterHealthResponse is the value returned by ClusterHealth. It is
// what ServeHTTP JSON-encodes for "/_health/all" and what the check
// command marshals to YAML.
96 type ClusterHealthResponse struct {
97 // "OK" if all needed services are OK, otherwise "ERROR".
98 Health string `json:"health"`
100 // An entry for each known health check of each known instance
101 // of each needed component: "instance of service S on node N
102 // reports health-check C is OK."
103 Checks map[string]CheckResult `json:"checks"`
105 // An entry for each service type: "service S is OK." This
106 // exposes problems that can't be expressed in Checks, like
107 // "service S is needed, but isn't configured to run
109 Services map[arvados.ServiceName]ServiceHealth `json:"services"`
// CheckResult is the outcome of a single health check, as produced by
// ping and stored in ClusterHealthResponse.Checks.
112 type CheckResult struct {
// Health is "OK" for a passing check; ping starts every result at
// "ERROR".
113 Health string `json:"health"`
// Error describes why the check failed; omitted from JSON when empty.
114 Error string `json:"error,omitempty"`
// HTTPStatusCode and HTTPStatusText are copied from the ping
// response's StatusCode and Status fields.
115 HTTPStatusCode int `json:",omitempty"`
116 HTTPStatusText string `json:",omitempty"`
// Response is the JSON object decoded from the ping response body.
117 Response map[string]interface{} `json:"response"`
// ResponseTime is the elapsed time of the check in seconds,
// formatted with six decimal places.
118 ResponseTime json.Number `json:"responseTime"`
// ServiceHealth is the per-service summary stored in
// ClusterHealthResponse.Services.
121 type ServiceHealth struct {
// Health is initialized to "ERROR" by ClusterHealth; any value other
// than "OK" makes the overall cluster health "ERROR".
122 Health string `json:"health"`
// ClusterHealth checks every service in agg.Cluster.Services.Map()
// and returns the combined result.
//
// Each service gets an entry in resp.Services, initially "ERROR". The
// URLs checked are the service's InternalURLs, or its ExternalURL
// when there are no internal URLs and the external host is non-empty.
// Each URL is pinged in its own goroutine and the result is stored in
// resp.Checks under the key "<service name>+<ping URL>". After the
// checks, any service whose health is not "OK" makes the overall
// health "ERROR".
//
// NOTE(review): resp.Checks and resp.Services are written from the
// per-URL goroutines. The synchronization that guards those writes,
// and the wait on wg, are not visible in this excerpt — confirm both
// are present.
126 func (agg *Aggregator) ClusterHealth() ClusterHealthResponse {
127 agg.setupOnce.Do(agg.setup)
128 resp := ClusterHealthResponse{
130 Checks: make(map[string]CheckResult),
131 Services: make(map[arvados.ServiceName]ServiceHealth),
// wg tracks the per-URL check goroutines started below.
135 wg := sync.WaitGroup{}
136 for svcName, svc := range agg.Cluster.Services.Map() {
137 // Ensure svc is listed in resp.Services.
139 if _, ok := resp.Services[svcName]; !ok {
140 resp.Services[svcName] = ServiceHealth{Health: "ERROR"}
// Collect the URLs to check as a set: all internal URLs, or the
// external URL as a fallback when no internal URL is configured.
144 checkURLs := map[arvados.URL]bool{}
145 for addr := range svc.InternalURLs {
146 checkURLs[addr] = true
148 if len(checkURLs) == 0 && svc.ExternalURL.Host != "" {
149 checkURLs[svc.ExternalURL] = true
151 for addr := range checkURLs {
// svcName and addr are passed as arguments so each goroutine works
// on its own copies of the loop variables.
153 go func(svcName arvados.ServiceName, addr arvados.URL) {
155 var result CheckResult
156 pingURL, err := agg.pingURL(addr)
158 result = CheckResult{
163 result = agg.ping(pingURL)
168 resp.Checks[fmt.Sprintf("%s+%s", svcName, pingURL)] = result
// A passing check updates the service's entry; the "ERROR" assignment
// below appears to be the failing-check path (branch not fully
// visible in this excerpt).
169 if result.Health == "OK" {
170 h := resp.Services[svcName]
173 resp.Services[svcName] = h
175 resp.Health = "ERROR"
182 // Report ERROR if a needed service didn't fail any checks
183 // merely because it isn't configured to run anywhere.
184 for _, sh := range resp.Services {
185 if sh.Health != "OK" {
186 resp.Health = "ERROR"
// pingURL returns the health check URL for the service at svcURL: the
// reference "/_health/ping" resolved against svcURL. It returns any
// error reported by url.URL.Parse.
193 func (agg *Aggregator) pingURL(svcURL arvados.URL) (*url.URL, error) {
// arvados.URL is converted to a url.URL so that Parse can be called
// on it.
194 base := url.URL(svcURL)
195 return base.Parse("/_health/ping")
// ping sends a GET request to target and reports the outcome as a
// CheckResult. The request carries the cluster's management token as
// a Bearer credential and is limited by agg.timeout.
//
// The result starts as "ERROR". result.Error is filled in when the
// request cannot be built or sent, when the response body cannot be
// decoded as JSON, when the status is not 200, or when the decoded
// "health" value is not "OK". ResponseTime is the elapsed time in
// seconds with six decimal places.
//
// NOTE(review): no resp.Body.Close() call is visible in this excerpt.
// If none exists, the response body is never closed — confirm.
//
// NOTE(review): several lines of this function (early returns, the
// success branch, and the use of the "error" field from the response)
// are not visible in this excerpt.
198 func (agg *Aggregator) ping(target *url.URL) (result CheckResult) {
201 result.ResponseTime = json.Number(fmt.Sprintf("%.6f", time.Since(t0).Seconds()))
// Assume failure until the response has been fully validated.
203 result.Health = "ERROR"
205 req, err := http.NewRequest("GET", target.String(), nil)
207 result.Error = err.Error()
210 req.Header.Set("Authorization", "Bearer "+agg.Cluster.ManagementToken)
212 // Avoid workbench1's redirect-http-to-https feature
213 req.Header.Set("X-Forwarded-Proto", "https")
// Bound the whole request by the configured timeout.
215 ctx, cancel := context.WithTimeout(req.Context(), time.Duration(agg.timeout))
217 req = req.WithContext(ctx)
218 resp, err := agg.httpClient.Do(req)
220 result.Error = err.Error()
223 result.HTTPStatusCode = resp.StatusCode
224 result.HTTPStatusText = resp.Status
225 err = json.NewDecoder(resp.Body).Decode(&result.Response)
227 result.Error = fmt.Sprintf("cannot decode response: %s", err)
228 } else if resp.StatusCode != http.StatusOK {
// NOTE(review): resp.Status normally already includes the numeric
// code (e.g. "503 Service Unavailable"), so this message likely
// repeats the code — confirm whether that is intended.
229 result.Error = fmt.Sprintf("HTTP %d %s", resp.StatusCode, resp.Status)
// A missing or non-string "health" value leaves h empty, which is
// treated the same as an unhealthy response.
230 } else if h, _ := result.Response["health"].(string); h != "OK" {
231 if e, ok := result.Response["error"].(string); ok && e != "" {
235 result.Error = fmt.Sprintf("health=%q in ping response", h)
// checkAuth examines the tokens that auth.CredentialsFromRequest
// extracts from req, looking for a non-empty token equal to
// agg.Cluster.ManagementToken. ServeHTTP responds 401 when this
// returns false.
//
// NOTE(review): the return statements are not visible in this
// excerpt; presumably a match returns true and anything else false.
243 func (agg *Aggregator) checkAuth(req *http.Request) bool {
244 creds := auth.CredentialsFromRequest(req)
245 for _, token := range creds.Tokens {
// The empty-token test ensures an empty token never matches, even if
// the management token is itself empty.
246 if token != "" && token == agg.Cluster.ManagementToken {
// errSilent is a sentinel error with an empty message. RunCommand
// does not print it.
253 var errSilent = errors.New("")
// CheckCommand is the cmd.Handler for the health check command,
// implemented by checkCommand.
255 var CheckCommand cmd.Handler = checkCommand{}
// checkCommand is a stateless type carrying the RunCommand and run
// methods.
257 type checkCommand struct{}
// RunCommand runs the health check command. It creates a JSON logger
// on stderr at "info" level, attaches it to a background context, and
// calls ccmd.run. Any error other than errSilent is printed to
// stdout.
//
// NOTE(review): the statements that produce the int return value are
// not visible in this excerpt. Also note that the error message goes
// to stdout rather than stderr — confirm this is intended.
259 func (ccmd checkCommand) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
260 logger := ctxlog.New(stderr, "json", "info")
261 ctx := ctxlog.Context(context.Background(), logger)
262 err := ccmd.run(ctx, prog, args, stdin, stdout, stderr)
264 if err != errSilent {
265 fmt.Fprintln(stdout, err.Error())
// run implements the health check command. It parses flags (the
// config loader's flags plus -version and -timeout), loads the
// configuration and its cluster, runs Aggregator.ClusterHealth with
// the requested timeout, and marshals the response to YAML. It
// returns a "health check failed" error when the overall health is
// not "OK".
//
// NOTE(review): the error-handling branches, the write of the YAML
// output, and the end of the function are not visible in this
// excerpt.
272 func (ccmd checkCommand) run(ctx context.Context, prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) error {
273 flags := flag.NewFlagSet("", flag.ContinueOnError)
274 flags.SetOutput(stderr)
// The loader reads configuration from stdin when asked to, and logs
// in text format to stderr.
275 loader := config.NewLoader(stdin, ctxlog.New(stderr, "text", "info"))
276 loader.SetupFlags(flags)
277 versionFlag := flags.Bool("version", false, "Write version information to stdout and exit 0")
278 timeout := flags.Duration("timeout", defaultTimeout.Duration(), "Maximum time to wait for health responses")
279 if ok, _ := cmd.ParseFlags(flags, prog, args, "", stderr); !ok {
280 // cmd.ParseFlags already reported the error
282 } else if *versionFlag {
283 cmd.Version.RunCommand(prog, args, stdin, stdout, stderr)
286 cfg, err := loader.Load()
// An empty cluster ID is passed to GetCluster.
290 cluster, err := cfg.GetCluster("")
// Replace the bootstrap logger with one that follows the cluster's
// logging configuration and tags entries with the cluster ID.
294 logger := ctxlog.New(stderr, cluster.SystemLogs.Format, cluster.SystemLogs.LogLevel).WithFields(logrus.Fields{
295 "ClusterID": cluster.ClusterID,
297 ctx = ctxlog.Context(ctx, logger)
// Setting timeout here gives the aggregator the -timeout flag's value.
298 agg := Aggregator{Cluster: cluster, timeout: arvados.Duration(*timeout)}
299 resp := agg.ClusterHealth()
300 buf, err := yaml.Marshal(resp)
305 if resp.Health != "OK" {
306 return fmt.Errorf("health check failed")