16217: Add prometheus metrics.
[arvados.git] / services / ws / router.go
index 77744974d32b7bb45905268461280e8444ca91e6..5f40143fcde0ddc2e33ac027259c3ccdb67ff9f0 100644 (file)
@@ -1,4 +1,8 @@
-package main
+// Copyright (C) The Arvados Authors. All rights reserved.
+//
+// SPDX-License-Identifier: AGPL-3.0
+
+package ws
 
 import (
        "encoding/json"
@@ -9,8 +13,12 @@ import (
        "sync/atomic"
        "time"
 
-       "git.curoverse.com/arvados.git/sdk/go/ctxlog"
-       "github.com/Sirupsen/logrus"
+       "git.arvados.org/arvados.git/lib/cmd"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/ctxlog"
+       "git.arvados.org/arvados.git/sdk/go/health"
+       "github.com/prometheus/client_golang/prometheus"
+       "github.com/sirupsen/logrus"
        "golang.org/x/net/websocket"
 )
 
@@ -22,13 +30,16 @@ type wsConn interface {
 }
 
 type router struct {
-       Config         *wsConfig
+       client         *arvados.Client
+       cluster        *arvados.Cluster
        eventSource    eventSource
        newPermChecker func() permChecker
 
        handler   *handler
        mux       *http.ServeMux
        setupOnce sync.Once
+       done      chan struct{}
+       reg       *prometheus.Registry
 
        lastReqID  int64
        lastReqMtx sync.Mutex
@@ -46,42 +57,63 @@ type debugStatuser interface {
 }
 
+// setup registers the socket-count gauge vector with rtr.reg, builds the
+// event handler from the cluster configuration, and populates rtr.mux with
+// the debug, status, websocket and health routes. CheckHealth invokes it
+// through rtr.setupOnce.
 func (rtr *router) setup() {
+       // One gauge vector, partitioned by the "version" label; the two
+       // websocket endpoints below use label values "0" and "1".
+       mSockets := prometheus.NewGaugeVec(prometheus.GaugeOpts{
+               Namespace: "arvados",
+               Subsystem: "ws",
+               Name:      "sockets",
+               Help:      "Number of connected sockets",
+       }, []string{"version"})
+       rtr.reg.MustRegister(mSockets)
+
        rtr.handler = &handler{
-               PingTimeout: rtr.Config.PingTimeout.Duration(),
-               QueueSize:   rtr.Config.ClientEventQueue,
+               PingTimeout: time.Duration(rtr.cluster.API.SendTimeout),
+               QueueSize:   rtr.cluster.API.WebsocketClientEventQueue,
        }
        rtr.mux = http.NewServeMux()
-       rtr.mux.Handle("/websocket", rtr.makeServer(newSessionV0))
-       rtr.mux.Handle("/arvados/v1/events.ws", rtr.makeServer(newSessionV1))
        rtr.mux.Handle("/debug.json", rtr.jsonHandler(rtr.DebugStatus))
        rtr.mux.Handle("/status.json", rtr.jsonHandler(rtr.Status))
 
-       health := http.NewServeMux()
-       rtr.mux.Handle("/_health/", rtr.mgmtAuth(health))
-       health.Handle("/_health/ping", rtr.jsonHandler(rtr.HealthFunc(func() error { return nil })))
-       health.Handle("/_health/db", rtr.jsonHandler(rtr.HealthFunc(rtr.eventSource.DBHealth)))
+       // Each websocket endpoint gets its own session factory and its own
+       // labelled child of the mSockets gauge vector.
+       rtr.mux.Handle("/websocket", rtr.makeServer(newSessionV0, mSockets.WithLabelValues("0")))
+       rtr.mux.Handle("/arvados/v1/events.ws", rtr.makeServer(newSessionV1, mSockets.WithLabelValues("1")))
+       // Health endpoints are delegated to health.Handler, configured with
+       // the cluster's management token and a single "db" route backed by
+       // the event source's DBHealth.
+       rtr.mux.Handle("/_health/", &health.Handler{
+               Token:  rtr.cluster.ManagementToken,
+               Prefix: "/_health/",
+               Routes: health.Routes{
+                       "db": rtr.eventSource.DBHealth,
+               },
+               Log: func(r *http.Request, err error) {
+                       // Log only when a non-nil error is reported, using
+                       // the logger attached to the request context.
+                       if err != nil {
+                               ctxlog.FromContext(r.Context()).WithError(err).Error("error")
+                       }
+               },
+       })
 }
 
-func (rtr *router) makeServer(newSession sessionFactory) *websocket.Server {
+// makeServer returns a websocket.Server whose handler runs one session,
+// created by newSession, for each incoming connection. gauge is incremented
+// when a connection is accepted and decremented when its handler returns,
+// so it reflects the number of currently connected sockets for this server.
+func (rtr *router) makeServer(newSession sessionFactory, gauge prometheus.Gauge) *websocket.Server {
        return &websocket.Server{
+               // NOTE(review): a Handshake that always returns nil appears to
+               // accept every handshake without an Origin check -- confirm
+               // this is intended.
                Handshake: func(c *websocket.Config, r *http.Request) error {
                        return nil
                },
                Handler: websocket.Handler(func(ws *websocket.Conn) {
                        t0 := time.Now()
-                       log := logger(ws.Request().Context())
-                       log.Info("connected")
+                       logger := ctxlog.FromContext(ws.Request().Context())
+                       logger.Info("connected")
+                       // Use Inc/Dec rather than a separate counter plus
+                       // Set: with add-then-load-then-set, two connections
+                       // can interleave so that a stale count is written
+                       // last. Deferring Dec also keeps the gauge accurate
+                       // if the handler panics.
+                       gauge.Inc()
+                       defer gauge.Dec()
 
-                       stats := rtr.handler.Handle(ws, rtr.eventSource,
+                       stats := rtr.handler.Handle(ws, logger, rtr.eventSource,
                                func(ws wsConn, sendq chan<- interface{}) (session, error) {
-                                       return newSession(ws, sendq, rtr.eventSource.DB(), rtr.newPermChecker(), &rtr.Config.Client)
+                                       return newSession(ws, sendq, rtr.eventSource.DB(), rtr.newPermChecker(), rtr.client)
                                })
 
-                       log.WithFields(logrus.Fields{
+                       logger.WithFields(logrus.Fields{
                                "elapsed": time.Now().Sub(t0).Seconds(),
                                "stats":   stats,
                        }).Info("disconnect")
                        ws.Close()
                }),
        }
 }
@@ -107,24 +139,10 @@ func (rtr *router) DebugStatus() interface{} {
        return s
 }
 
-var pingResponseOK = map[string]string{"health": "OK"}
-
-func (rtr *router) HealthFunc(f func() error) func() interface{} {
-       return func() interface{} {
-               err := f()
-               if err == nil {
-                       return pingResponseOK
-               }
-               return map[string]string{
-                       "health": "ERROR",
-                       "error":  err.Error(),
-               }
-       }
-}
-
+// Status returns the payload served at /status.json: "Clients" is the
+// current value of rtr.status.ReqsActive (incremented and decremented
+// around each request in ServeHTTP), and "Version" is cmd.Version.String().
 func (rtr *router) Status() interface{} {
        return map[string]interface{}{
                "Clients": atomic.LoadInt64(&rtr.status.ReqsActive),
+               "Version": cmd.Version.String(),
        }
 }
 
@@ -134,7 +152,7 @@ func (rtr *router) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
        atomic.AddInt64(&rtr.status.ReqsActive, 1)
        defer atomic.AddInt64(&rtr.status.ReqsActive, -1)
 
-       logger := logger(req.Context()).
+       logger := ctxlog.FromContext(req.Context()).
                WithField("RequestID", rtr.newReqID())
        ctx := ctxlog.Context(req.Context(), logger)
        req = req.WithContext(ctx)
@@ -145,23 +163,9 @@ func (rtr *router) ServeHTTP(resp http.ResponseWriter, req *http.Request) {
        rtr.mux.ServeHTTP(resp, req)
 }
 
-func (rtr *router) mgmtAuth(h http.Handler) http.Handler {
-       return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-               if rtr.Config.ManagementToken == "" {
-                       http.Error(w, "disabled", http.StatusNotFound)
-               } else if ah := r.Header.Get("Authorization"); ah == "" {
-                       http.Error(w, "authorization required", http.StatusUnauthorized)
-               } else if ah != "Bearer "+rtr.Config.ManagementToken {
-                       http.Error(w, "authorization error", http.StatusForbidden)
-               } else {
-                       h.ServeHTTP(w, r)
-               }
-       })
-}
-
 func (rtr *router) jsonHandler(fn func() interface{}) http.Handler {
        return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-               logger := logger(r.Context())
+               logger := ctxlog.FromContext(r.Context())
                w.Header().Set("Content-Type", "application/json")
                enc := json.NewEncoder(w)
                err := enc.Encode(fn())
@@ -172,3 +176,12 @@ func (rtr *router) jsonHandler(fn func() interface{}) http.Handler {
                }
        })
 }
+
+// CheckHealth makes sure setup has run (at most once, via setupOnce) and
+// then returns the result of the event source's DBHealth check.
+func (rtr *router) CheckHealth() error {
+       rtr.setupOnce.Do(rtr.setup)
+       return rtr.eventSource.DBHealth()
+}
+
+// Done returns rtr.done as a receive-only channel.
+// NOTE(review): presumably this channel is closed when the router stops
+// serving -- nothing in this view closes it, so confirm against the code
+// that owns rtr.done.
+func (rtr *router) Done() <-chan struct{} {
+       return rtr.done
+}