15340: Add metrics for cloud ops/errors and instance disappearances.
author Tom Clegg <tclegg@veritasgenetics.com>
Fri, 14 Jun 2019 19:27:30 +0000 (15:27 -0400)
committer Tom Clegg <tclegg@veritasgenetics.com>
Fri, 14 Jun 2019 19:27:30 +0000 (15:27 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

lib/dispatchcloud/dispatcher.go
lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/driver.go
lib/dispatchcloud/worker/pool.go

index bc699d92804092d8dbbc37bdcd3d8180b67e70c1..12c60ecb11177871a4b5230adddd0e313898270c 100644 (file)
@@ -132,12 +132,12 @@ func (disp *dispatcher) initialize() {
                disp.sshKey = key
        }
 
-       instanceSet, err := newInstanceSet(disp.Cluster, disp.InstanceSetID, disp.logger)
+       disp.reg = prometheus.NewRegistry()
+       instanceSet, err := newInstanceSet(disp.Cluster, disp.InstanceSetID, disp.logger, disp.reg)
        if err != nil {
                disp.logger.Fatalf("error initializing driver: %s", err)
        }
        disp.instanceSet = instanceSet
-       disp.reg = prometheus.NewRegistry()
        disp.pool = worker.NewPool(disp.logger, disp.ArvClient, disp.reg, disp.InstanceSetID, disp.instanceSet, disp.newExecutor, disp.sshKey.PublicKey(), disp.Cluster)
        disp.queue = container.NewQueue(disp.logger, disp.reg, disp.typeChooser, disp.ArvClient)
 
index 012621f12f633fe9c352e2f6bb847dadb965a59d..6b73e71ccd4267c5e9e6f13749499d8feb3f1a6f 100644 (file)
@@ -49,6 +49,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
        }
 
        s.cluster = &arvados.Cluster{
+               ManagementToken: "test-management-token",
                Containers: arvados.ContainersConfig{
                        DispatchPrivateKey: string(dispatchprivraw),
                        StaleLockTimeout:   arvados.Duration(5 * time.Millisecond),
@@ -193,6 +194,18 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
                        c.Fatalf("timed out with %d containers (%v), %d instances (%+v)", len(ents), ents, len(insts), insts)
                }
        }
+
+       req := httptest.NewRequest("GET", "/metrics", nil)
+       req.Header.Set("Authorization", "Bearer "+s.cluster.ManagementToken)
+       resp := httptest.NewRecorder()
+       s.disp.ServeHTTP(resp, req)
+       c.Check(resp.Code, check.Equals, http.StatusOK)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="0",operation="Create"} [^0].*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="0",operation="List"} [^0].*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="1",operation="Create"} [^0].*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="1",operation="List"} 0\n.*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="shutdown"} [^0].*`)
+       c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="unknown"} 0\n.*`)
 }
 
 func (s *DispatcherSuite) TestAPIPermissions(c *check.C) {
index b67b5d054b57d172b940255a8318b76dd21af3b8..a8f3d5b5edce06411d7001809caf05ab51e49846 100644 (file)
@@ -12,6 +12,7 @@ import (
        "git.curoverse.com/arvados.git/lib/cloud/azure"
        "git.curoverse.com/arvados.git/lib/cloud/ec2"
        "git.curoverse.com/arvados.git/sdk/go/arvados"
+       "github.com/prometheus/client_golang/prometheus"
        "github.com/sirupsen/logrus"
        "golang.org/x/crypto/ssh"
 )
@@ -21,13 +22,14 @@ var drivers = map[string]cloud.Driver{
        "ec2":   ec2.Driver,
 }
 
-func newInstanceSet(cluster *arvados.Cluster, setID cloud.InstanceSetID, logger logrus.FieldLogger) (cloud.InstanceSet, error) {
+func newInstanceSet(cluster *arvados.Cluster, setID cloud.InstanceSetID, logger logrus.FieldLogger, reg *prometheus.Registry) (cloud.InstanceSet, error) {
        driver, ok := drivers[cluster.Containers.CloudVMs.Driver]
        if !ok {
                return nil, fmt.Errorf("unsupported cloud driver %q", cluster.Containers.CloudVMs.Driver)
        }
        sharedResourceTags := cloud.SharedResourceTags(cluster.Containers.CloudVMs.ResourceTags)
        is, err := driver.InstanceSet(cluster.Containers.CloudVMs.DriverParameters, setID, sharedResourceTags, logger)
+       is = newInstrumentedInstanceSet(is, reg)
        if maxops := cluster.Containers.CloudVMs.MaxCloudOpsPerSecond; maxops > 0 {
                is = rateLimitedInstanceSet{
                        InstanceSet: is,
@@ -113,3 +115,65 @@ nextInstance:
        }).WithError(err).Debugf("filteringInstanceSet returning instances")
        return returning, err
 }
+
+func newInstrumentedInstanceSet(is cloud.InstanceSet, reg *prometheus.Registry) cloud.InstanceSet {
+       cv := prometheus.NewCounterVec(prometheus.CounterOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "driver_operations",
+               Help:      "Number of instance-create/destroy/list operations performed via cloud driver.",
+       }, []string{"operation", "error"})
+
+       // Create all counters, so they are reported with zero values
+       // (instead of being missing) until they are incremented.
+       for _, op := range []string{"Create", "List", "Destroy", "SetTags"} {
+               for _, error := range []string{"0", "1"} {
+                       cv.WithLabelValues(op, error).Add(0)
+               }
+       }
+
+       reg.MustRegister(cv)
+       return instrumentedInstanceSet{is, cv}
+}
+
+// instrumentedInstanceSet wraps a cloud.InstanceSet and counts its
+// Create and Instances calls in cv, labeled by operation name and by
+// whether the call returned an error. Methods not overridden here
+// are passed through to the embedded InstanceSet uncounted.
+type instrumentedInstanceSet struct {
+       cloud.InstanceSet
+       cv *prometheus.CounterVec
+}
+
+// Create calls the wrapped InstanceSet's Create method and
+// increments the "Create" counter, labeled according to whether an
+// error was returned.
+//
+// A non-nil instance is returned wrapped in an instrumentedInstance
+// so that its Destroy and SetTags calls are counted too. A nil
+// instance is returned as a nil interface value instead of a non-nil
+// wrapper around nil, whose methods would panic when called.
+func (is instrumentedInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID, tags cloud.InstanceTags, init cloud.InitCommand, pk ssh.PublicKey) (cloud.Instance, error) {
+       inst, err := is.InstanceSet.Create(it, image, tags, init, pk)
+       is.cv.WithLabelValues("Create", boolLabelValue(err != nil)).Inc()
+       if inst == nil {
+               return nil, err
+       }
+       return instrumentedInstance{inst, is.cv}, err
+}
+
+// Instances calls the wrapped InstanceSet's Instances method and
+// increments the "List" counter, labeled according to whether an
+// error was returned.
+//
+// Each returned instance is wrapped in an instrumentedInstance;
+// otherwise Destroy and SetTags calls made on instances obtained by
+// listing would bypass the counters.
+func (is instrumentedInstanceSet) Instances(tags cloud.InstanceTags) ([]cloud.Instance, error) {
+       instances, err := is.InstanceSet.Instances(tags)
+       is.cv.WithLabelValues("List", boolLabelValue(err != nil)).Inc()
+       if len(instances) == 0 {
+               // Nothing to wrap: hand back the driver's nil or
+               // empty slice unchanged.
+               return instances, err
+       }
+       wrapped := make([]cloud.Instance, len(instances))
+       for i, inst := range instances {
+               wrapped[i] = instrumentedInstance{inst, is.cv}
+       }
+       return wrapped, err
+}
+
+// instrumentedInstance wraps a cloud.Instance and counts its Destroy
+// and SetTags calls in cv, labeled by operation name and by whether
+// the call returned an error. Methods not overridden here are passed
+// through to the embedded Instance uncounted.
+type instrumentedInstance struct {
+       cloud.Instance
+       cv *prometheus.CounterVec
+}
+
+// Destroy forwards to the wrapped instance's Destroy method, then
+// increments the "Destroy" counter for the resulting outcome. The
+// wrapped call's error is returned unchanged.
+func (inst instrumentedInstance) Destroy() error {
+       err := inst.Instance.Destroy()
+       failed := boolLabelValue(err != nil)
+       inst.cv.WithLabelValues("Destroy", failed).Inc()
+       return err
+}
+
+// SetTags forwards to the wrapped instance's SetTags method, then
+// increments the "SetTags" counter for the resulting outcome. The
+// wrapped call's error is returned unchanged.
+func (inst instrumentedInstance) SetTags(tags cloud.InstanceTags) error {
+       err := inst.Instance.SetTags(tags)
+       failed := boolLabelValue(err != nil)
+       inst.cv.WithLabelValues("SetTags", failed).Inc()
+       return err
+}
+
+func boolLabelValue(v bool) string {
+       if v {
+               return "1"
+       } else {
+               return "0"
+       }
+}
index 0ee36a96ff1d23d3c27e48679dba4b31007299f4..8616d6e9a5d9de42d3b36c966f44362d2b632fbd 100644 (file)
@@ -169,6 +169,7 @@ type Pool struct {
        mInstancesPrice    *prometheus.GaugeVec
        mVCPUs             *prometheus.GaugeVec
        mMemory            *prometheus.GaugeVec
+       mDisappearances    *prometheus.CounterVec
 }
 
 type createCall struct {
@@ -556,6 +557,16 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                Help:      "Total memory on all cloud VMs.",
        }, []string{"category"})
        reg.MustRegister(wp.mMemory)
+       wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "instances_disappeared",
+               Help:      "Number of occurrences of an instance disappearing from the cloud provider's list of instances.",
+       }, []string{"state"})
+       for _, v := range stateString {
+               wp.mDisappearances.WithLabelValues(v).Add(0)
+       }
+       reg.MustRegister(wp.mDisappearances)
 }
 
 func (wp *Pool) runMetrics() {
@@ -778,6 +789,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
                        "WorkerState": wkr.state,
                })
                logger.Info("instance disappeared in cloud")
+               wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
                delete(wp.workers, id)
                go wkr.Close()
                notify = true