15026: Add cloudtest command.
[arvados.git] / lib / dispatchcloud / worker / pool.go
index 0ee36a96ff1d23d3c27e48679dba4b31007299f4..e00a8683a1c59aab03d3f0d066b72f9a7715dea2 100644 (file)
@@ -169,6 +169,7 @@ type Pool struct {
        mInstancesPrice    *prometheus.GaugeVec
        mVCPUs             *prometheus.GaugeVec
        mMemory            *prometheus.GaugeVec
+       mDisappearances    *prometheus.CounterVec
 }
 
 type createCall struct {
@@ -291,7 +292,7 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
                        wp.tagKeyPrefix + tagKeyIdleBehavior:   string(IdleBehaviorRun),
                        wp.tagKeyPrefix + tagKeyInstanceSecret: secret,
                }
-               initCmd := cloud.InitCommand(fmt.Sprintf("umask 0177 && echo -n %q >%s", secret, instanceSecretFilename))
+               initCmd := TagVerifier{nil, secret}.InitCommand()
                inst, err := wp.instanceSet.Create(it, wp.imageID, tags, initCmd, wp.installPublicKey)
                wp.mtx.Lock()
                defer wp.mtx.Unlock()
@@ -345,7 +346,7 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
 // Caller must have lock.
 func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*worker, bool) {
        secret := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceSecret]
-       inst = tagVerifier{inst, secret}
+       inst = TagVerifier{inst, secret}
        id := inst.ID()
        if wkr := wp.workers[id]; wkr != nil {
                wkr.executor.SetTarget(inst)
@@ -556,6 +557,16 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                Help:      "Total memory on all cloud VMs.",
        }, []string{"category"})
        reg.MustRegister(wp.mMemory)
+       wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "instances_disappeared",
+               Help:      "Number of occurrences of an instance disappearing from the cloud provider's list of instances.",
+       }, []string{"state"})
+       for _, v := range stateString {
+               wp.mDisappearances.WithLabelValues(v).Add(0)
+       }
+       reg.MustRegister(wp.mDisappearances)
 }
 
 func (wp *Pool) runMetrics() {
@@ -778,6 +789,9 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
                        "WorkerState": wkr.state,
                })
                logger.Info("instance disappeared in cloud")
+               if wp.mDisappearances != nil {
+                       wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
+               }
                delete(wp.workers, id)
                go wkr.Close()
                notify = true