15345: Add .../containers/kill management API to dispatcher.

[arvados.git] / lib / dispatchcloud / worker / pool.go
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go

index 81a658535ea593a7a0a0c2d9231fd66f3055bdbd..97ca7f60a2a916fd6feece03e016c7fe3673e22d 100644 (file)
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -25,6 +25,7 @@ const (
         tagKeyInstanceType   = "InstanceType"
         tagKeyIdleBehavior   = "IdleBehavior"
         tagKeyInstanceSecret = "InstanceSecret"
+       tagKeyInstanceSetID  = "InstanceSetID"
  )
  
  // An InstanceView shows a worker's current state and recent activity.
@@ -91,25 +92,27 @@ func duration(conf arvados.Duration, def time.Duration) time.Duration {
  //
  // New instances are configured and set up according to the given
  // cluster configuration.
-func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *prometheus.Registry, instanceSet cloud.InstanceSet, newExecutor func(cloud.Instance) Executor, installPublicKey ssh.PublicKey, cluster *arvados.Cluster) *Pool {
+func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *prometheus.Registry, instanceSetID cloud.InstanceSetID, instanceSet cloud.InstanceSet, newExecutor func(cloud.Instance) Executor, installPublicKey ssh.PublicKey, cluster *arvados.Cluster) *Pool {
         wp := &Pool{
                 logger:             logger,
                 arvClient:          arvClient,
+               instanceSetID:      instanceSetID,
                 instanceSet:        &throttledInstanceSet{InstanceSet: instanceSet},
                 newExecutor:        newExecutor,
-               bootProbeCommand:   cluster.CloudVMs.BootProbeCommand,
-               imageID:            cloud.ImageID(cluster.CloudVMs.ImageID),
+               bootProbeCommand:   cluster.Containers.CloudVMs.BootProbeCommand,
+               imageID:            cloud.ImageID(cluster.Containers.CloudVMs.ImageID),
                 instanceTypes:      cluster.InstanceTypes,
-               maxProbesPerSecond: cluster.Dispatch.MaxProbesPerSecond,
-               probeInterval:      duration(cluster.Dispatch.ProbeInterval, defaultProbeInterval),
-               syncInterval:       duration(cluster.CloudVMs.SyncInterval, defaultSyncInterval),
-               timeoutIdle:        duration(cluster.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
-               timeoutBooting:     duration(cluster.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
-               timeoutProbe:       duration(cluster.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
-               timeoutShutdown:    duration(cluster.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
-               timeoutTERM:        duration(cluster.Dispatch.TimeoutTERM, defaultTimeoutTERM),
-               timeoutSignal:      duration(cluster.Dispatch.TimeoutSignal, defaultTimeoutSignal),
+               maxProbesPerSecond: cluster.Containers.CloudVMs.MaxProbesPerSecond,
+               probeInterval:      duration(cluster.Containers.CloudVMs.ProbeInterval, defaultProbeInterval),
+               syncInterval:       duration(cluster.Containers.CloudVMs.SyncInterval, defaultSyncInterval),
+               timeoutIdle:        duration(cluster.Containers.CloudVMs.TimeoutIdle, defaultTimeoutIdle),
+               timeoutBooting:     duration(cluster.Containers.CloudVMs.TimeoutBooting, defaultTimeoutBooting),
+               timeoutProbe:       duration(cluster.Containers.CloudVMs.TimeoutProbe, defaultTimeoutProbe),
+               timeoutShutdown:    duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
+               timeoutTERM:        duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
+               timeoutSignal:      duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
                 installPublicKey:   installPublicKey,
+               tagKeyPrefix:       cluster.Containers.CloudVMs.TagKeyPrefix,
                 stop:               make(chan bool),
         }
         wp.registerMetrics(reg)
@@ -128,6 +131,7 @@ type Pool struct {
         // configuration
         logger             logrus.FieldLogger
         arvClient          *arvados.Client
+       instanceSetID      cloud.InstanceSetID
         instanceSet        *throttledInstanceSet
         newExecutor        func(cloud.Instance) Executor
         bootProbeCommand   string
@@ -143,13 +147,14 @@ type Pool struct {
         timeoutTERM        time.Duration
         timeoutSignal      time.Duration
         installPublicKey   ssh.PublicKey
+       tagKeyPrefix       string
  
         // private state
         subscribers  map[<-chan struct{}]chan<- struct{}
         creating     map[string]createCall // unfinished (cloud.InstanceSet)Create calls (key is instance secret)
         workers      map[cloud.InstanceID]*worker
         loaded       bool                 // loaded list of instances from InstanceSet at least once
-       exited       map[string]time.Time // containers whose crunch-run proc has exited, but KillContainer has not been called
+       exited       map[string]time.Time // containers whose crunch-run proc has exited, but ForgetContainer has not been called
         atQuotaUntil time.Time
         atQuotaErr   cloud.QuotaError
         stop         chan bool
@@ -164,6 +169,7 @@ type Pool struct {
         mInstancesPrice    *prometheus.GaugeVec
         mVCPUs             *prometheus.GaugeVec
         mMemory            *prometheus.GaugeVec
+       mDisappearances    *prometheus.CounterVec
  }
  
  type createCall struct {
@@ -281,9 +287,10 @@ func (wp *Pool) Create(it arvados.InstanceType) bool {
         go func() {
                 defer wp.notify()
                 tags := cloud.InstanceTags{
-                       tagKeyInstanceType:   it.Name,
-                       tagKeyIdleBehavior:   string(IdleBehaviorRun),
-                       tagKeyInstanceSecret: secret,
+                       wp.tagKeyPrefix + tagKeyInstanceSetID:  string(wp.instanceSetID),
+                       wp.tagKeyPrefix + tagKeyInstanceType:   it.Name,
+                       wp.tagKeyPrefix + tagKeyIdleBehavior:   string(IdleBehaviorRun),
+                       wp.tagKeyPrefix + tagKeyInstanceSecret: secret,
                 }
                 initCmd := cloud.InitCommand(fmt.Sprintf("umask 0177 && echo -n %q >%s", secret, instanceSecretFilename))
                 inst, err := wp.instanceSet.Create(it, wp.imageID, tags, initCmd, wp.installPublicKey)
@@ -338,7 +345,8 @@ func (wp *Pool) SetIdleBehavior(id cloud.InstanceID, idleBehavior IdleBehavior)
  //
  // Caller must have lock.
  func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*worker, bool) {
-       inst = tagVerifier{inst}
+       secret := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceSecret]
+       inst = tagVerifier{inst, secret}
         id := inst.ID()
         if wkr := wp.workers[id]; wkr != nil {
                 wkr.executor.SetTarget(inst)
@@ -349,7 +357,7 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*wor
         }
  
         state := StateUnknown
-       if _, ok := wp.creating[inst.Tags()[tagKeyInstanceSecret]]; ok {
+       if _, ok := wp.creating[secret]; ok {
                 state = StateBooting
         }
  
@@ -359,7 +367,7 @@ func (wp *Pool) updateWorker(inst cloud.Instance, it arvados.InstanceType) (*wor
         // process); otherwise, default to "run". After this,
         // wkr.idleBehavior is the source of truth, and will only be
         // changed via SetIdleBehavior().
-       idleBehavior := IdleBehavior(inst.Tags()[tagKeyIdleBehavior])
+       idleBehavior := IdleBehavior(inst.Tags()[wp.tagKeyPrefix+tagKeyIdleBehavior])
         if !validIdleBehavior[idleBehavior] {
                 idleBehavior = IdleBehaviorRun
         }
@@ -438,7 +446,7 @@ func (wp *Pool) CountWorkers() map[State]int {
  // In the returned map, the time value indicates when the Pool
  // observed that the container process had exited. A container that
  // has not yet exited has a zero time value. The caller should use
-// KillContainer() to garbage-collect the entries for exited
+// ForgetContainer() to garbage-collect the entries for exited
  // containers.
  func (wp *Pool) Running() map[string]time.Time {
         wp.setupOnce.Do(wp.setup)
@@ -485,18 +493,15 @@ func (wp *Pool) StartContainer(it arvados.InstanceType, ctr arvados.Container) b
  //
  // KillContainer returns immediately; the act of killing the container
  // takes some time, and runs in the background.
-func (wp *Pool) KillContainer(uuid string, reason string) {
+//
+// KillContainer returns false if the container has already ended.
+func (wp *Pool) KillContainer(uuid string, reason string) bool {
         wp.mtx.Lock()
         defer wp.mtx.Unlock()
         logger := wp.logger.WithFields(logrus.Fields{
                 "ContainerUUID": uuid,
                 "Reason":        reason,
         })
-       if _, ok := wp.exited[uuid]; ok {
-               logger.Debug("clearing placeholder for exited crunch-run process")
-               delete(wp.exited, uuid)
-               return
-       }
         for _, wkr := range wp.workers {
                 rr := wkr.running[uuid]
                 if rr == nil {
@@ -504,10 +509,30 @@ func (wp *Pool) KillContainer(uuid string, reason string) {
                 }
                 if rr != nil {
                         rr.Kill(reason)
-                       return
+                       return true
                 }
         }
         logger.Debug("cannot kill: already disappeared")
+       return false
+}
+
+// ForgetContainer drops the "exited" placeholder recorded for the
+// given container UUID, after which Running() no longer reports
+// that container.
+//
+// If there is no placeholder for uuid (e.g., the container has not
+// exited yet), ForgetContainer does nothing.
+//
+// The placeholder records the time at which the container's
+// crunch-run process was seen to have exited. It lets the caller
+// (scheduler) tell apart a container that exited without finalizing
+// its state and a container that exited so recently that its final
+// state has not yet shown up in the scheduler's queue cache.
+func (wp *Pool) ForgetContainer(uuid string) {
+       wp.mtx.Lock()
+       defer wp.mtx.Unlock()
+       _, found := wp.exited[uuid]
+       if !found {
+               // Nothing recorded for this container: nothing to forget.
+               return
+       }
+       logger := wp.logger.WithField("ContainerUUID", uuid)
+       logger.Debug("clearing placeholder for exited crunch-run process")
+       delete(wp.exited, uuid)
  }
  
  func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
@@ -549,6 +574,16 @@ func (wp *Pool) registerMetrics(reg *prometheus.Registry) {
                 Help:      "Total memory on all cloud VMs.",
         }, []string{"category"})
         reg.MustRegister(wp.mMemory)
+       wp.mDisappearances = prometheus.NewCounterVec(prometheus.CounterOpts{
+               Namespace: "arvados",
+               Subsystem: "dispatchcloud",
+               Name:      "instances_disappeared",
+               Help:      "Number of occurrences of an instance disappearing from the cloud provider's list of instances.",
+       }, []string{"state"})
+       for _, v := range stateString {
+               wp.mDisappearances.WithLabelValues(v).Add(0)
+       }
+       reg.MustRegister(wp.mDisappearances)
  }
  
  func (wp *Pool) runMetrics() {
@@ -691,6 +726,18 @@ func (wp *Pool) Instances() []InstanceView {
         return r
  }
  
+// KillInstance destroys a cloud VM instance. It returns an error if
+// the given instance does not exist.
+//
+// The reason string is only used to annotate the "shutting down" log
+// entry.
+//
+// The worker lookup and shutdown are done while holding wp.mtx, like
+// the other Pool methods that read or modify wp.workers; without the
+// lock, the map read here would race with concurrent updates (e.g.,
+// sync() deleting workers that disappeared from the cloud).
+func (wp *Pool) KillInstance(id cloud.InstanceID, reason string) error {
+       wp.mtx.Lock()
+       defer wp.mtx.Unlock()
+       wkr, ok := wp.workers[id]
+       if !ok {
+               return errors.New("instance not found")
+       }
+       wkr.logger.WithField("Reason", reason).Info("shutting down")
+       // NOTE(review): assumes wkr.shutdown() expects the caller to
+       // hold wp.mtx and does not acquire it itself -- confirm.
+       wkr.shutdown()
+       return nil
+}
+
  func (wp *Pool) setup() {
         wp.creating = map[string]createCall{}
         wp.exited = map[string]time.Time{}
@@ -716,7 +763,7 @@ func (wp *Pool) getInstancesAndSync() error {
         }
         wp.logger.Debug("getting instance list")
         threshold := time.Now()
-       instances, err := wp.instanceSet.Instances(cloud.InstanceTags{})
+       instances, err := wp.instanceSet.Instances(cloud.InstanceTags{wp.tagKeyPrefix + tagKeyInstanceSetID: string(wp.instanceSetID)})
         if err != nil {
                 wp.instanceSet.throttleInstances.CheckRateLimitError(err, wp.logger, "list instances", wp.notify)
                 return err
@@ -736,7 +783,7 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
         notify := false
  
         for _, inst := range instances {
-               itTag := inst.Tags()[tagKeyInstanceType]
+               itTag := inst.Tags()[wp.tagKeyPrefix+tagKeyInstanceType]
                 it, ok := wp.instanceTypes[itTag]
                 if !ok {
                         wp.logger.WithField("Instance", inst).Errorf("unknown InstanceType tag %q --- ignoring", itTag)
@@ -759,6 +806,9 @@ func (wp *Pool) sync(threshold time.Time, instances []cloud.Instance) {
                         "WorkerState": wkr.state,
                 })
                 logger.Info("instance disappeared in cloud")
+               if wp.mDisappearances != nil {
+                       wp.mDisappearances.WithLabelValues(stateString[wkr.state]).Inc()
+               }
                 delete(wp.workers, id)
                 go wkr.Close()
                 notify = true