X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/82c36f0a89f9bfccb083a1da37e0ab1d675e9538..8288bba27e9beff7273aeb65c5200248e52bab02:/lib/dispatchcloud/worker/worker.go

diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 64e1f7797a..b01a820cd6 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -6,15 +6,15 @@ package worker
 
 import (
 	"bytes"
-	"encoding/json"
 	"fmt"
+	"path/filepath"
 	"strings"
 	"sync"
 	"time"
 
-	"git.curoverse.com/arvados.git/lib/cloud"
-	"git.curoverse.com/arvados.git/sdk/go/arvados"
-	"git.curoverse.com/arvados.git/sdk/go/stats"
+	"git.arvados.org/arvados.git/lib/cloud"
+	"git.arvados.org/arvados.git/sdk/go/arvados"
+	"git.arvados.org/arvados.git/sdk/go/stats"
 	"github.com/sirupsen/logrus"
 )
 
@@ -54,6 +54,23 @@ func (s State) MarshalText() ([]byte, error) {
 	return []byte(stateString[s]), nil
 }
 
+// BootOutcome is the result of a worker boot. It is used as a label in a metric.
+type BootOutcome string
+
+const (
+	BootOutcomeFailed      BootOutcome = "failure"     // a probe reported the instance broken, or it never booted
+	BootOutcomeSucceeded   BootOutcome = "success"     // probes succeeded and the instance is in service
+	BootOutcomeAborted     BootOutcome = "aborted"     // worker was shut down before another outcome was reported
+	BootOutcomeDisappeared BootOutcome = "disappeared" // NOTE(review): reported outside the code shown here
+)
+
+var validBootOutcomes = map[BootOutcome]bool{ // set of recognized outcomes; NOTE(review): its readers are not in the code shown here
+	BootOutcomeFailed:      true,
+	BootOutcomeSucceeded:   true,
+	BootOutcomeAborted:     true,
+	BootOutcomeDisappeared: true,
+}
+
 // IdleBehavior indicates the behavior desired when a node becomes idle.
 type IdleBehavior string
 
@@ -74,22 +91,75 @@ type worker struct {
 	executor Executor
 	wp       *Pool
 
-	mtx          sync.Locker // must be wp's Locker.
-	state        State
-	idleBehavior IdleBehavior
-	instance     cloud.Instance
-	instType     arvados.InstanceType
-	vcpus        int64
-	memory       int64
-	appeared     time.Time
-	probed       time.Time
-	updated      time.Time
-	busy         time.Time
-	destroyed    time.Time
-	lastUUID     string
-	running      map[string]struct{} // remember to update state idle<->running when this changes
-	starting     map[string]struct{} // remember to update state idle<->running when this changes
-	probing      chan struct{}
+	mtx                 sync.Locker // must be wp's Locker.
+	state               State
+	idleBehavior        IdleBehavior
+	instance            cloud.Instance
+	instType            arvados.InstanceType
+	vcpus               int64
+	memory              int64
+	appeared            time.Time
+	probed              time.Time
+	updated             time.Time
+	busy                time.Time
+	destroyed           time.Time
+	firstSSHConnection  time.Time
+	lastUUID            string
+	running             map[string]*remoteRunner // remember to update state idle<->running when this changes
+	starting            map[string]*remoteRunner // remember to update state idle<->running when this changes
+	probing             chan struct{}
+	bootOutcomeReported bool
+	timeToReadyReported bool
+	staleRunLockSince   time.Time
+}
+
+func (wkr *worker) onUnkillable(uuid string) {
+	wkr.mtx.Lock()
+	defer wkr.mtx.Unlock()
+	log := wkr.logger.WithField("ContainerUUID", uuid)
+	if wkr.idleBehavior != IdleBehaviorHold { // Hold takes precedence over draining
+		log.Warn("unkillable container, draining worker")
+		wkr.setIdleBehavior(IdleBehaviorDrain)
+	} else {
+		log.Warn("unkillable container, but worker has IdleBehavior=Hold")
+	}
+}
+
+func (wkr *worker) onKilled(uuid string) {
+	wkr.mtx.Lock()
+	defer wkr.mtx.Unlock()
+	wkr.closeRunner(uuid) // drop uuid's runner from wkr.running (closeRunner requires the lock)
+	go wkr.wp.notify()    // NOTE(review): presumably run in a goroutine so it cannot block while mtx is held — confirm
+}
+
+// reportBootOutcome increments the boot-outcome metric for outcome, but only
+// the first time it is called on this worker. Caller must have lock.
+func (wkr *worker) reportBootOutcome(outcome BootOutcome) {
+	if !wkr.bootOutcomeReported {
+		if counter := wkr.wp.mBootOutcomes; counter != nil {
+			counter.WithLabelValues(string(outcome)).Inc()
+		}
+		wkr.bootOutcomeReported = true
+	}
+}
+
+// reportTimeBetweenFirstSSHAndReadyForContainer observes, once per worker, the time since firstSSHConnection. Caller must have lock.
+func (wkr *worker) reportTimeBetweenFirstSSHAndReadyForContainer() {
+	if wkr.timeToReadyReported {
+		return
+	}
+	if m := wkr.wp.mTimeToReadyForContainer; m != nil { // nil-check the metric that is actually observed
+		m.Observe(time.Since(wkr.firstSSHConnection).Seconds())
+	}
+	wkr.timeToReadyReported = true
+}
+
+// setIdleBehavior logs and stores the new behavior, saves tags, then shuts down if idle. Caller must have lock.
+func (wkr *worker) setIdleBehavior(idleBehavior IdleBehavior) {
+	wkr.logger.WithField("IdleBehavior", idleBehavior).Info("set idle behavior")
+	wkr.idleBehavior = idleBehavior
+	wkr.saveTags()       // the saved tags include wkr.idleBehavior, so assign it first
+	wkr.shutdownIfIdle() // eligibility depends on the behavior just stored, e.g. Drain
 }
 
 // caller must have lock.
@@ -98,57 +168,31 @@ func (wkr *worker) startContainer(ctr arvados.Container) {
 		"ContainerUUID": ctr.UUID,
 		"Priority":      ctr.Priority,
 	})
-	logger = logger.WithField("Instance", wkr.instance.ID())
 	logger.Debug("starting container")
-	wkr.starting[ctr.UUID] = struct{}{}
+	rr := newRemoteRunner(ctr.UUID, wkr)
+	wkr.starting[ctr.UUID] = rr
 	if wkr.state != StateRunning {
 		wkr.state = StateRunning
 		go wkr.wp.notify()
 	}
 	go func() {
-		env := map[string]string{
-			"ARVADOS_API_HOST":  wkr.wp.arvClient.APIHost,
-			"ARVADOS_API_TOKEN": wkr.wp.arvClient.AuthToken,
-		}
-		if wkr.wp.arvClient.Insecure {
-			env["ARVADOS_API_HOST_INSECURE"] = "1"
+		rr.Start()
+		if wkr.wp.mTimeFromQueueToCrunchRun != nil {
+			wkr.wp.mTimeFromQueueToCrunchRun.Observe(time.Since(ctr.CreatedAt).Seconds())
 		}
-		envJSON, err := json.Marshal(env)
-		if err != nil {
-			panic(err)
-		}
-		stdin := bytes.NewBuffer(envJSON)
-		cmd := "crunch-run --detach --stdin-env '" + ctr.UUID + "'"
-		if u := wkr.instance.RemoteUser(); u != "root" {
-			cmd = "sudo " + cmd
-		}
-		stdout, stderr, err := wkr.executor.Execute(nil, cmd, stdin)
 		wkr.mtx.Lock()
 		defer wkr.mtx.Unlock()
 		now := time.Now()
 		wkr.updated = now
 		wkr.busy = now
 		delete(wkr.starting, ctr.UUID)
-		wkr.running[ctr.UUID] = struct{}{}
-		wkr.lastUUID = ctr.UUID
-		if err != nil {
-			logger.WithField("stdout", string(stdout)).
-				WithField("stderr", string(stderr)).
-				WithError(err).
-				Error("error starting crunch-run process")
-			// Leave uuid in wkr.running, though: it's
-			// possible the error was just a communication
-			// failure and the process was in fact
-			// started.  Wait for next probe to find out.
-			return
-		}
-		logger.Info("crunch-run process started")
+		wkr.running[ctr.UUID] = rr
 		wkr.lastUUID = ctr.UUID
 	}()
 }
 
 // ProbeAndUpdate conducts appropriate boot/running probes (if any)
-// for the worker's curent state. If a previous probe is still
+// for the worker's current state. If a previous probe is still
 // running, it does nothing.
 //
 // It should be called in a new goroutine.
@@ -218,11 +262,17 @@ func (wkr *worker) probeAndUpdate() {
 			logger.Info("instance booted; will try probeRunning")
 		}
 	}
+	reportedBroken := false
 	if booted || wkr.state == StateUnknown {
-		ctrUUIDs, ok = wkr.probeRunning()
+		ctrUUIDs, reportedBroken, ok = wkr.probeRunning()
 	}
 	wkr.mtx.Lock()
 	defer wkr.mtx.Unlock()
+	if reportedBroken && wkr.idleBehavior == IdleBehaviorRun {
+		logger.Info("probe reported broken instance")
+		wkr.reportBootOutcome(BootOutcomeFailed)
+		wkr.setIdleBehavior(IdleBehaviorDrain)
+	}
 	if !ok || (!booted && len(ctrUUIDs) == 0 && len(wkr.running) == 0) {
 		if wkr.state == StateShutdown && wkr.updated.After(updated) {
 			// Skip the logging noise if shutdown was
@@ -244,6 +294,7 @@ func (wkr *worker) probeAndUpdate() {
 			// some evidence about why the node never
 			// booted, even in non-debug mode.
 			if !booted {
+				wkr.reportBootOutcome(BootOutcomeFailed)
 				logger.WithFields(logrus.Fields{
 					"Duration": dur,
 					"stderr":   string(stderr),
@@ -262,6 +313,10 @@ func (wkr *worker) probeAndUpdate() {
 		// not yet running when ctrUUIDs was generated. Leave
 		// wkr.running alone and wait for the next probe to
 		// catch up on any changes.
+		logger.WithFields(logrus.Fields{
+			"updated":     updated,
+			"wkr.updated": wkr.updated,
+		}).Debug("skipping worker state update due to probe/sync race")
 		return
 	}
 
@@ -274,34 +329,14 @@ func (wkr *worker) probeAndUpdate() {
 		// advantage of the non-busy state, though.
 		wkr.busy = updateTime
 	}
-	changed := false
 
-	// Build a new "running" map. Set changed=true if it differs
-	// from the existing map (wkr.running) to ensure the scheduler
-	// gets notified below.
-	running := map[string]struct{}{}
-	for _, uuid := range ctrUUIDs {
-		running[uuid] = struct{}{}
-		if _, ok := wkr.running[uuid]; !ok {
-			if _, ok := wkr.starting[uuid]; !ok {
-				// We didn't start it -- it must have
-				// been started by a previous
-				// dispatcher process.
-				logger.WithField("ContainerUUID", uuid).Info("crunch-run process detected")
-			}
-			changed = true
-		}
-	}
-	for uuid := range wkr.running {
-		if _, ok := running[uuid]; !ok {
-			logger.WithField("ContainerUUID", uuid).Info("crunch-run process ended")
-			wkr.wp.notifyExited(uuid, updateTime)
-			changed = true
-		}
-	}
+	changed := wkr.updateRunning(ctrUUIDs)
 
 	// Update state if this was the first successful boot-probe.
 	if booted && (wkr.state == StateUnknown || wkr.state == StateBooting) {
+		if wkr.state == StateBooting {
+			wkr.reportTimeBetweenFirstSSHAndReadyForContainer()
+		}
 		// Note: this will change again below if
 		// len(wkr.starting)+len(wkr.running) > 0.
 		wkr.state = StateIdle
@@ -317,14 +352,13 @@ func (wkr *worker) probeAndUpdate() {
 
 	// Log whenever a run-probe reveals crunch-run processes
 	// appearing/disappearing before boot-probe succeeds.
-	if wkr.state == StateUnknown && len(running) != len(wkr.running) {
+	if wkr.state == StateUnknown && changed {
 		logger.WithFields(logrus.Fields{
-			"RunningContainers": len(running),
+			"RunningContainers": len(wkr.running),
 			"State":             wkr.state,
 		}).Info("crunch-run probe succeeded, but boot probe is still failing")
 	}
 
-	wkr.running = running
 	if wkr.state == StateIdle && len(wkr.starting)+len(wkr.running) > 0 {
 		wkr.state = StateRunning
 	} else if wkr.state == StateRunning && len(wkr.starting)+len(wkr.running) == 0 {
@@ -332,19 +366,21 @@ func (wkr *worker) probeAndUpdate() {
 	}
 	wkr.updated = updateTime
 	if booted && (initialState == StateUnknown || initialState == StateBooting) {
+		wkr.reportBootOutcome(BootOutcomeSucceeded)
 		logger.WithFields(logrus.Fields{
-			"RunningContainers": len(running),
+			"RunningContainers": len(wkr.running),
 			"State":             wkr.state,
 		}).Info("probes succeeded, instance is in service")
 	}
 	go wkr.wp.notify()
 }
 
-func (wkr *worker) probeRunning() (running []string, ok bool) {
-	cmd := "crunch-run --list"
+func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
+	cmd := wkr.wp.runnerCmd + " --list"
 	if u := wkr.instance.RemoteUser(); u != "root" {
 		cmd = "sudo " + cmd
 	}
+	before := time.Now()
 	stdout, stderr, err := wkr.executor.Execute(nil, cmd, nil)
 	if err != nil {
 		wkr.logger.WithFields(logrus.Fields{
@@ -352,13 +388,60 @@ func (wkr *worker) probeRunning() (running []string, ok bool) {
 			"stdout":  string(stdout),
 			"stderr":  string(stderr),
 		}).WithError(err).Warn("probe failed")
-		return nil, false
+		wkr.wp.mRunProbeDuration.WithLabelValues("fail").Observe(time.Now().Sub(before).Seconds())
+		return
 	}
-	stdout = bytes.TrimRight(stdout, "\n")
-	if len(stdout) == 0 {
-		return nil, true
+	wkr.logger.WithFields(logrus.Fields{
+		"Command": cmd,
+		"stdout":  string(stdout),
+		"stderr":  string(stderr),
+	}).Debug("probe succeeded")
+	wkr.wp.mRunProbeDuration.WithLabelValues("success").Observe(time.Now().Sub(before).Seconds())
+	ok = true
+
+	staleRunLock := false
+	for _, s := range strings.Split(string(stdout), "\n") {
+		// Each line of the "crunch-run --list" output is one
+		// of the following:
+		//
+		// * a container UUID, indicating that processes
+		//   related to that container are currently running.
+		//   Optionally followed by " stale", indicating that
+		//   the crunch-run process itself has exited (the
+		//   remaining process is probably arv-mount).
+		//
+		// * the string "broken", indicating that the instance
+		//   appears incapable of starting containers.
+		//
+		// See ListProcesses() in lib/crunchrun/background.go.
+		if s == "" {
+			// empty string following final newline
+		} else if s == "broken" {
+			reportsBroken = true
+		} else if !strings.HasPrefix(s, wkr.wp.cluster.ClusterID) {
+			// Ignore crunch-run processes that belong to
+			// a different cluster (e.g., a single host
+			// running multiple clusters with the loopback
+			// driver)
+			continue
+		} else if toks := strings.Split(s, " "); len(toks) == 1 {
+			running = append(running, s)
+		} else if toks[1] == "stale" {
+			wkr.logger.WithField("ContainerUUID", toks[0]).Info("probe reported stale run lock")
+			staleRunLock = true
+		}
 	}
-	return strings.Split(string(stdout), "\n"), true
+	wkr.mtx.Lock()
+	defer wkr.mtx.Unlock()
+	if !staleRunLock {
+		wkr.staleRunLockSince = time.Time{}
+	} else if wkr.staleRunLockSince.IsZero() {
+		wkr.staleRunLockSince = time.Now()
+	} else if dur := time.Now().Sub(wkr.staleRunLockSince); dur > wkr.wp.timeoutStaleRunLock {
+		wkr.logger.WithField("Duration", dur).Warn("reporting broken after reporting stale run lock for too long")
+		reportsBroken = true
+	}
+	return
 }
 
 func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
@@ -377,9 +460,46 @@ func (wkr *worker) probeBooted() (ok bool, stderr []byte) {
 		return false, stderr
 	}
 	logger.Info("boot probe succeeded")
+	if err = wkr.wp.loadRunnerData(); err != nil {
+		wkr.logger.WithError(err).Warn("cannot boot worker: error loading runner binary")
+		return false, stderr
+	} else if len(wkr.wp.runnerData) == 0 {
+		// Assume crunch-run is already installed
+	} else if _, stderr2, err := wkr.copyRunnerData(); err != nil {
+		wkr.logger.WithError(err).WithField("stderr", string(stderr2)).Warn("error copying runner binary")
+		return false, stderr2
+	} else {
+		stderr = append(stderr, stderr2...)
+	}
 	return true, stderr
 }
 
+// copyRunnerData installs the runner binary at runnerCmd unless md5sum shows it is already there.
+func (wkr *worker) copyRunnerData() (stdout, stderr []byte, err error) {
+	target := wkr.wp.runnerCmd
+	sum := fmt.Sprintf("%x", wkr.wp.runnerMD5)
+	dir, _ := filepath.Split(target)
+	log := wkr.logger.WithFields(logrus.Fields{"hash": sum, "path": target})
+
+	// Skip the upload when md5sum prints exactly "<hash>  <path>\n" with no error and no stderr.
+	stdout, stderr, err = wkr.executor.Execute(nil, `md5sum `+target, nil)
+	upToDate := err == nil && len(stderr) == 0 && string(stdout) == sum+"  "+target+"\n"
+	if upToDate {
+		log.Info("runner binary already exists on worker, with correct hash")
+		return
+	}
+
+	// touch+chmod precede the write, so the md5 can never be
+	// correct while the file mode is still incorrect.
+	install := `set -e; dstdir="` + dir + `"; dstfile="` + target + `"; mkdir -p "$dstdir"; touch "$dstfile"; chmod 0755 "$dstdir" "$dstfile"; cat >"$dstfile"`
+	if user := wkr.instance.RemoteUser(); user != "root" {
+		install = `sudo sh -c '` + strings.Replace(install, "'", "'\\''", -1) + `'`
+	}
+	log.WithField("cmd", install).Info("installing runner binary on worker")
+	stdout, stderr, err = wkr.executor.Execute(nil, install, bytes.NewReader(wkr.wp.runnerData))
+	return
+}
+
 // caller must have lock.
 func (wkr *worker) shutdownIfBroken(dur time.Duration) bool {
 	if wkr.idleBehavior == IdleBehaviorHold {
@@ -402,27 +522,53 @@ func (wkr *worker) shutdownIfBroken(dur time.Duration) bool {
 	return true
 }
 
+// eligibleForShutdown reports whether the instance may be shut down: it has
+// been idle too long, or idleBehavior=Drain and no runner is still active.
+//
 // caller must have lock.
-func (wkr *worker) shutdownIfIdle() bool {
+func (wkr *worker) eligibleForShutdown() bool {
 	if wkr.idleBehavior == IdleBehaviorHold {
-		// Never shut down.
 		return false
 	}
-	age := time.Since(wkr.busy)
-
-	old := age >= wkr.wp.timeoutIdle
 	draining := wkr.idleBehavior == IdleBehaviorDrain
-	shouldShutdown := ((old || draining) && wkr.state == StateIdle) ||
-		(draining && wkr.state == StateBooting)
-	if !shouldShutdown {
+	switch wkr.state {
+	case StateBooting:
+		return draining // still booting: eligible only when draining
+	case StateIdle:
+		return draining || time.Since(wkr.busy) >= wkr.wp.timeoutIdle // draining, or idle for at least timeoutIdle
+	case StateRunning:
+		if !draining {
+			return false
+		}
+		for _, rr := range wkr.running {
+			if !rr.givenup {
+				return false
+			}
+		}
+		for _, rr := range wkr.starting {
+			if !rr.givenup {
+				return false
+			}
+		}
+		// draining, and all remaining runners are just trying
+		// to force-kill their crunch-run procs
+		return true
+	default:
 		return false
 	}
+}
 
+// shutdownIfIdle shuts the worker down if eligibleForShutdown allows it, and reports whether it did. Caller must have lock.
+func (wkr *worker) shutdownIfIdle() bool {
+	if !wkr.eligibleForShutdown() {
+		return false
+	}
 	wkr.logger.WithFields(logrus.Fields{
 		"State":        wkr.state,
-		"IdleDuration": stats.Duration(age),
+		"IdleDuration": stats.Duration(time.Since(wkr.busy)),
 		"IdleBehavior": wkr.idleBehavior,
-	}).Info("shutdown idle worker")
+	}).Info("shutdown worker")
+	wkr.reportBootOutcome(BootOutcomeAborted) // has no effect if an outcome was already reported
 	wkr.shutdown()
 	return true
 }
@@ -449,8 +595,8 @@ func (wkr *worker) saveTags() {
 	instance := wkr.instance
 	tags := instance.Tags()
 	update := cloud.InstanceTags{
-		tagKeyInstanceType: wkr.instType.Name,
-		tagKeyIdleBehavior: string(wkr.idleBehavior),
+		wkr.wp.tagKeyPrefix + tagKeyInstanceType: wkr.instType.Name,
+		wkr.wp.tagKeyPrefix + tagKeyIdleBehavior: string(wkr.idleBehavior),
 	}
 	save := false
 	for k, v := range update {
@@ -468,3 +614,68 @@ func (wkr *worker) saveTags() {
 		}()
 	}
 }
+
+// Close abandons every running and starting runner, then closes the
+// executor once the lock has been released.
+func (wkr *worker) Close() {
+	// Deferred first so it runs last, after mtx is unlocked: it might take time.
+	defer wkr.executor.Close()
+
+	wkr.mtx.Lock()
+	defer wkr.mtx.Unlock()
+	for _, runners := range []map[string]*remoteRunner{wkr.running, wkr.starting} {
+		for uuid, rr := range runners {
+			wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process abandoned")
+			rr.Close()
+		}
+	}
+}
+
+// updateRunning reconciles wkr.running with the container UUIDs from a
+// probe. It reports whether any entry was added or removed.
+//
+// Caller must have lock.
+func (wkr *worker) updateRunning(ctrUUIDs []string) (changed bool) {
+	seen := make(map[string]bool, len(ctrUUIDs))
+	for _, id := range ctrUUIDs {
+		seen[id] = true
+		if _, known := wkr.running[id]; known {
+			continue
+		}
+		changed = true
+		if rr, ok := wkr.starting[id]; ok {
+			wkr.running[id] = rr
+			delete(wkr.starting, id)
+			continue
+		}
+		// Not started by us, so a previous dispatcher
+		// process must have started it.
+		wkr.logger.WithField("ContainerUUID", id).Info("crunch-run process detected")
+		wkr.running[id] = newRemoteRunner(id, wkr)
+	}
+	for id := range wkr.running {
+		if !seen[id] {
+			wkr.closeRunner(id)
+			changed = true
+		}
+	}
+	return
+}
+
+// closeRunner removes uuid from wkr.running and closes its runner, if any. Caller must have lock.
+func (wkr *worker) closeRunner(uuid string) {
+	runner := wkr.running[uuid]
+	if runner == nil {
+		// Nothing is recorded as running under this UUID.
+		return
+	}
+	wkr.logger.WithField("ContainerUUID", uuid).Info("crunch-run process ended")
+	delete(wkr.running, uuid)
+	runner.Close()
+
+	ended := time.Now()
+	wkr.updated, wkr.wp.exited[uuid] = ended, ended
+	if remaining := len(wkr.running) + len(wkr.starting); remaining == 0 && wkr.state == StateRunning {
+		wkr.state = StateIdle
+	}
+}