From: Tom Clegg
Date: Tue, 15 Sep 2020 14:49:44 +0000 (-0400)
Subject: Merge branch '16834-stale-run-lock'
X-Git-Tag: 2.1.0~63
X-Git-Url: https://git.arvados.org/arvados.git/commitdiff_plain/d6598fd6339e6219a7103781433356dfde546527?hp=62f82658619bd59644b4bdac6b83be8fe87aca0d

Merge branch '16834-stale-run-lock'

fixes #16834

Arvados-DCO-1.1-Signed-off-by: Tom Clegg
---

diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml
index b1865a2217..15e7c7c06c 100644
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -938,6 +938,11 @@ Clusters:
         # Time before repeating SIGTERM when killing a container.
         TimeoutSignal: 5s
 
+        # Time to give up on a process (most likely arv-mount) that
+        # still holds a container lockfile after its main supervisor
+        # process has exited, and declare the instance broken.
+        TimeoutStaleRunLock: 5s
+
         # Time to give up on SIGTERM and write off the worker.
         TimeoutTERM: 2m
 
diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go
index 201ae36045..7ed332151b 100644
--- a/lib/config/generated_config.go
+++ b/lib/config/generated_config.go
@@ -944,6 +944,11 @@ Clusters:
         # Time before repeating SIGTERM when killing a container.
         TimeoutSignal: 5s
 
+        # Time to give up on a process (most likely arv-mount) that
+        # still holds a container lockfile after its main supervisor
+        # process has exited, and declare the instance broken.
+        TimeoutStaleRunLock: 5s
+
         # Time to give up on SIGTERM and write off the worker.
         TimeoutTERM: 2m
 
diff --git a/lib/crunchrun/background.go b/lib/crunchrun/background.go
index bf039afa0a..8cdba72c10 100644
--- a/lib/crunchrun/background.go
+++ b/lib/crunchrun/background.go
@@ -218,6 +218,24 @@ func ListProcesses(stdout, stderr io.Writer) int {
 			return nil
 		}
 
+		proc, err := os.FindProcess(pi.PID)
+		if err != nil {
+			// FindProcess should have succeeded, even if the
+			// process does not exist.
+			fmt.Fprintf(stderr, "%s: find process %d: %s", path, pi.PID, err)
+			return nil
+		}
+		err = proc.Signal(syscall.Signal(0))
+		if err != nil {
+			// Process is dead, even though lockfile was
+			// still locked. Most likely a stuck arv-mount
+			// process that inherited the lock from
+			// crunch-run. Report container UUID as
+			// "stale".
+			fmt.Fprintln(stdout, pi.UUID, "stale")
+			return nil
+		}
+
 		fmt.Fprintln(stdout, pi.UUID)
 		return nil
 	}))
diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go
index 6e1850410b..cb5cdf1cf4 100644
--- a/lib/dispatchcloud/dispatcher_test.go
+++ b/lib/dispatchcloud/dispatcher_test.go
@@ -66,6 +66,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
 				ProbeInterval:        arvados.Duration(5 * time.Millisecond),
 				MaxProbesPerSecond:   1000,
 				TimeoutSignal:        arvados.Duration(3 * time.Millisecond),
+				TimeoutStaleRunLock:  arvados.Duration(3 * time.Millisecond),
 				TimeoutTERM:          arvados.Duration(20 * time.Millisecond),
 				ResourceTags:         map[string]string{"testtag": "test value"},
 				TagKeyPrefix:         "test:",
@@ -169,6 +170,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
 			stubvm.ReportBroken = time.Now().Add(time.Duration(rand.Int63n(200)) * time.Millisecond)
 		default:
 			stubvm.CrunchRunCrashRate = 0.1
+			stubvm.ArvMountDeadlockRate = 0.1
 		}
 	}
 	s.stubDriver.Bugf = c.Errorf
diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go
index 132bd4d695..4d32cf221c 100644
--- a/lib/dispatchcloud/test/stub_driver.go
+++ b/lib/dispatchcloud/test/stub_driver.go
@@ -131,7 +131,7 @@ func (sis *StubInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID,
 		tags:         copyTags(tags),
 		providerType: it.ProviderType,
 		initCommand:  cmd,
-		running:      map[string]int64{},
+		running:      map[string]stubProcess{},
 		killing:      map[string]bool{},
 	}
 	svm.SSHService = SSHService{
@@ -189,6 +189,8 @@ type StubVM struct {
 	CrunchRunMissing      bool
 	CrunchRunCrashRate    float64
 	CrunchRunDetachDelay  time.Duration
+	ArvMountMaxExitLag    time.Duration
+	ArvMountDeadlockRate  float64
 	ExecuteContainer      func(arvados.Container) int
 	CrashRunningContainer func(arvados.Container)
 
@@ -198,12 +200,21 @@ type StubVM struct {
 	initCommand  cloud.InitCommand
 	providerType string
 	SSHService   SSHService
-	running      map[string]int64
+	running      map[string]stubProcess
 	killing      map[string]bool
 	lastPID      int64
+	deadlocked   string
 	sync.Mutex
 }
 
+type stubProcess struct {
+	pid int64
+
+	// crunch-run has exited, but arv-mount process (or something)
+	// still holds lock in /var/run/
+	exited bool
+}
+
 func (svm *StubVM) Instance() stubInstance {
 	svm.Lock()
 	defer svm.Unlock()
@@ -256,7 +267,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader, stdout, stderr io.Writer) uint32 {
 		svm.Lock()
 		svm.lastPID++
 		pid := svm.lastPID
-		svm.running[uuid] = pid
+		svm.running[uuid] = stubProcess{pid: pid}
 		svm.Unlock()
 		time.Sleep(svm.CrunchRunDetachDelay)
 		fmt.Fprintf(stderr, "starting %s\n", uuid)
@@ -273,14 +284,13 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 				logger.Print("[test] exiting crunch-run stub")
 				svm.Lock()
 				defer svm.Unlock()
-				if svm.running[uuid] != pid {
+				if svm.running[uuid].pid != pid {
 					bugf := svm.sis.driver.Bugf
 					if bugf == nil {
 						bugf = logger.Warnf
 					}
-					bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
-				} else {
-					delete(svm.running, uuid)
+					bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s].pid==%d", pid, uuid, svm.running[uuid].pid)
+					return
 				}
 				if !completed {
 					logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
@@ -288,6 +298,15 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 						svm.CrashRunningContainer(ctr)
 					}
 				}
+				sproc := svm.running[uuid]
+				sproc.exited = true
+				svm.running[uuid] = sproc
+				svm.Unlock()
+				time.Sleep(svm.ArvMountMaxExitLag * time.Duration(math_rand.Float64()))
+				svm.Lock()
+				if math_rand.Float64() >= svm.ArvMountDeadlockRate {
+					delete(svm.running, uuid)
+				}
 			}()
 
 			crashluck := math_rand.Float64()
@@ -333,26 +352,31 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
 	if command == "crunch-run --list" {
 		svm.Lock()
 		defer svm.Unlock()
-		for uuid := range svm.running {
-			fmt.Fprintf(stdout, "%s\n", uuid)
+		for uuid, sproc := range svm.running {
+			if sproc.exited {
+				fmt.Fprintf(stdout, "%s stale\n", uuid)
+			} else {
+				fmt.Fprintf(stdout, "%s\n", uuid)
+			}
 		}
 		if !svm.ReportBroken.IsZero() && svm.ReportBroken.Before(time.Now()) {
 			fmt.Fprintln(stdout, "broken")
 		}
+		fmt.Fprintln(stdout, svm.deadlocked)
 		return 0
 	}
 	if strings.HasPrefix(command, "crunch-run --kill ") {
 		svm.Lock()
-		_, running := svm.running[uuid]
-		if running {
+		sproc, running := svm.running[uuid]
+		if running && !sproc.exited {
 			svm.killing[uuid] = true
 			svm.Unlock()
 			time.Sleep(time.Duration(math_rand.Float64()*2) * time.Millisecond)
 			svm.Lock()
-			_, running = svm.running[uuid]
+			sproc, running = svm.running[uuid]
 		}
 		svm.Unlock()
-		if running {
+		if running && !sproc.exited {
 			fmt.Fprintf(stderr, "%s: container is running\n", uuid)
 			return 1
 		}
diff --git a/lib/dispatchcloud/worker/pool.go b/lib/dispatchcloud/worker/pool.go
index 086887cb44..953195c59d 100644
--- a/lib/dispatchcloud/worker/pool.go
+++ b/lib/dispatchcloud/worker/pool.go
@@ -64,15 +64,16 @@ type Executor interface {
 }
 
 const (
-	defaultSyncInterval       = time.Minute
-	defaultProbeInterval      = time.Second * 10
-	defaultMaxProbesPerSecond = 10
-	defaultTimeoutIdle        = time.Minute
-	defaultTimeoutBooting     = time.Minute * 10
-	defaultTimeoutProbe       = time.Minute * 10
-	defaultTimeoutShutdown    = time.Second * 10
-	defaultTimeoutTERM        = time.Minute * 2
-	defaultTimeoutSignal      = time.Second * 5
+	defaultSyncInterval        = time.Minute
+	defaultProbeInterval       = time.Second * 10
+	defaultMaxProbesPerSecond  = 10
+	defaultTimeoutIdle         = time.Minute
+	defaultTimeoutBooting      = time.Minute * 10
+	defaultTimeoutProbe        = time.Minute * 10
+	defaultTimeoutShutdown     = time.Second * 10
+	defaultTimeoutTERM         = time.Minute * 2
+	defaultTimeoutSignal       = time.Second * 5
+	defaultTimeoutStaleRunLock = time.Second * 5
 
 	// Time after a quota error to try again anyway, even if no
 	// instances have been shutdown.
@@ -115,6 +116,7 @@ func NewPool(logger logrus.FieldLogger, arvClient *arvados.Client, reg *promethe
 		timeoutShutdown:    duration(cluster.Containers.CloudVMs.TimeoutShutdown, defaultTimeoutShutdown),
 		timeoutTERM:        duration(cluster.Containers.CloudVMs.TimeoutTERM, defaultTimeoutTERM),
 		timeoutSignal:      duration(cluster.Containers.CloudVMs.TimeoutSignal, defaultTimeoutSignal),
+		timeoutStaleRunLock: duration(cluster.Containers.CloudVMs.TimeoutStaleRunLock, defaultTimeoutStaleRunLock),
 		installPublicKey:   installPublicKey,
 		tagKeyPrefix:       cluster.Containers.CloudVMs.TagKeyPrefix,
 		stop:               make(chan bool),
@@ -152,6 +154,7 @@ type Pool struct {
 	timeoutShutdown    time.Duration
 	timeoutTERM        time.Duration
 	timeoutSignal      time.Duration
+	timeoutStaleRunLock time.Duration
 	installPublicKey   ssh.PublicKey
 	tagKeyPrefix       string
 
diff --git a/lib/dispatchcloud/worker/worker.go b/lib/dispatchcloud/worker/worker.go
index 9199d4bafe..517a5d193e 100644
--- a/lib/dispatchcloud/worker/worker.go
+++ b/lib/dispatchcloud/worker/worker.go
@@ -110,6 +110,7 @@ type worker struct {
 	probing             chan struct{}
 	bootOutcomeReported bool
 	timeToReadyReported bool
+	staleRunLockSince   time.Time
 }
 
 func (wkr *worker) onUnkillable(uuid string) {
@@ -382,13 +383,43 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
 		return
 	}
 	ok = true
+
+	staleRunLock := false
 	for _, s := range strings.Split(string(stdout), "\n") {
-		if s == "broken" {
+		// Each line of the "crunch-run --list" output is one
+		// of the following:
+		//
+		// * a container UUID, indicating that processes
+		//   related to that container are currently running.
+		//   Optionally followed by " stale", indicating that
+		//   the crunch-run process itself has exited (the
+		//   remaining process is probably arv-mount).
+		//
+		// * the string "broken", indicating that the instance
+		//   appears incapable of starting containers.
+		//
+		// See ListProcesses() in lib/crunchrun/background.go.
+		if s == "" {
+			// empty string following final newline
+		} else if s == "broken" {
 			reportsBroken = true
-		} else if s != "" {
+		} else if toks := strings.Split(s, " "); len(toks) == 1 {
 			running = append(running, s)
+		} else if toks[1] == "stale" {
+			wkr.logger.WithField("ContainerUUID", toks[0]).Info("probe reported stale run lock")
+			staleRunLock = true
 		}
 	}
+	wkr.mtx.Lock()
+	defer wkr.mtx.Unlock()
+	if !staleRunLock {
+		wkr.staleRunLockSince = time.Time{}
+	} else if wkr.staleRunLockSince.IsZero() {
+		wkr.staleRunLockSince = time.Now()
+	} else if dur := time.Now().Sub(wkr.staleRunLockSince); dur > wkr.wp.timeoutStaleRunLock {
+		wkr.logger.WithField("Duration", dur).Warn("reporting broken after reporting stale run lock for too long")
+		reportsBroken = true
+	}
 	return
 }
 
diff --git a/sdk/go/arvados/config.go b/sdk/go/arvados/config.go
index 363d09dafb..394e30a737 100644
--- a/sdk/go/arvados/config.go
+++ b/sdk/go/arvados/config.go
@@ -462,6 +462,7 @@ type CloudVMsConfig struct {
 	TimeoutProbe                   Duration
 	TimeoutShutdown                Duration
 	TimeoutSignal                  Duration
+	TimeoutStaleRunLock            Duration
 	TimeoutTERM                    Duration
 	ResourceTags                   map[string]string
 	TagKeyPrefix                   string
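
A note on the liveness check added in lib/crunchrun/background.go above: signalling a PID with signal 0 performs the existence and permission checks without delivering anything, so it is a cheap way to tell whether the crunch-run supervisor recorded in a lockfile is still alive. The sketch below is illustrative only, under assumed names (checkStale and its main() wrapper are not part of crunch-run); the real code obtains the PID from the container lockfile inside ListProcesses() and prints the container UUID followed by "stale" when the check fails.

package main

import (
	"fmt"
	"os"
	"syscall"
)

// checkStale reports whether the process with the given PID is gone,
// i.e. the run lock it recorded is now held only by leftovers such as
// a stuck arv-mount.
func checkStale(pid int) (bool, error) {
	proc, err := os.FindProcess(pid)
	if err != nil {
		// On Unix FindProcess always succeeds; an error here is unexpected.
		return false, err
	}
	// Signal 0 checks for existence without actually signalling.
	if err := proc.Signal(syscall.Signal(0)); err != nil {
		// The supervisor is dead even though the lock survived.
		return true, nil
	}
	return false, nil
}

func main() {
	stale, err := checkStale(os.Getpid()) // our own PID is certainly alive
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	fmt.Println("stale:", stale) // prints "stale: false"
}

This matches the format "crunch-run --list" emits, one UUID per line with an optional " stale" suffix, which is exactly what probeRunning() parses on the dispatcher side.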
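
On the dispatcher side, a single "stale" report does not write off the instance: probeRunning() in lib/dispatchcloud/worker/worker.go only reports the worker broken once the stale run lock has been seen continuously for longer than TimeoutStaleRunLock. A minimal sketch of that bookkeeping, assuming a hypothetical staleTracker type in place of the worker's staleRunLockSince field and the pool-level timeout:

package main

import (
	"fmt"
	"time"
)

// staleTracker remembers when a stale run lock was first reported.
type staleTracker struct {
	since   time.Time
	timeout time.Duration
}

// observe records one probe result and reports whether the instance
// should now be treated as broken.
func (t *staleTracker) observe(staleSeen bool, now time.Time) (broken bool) {
	switch {
	case !staleSeen:
		t.since = time.Time{} // lock cleared; reset the clock
	case t.since.IsZero():
		t.since = now // first sighting; start the clock
	case now.Sub(t.since) > t.timeout:
		broken = true // stale lock has persisted too long
	}
	return
}

func main() {
	t := staleTracker{timeout: 5 * time.Second}
	start := time.Now()
	fmt.Println(t.observe(true, start))                    // false: first sighting
	fmt.Println(t.observe(true, start.Add(2*time.Second))) // false: still within timeout
	fmt.Println(t.observe(true, start.Add(6*time.Second))) // true: exceeded timeout
}

Resetting the timestamp whenever a probe comes back without a "stale" line mirrors the worker logic above, so a lock that clears itself (for example, arv-mount finally exiting) never marks the instance broken.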