From d3fbb6b33a5b4a55dda969c7e7360cc31e1de0f6 Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Fri, 21 Aug 2020 10:38:25 -0400 Subject: [PATCH] 16723: Fix double-container-running bug in test stub. Two parts: 1. Update state to Complete before crunch-run exits, not after. 2. Don't call the "crashed while running" hook from the test stub unless changing state to Running actually succeeded. Also, call the "crashed while running" hook from the test stub if changing state from Running to Complete fails. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- lib/dispatchcloud/test/stub_driver.go | 71 ++++++++++++++++----------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/lib/dispatchcloud/test/stub_driver.go b/lib/dispatchcloud/test/stub_driver.go index 3e2f8efe01..f45d37352f 100644 --- a/lib/dispatchcloud/test/stub_driver.go +++ b/lib/dispatchcloud/test/stub_driver.go @@ -265,49 +265,64 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader, }) logger.Printf("[test] starting crunch-run stub") go func() { + var ctr arvados.Container + var started, completed bool + defer func() { + logger.Print("[test] exiting crunch-run stub") + svm.Lock() + defer svm.Unlock() + if svm.running[uuid] != pid { + if !completed { + logger.Errorf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid]) + } + } else { + delete(svm.running, uuid) + } + if !completed { + logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub") + if started && svm.CrashRunningContainer != nil { + svm.CrashRunningContainer(ctr) + } + } + }() + crashluck := math_rand.Float64() + wantCrash := crashluck < svm.CrunchRunCrashRate + wantCrashEarly := crashluck < svm.CrunchRunCrashRate/2 + ctr, ok := queue.Get(uuid) if !ok { logger.Print("[test] container not in queue") return } - defer func() { - if ctr.State == arvados.ContainerStateRunning && svm.CrashRunningContainer != nil { - svm.CrashRunningContainer(ctr) - } - }() - - if crashluck > svm.CrunchRunCrashRate/2 { - time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond) - ctr.State = arvados.ContainerStateRunning - if !queue.Notify(ctr) { - ctr, _ = queue.Get(uuid) - logger.Print("[test] erroring out because state=Running update was rejected") - return - } - } - time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond) svm.Lock() - defer svm.Unlock() - if svm.running[uuid] != pid { - logger.Print("[test] container was killed") + killed := svm.running[uuid] != pid + svm.Unlock() + if killed || wantCrashEarly { return } - delete(svm.running, uuid) - if crashluck < svm.CrunchRunCrashRate { + ctr.State = arvados.ContainerStateRunning + started = queue.Notify(ctr) + if !started { + ctr, _ = queue.Get(uuid) + logger.Print("[test] erroring out because state=Running update was rejected") + return + } + + if wantCrash { logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub") - } else { - if svm.ExecuteContainer != nil { - ctr.ExitCode = svm.ExecuteContainer(ctr) - } - logger.WithField("ExitCode", ctr.ExitCode).Print("[test] exiting crunch-run stub") - ctr.State = arvados.ContainerStateComplete - go queue.Notify(ctr) + return + } + if svm.ExecuteContainer != nil { + ctr.ExitCode = svm.ExecuteContainer(ctr) } + logger.WithField("ExitCode", ctr.ExitCode).Print("[test] completing container") + ctr.State = arvados.ContainerStateComplete + completed = queue.Notify(ctr) }() return 0 } -- 2.30.2