The dispatcher simulation test occasionally fails with this error when
a crunch-run stub is killed and the same container is rescheduled on
the same node before the first attempt's startup-phase sleep expires:
bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
... Error: [test] StubDriver bug or caller bug: pid 9 exiting, running[zzzzz-dz642-000000000000184]==0
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>
logger.Printf("[test] starting crunch-run stub")
go func() {
var ctr arvados.Container
- var started, completed bool
+ var started, completed, killed bool
defer func() {
logger.Print("[test] exiting crunch-run stub")
svm.Lock()
defer svm.Unlock()
if svm.running[uuid] != pid {
- if !completed {
+ if !completed && !killed {
bugf := svm.sis.driver.Bugf
if bugf == nil {
bugf = logger.Warnf
time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
svm.Lock()
- killed := svm.running[uuid] != pid
+ killed = svm.running[uuid] != pid
svm.Unlock()
if killed || wantCrashEarly {
return