16795: Fix false-positive bug detection.
author Tom Clegg <tom@tomclegg.ca>
Tue, 1 Sep 2020 20:22:13 +0000 (16:22 -0400)
committer Tom Clegg <tom@tomclegg.ca>
Tue, 1 Sep 2020 20:22:13 +0000 (16:22 -0400)
The dispatcher simulation test occasionally fails with this error when
a crunch-run stub is killed, and the same container is rescheduled on
the same node before the first attempt's startup-phase sleep expires:

    bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
... Error: [test] StubDriver bug or caller bug: pid 9 exiting, running[zzzzz-dz642-000000000000184]==0

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@tomclegg.ca>

lib/dispatchcloud/test/stub_driver.go

index 41eb20763c75248c6cea81a2e9854ad2dfde42a8..2dcd6c1283c297b7e0b830701d3709a943c019b2 100644 (file)
@@ -268,13 +268,13 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
                logger.Printf("[test] starting crunch-run stub")
                go func() {
                        var ctr arvados.Container
-                       var started, completed bool
+                       var started, completed, killed bool
                        defer func() {
                                logger.Print("[test] exiting crunch-run stub")
                                svm.Lock()
                                defer svm.Unlock()
                                if svm.running[uuid] != pid {
-                                       if !completed {
+                                       if !completed && !killed {
                                                bugf := svm.sis.driver.Bugf
                                                if bugf == nil {
                                                        bugf = logger.Warnf
@@ -305,7 +305,7 @@ func (svm *StubVM) Exec(env map[string]string, command string, stdin io.Reader,
                        time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
 
                        svm.Lock()
-                       killed := svm.running[uuid] != pid
+                       killed = svm.running[uuid] != pid
                        svm.Unlock()
                        if killed || wantCrashEarly {
                                return