21258: Ensure at least one boot failure. (branch: 21258-flaky-adc-test)
author Tom Clegg <tom@curii.com>
Thu, 4 Jan 2024 22:51:33 +0000 (17:51 -0500)
committer Tom Clegg <tom@curii.com>
Thu, 4 Jan 2024 22:51:33 +0000 (17:51 -0500)
With the previous approach, it was possible for all containers needing
a type4 instance to finish, and a different instance type to report a
quota error and cause the scheduler to shut down the now-unneeded
instance, all before the "guaranteed broken" node reached
TimeoutBooting. In such a case it would not be counted as a boot
failure.

To avoid this, the new approach induces boot failures on *all* type4
instances until twice the TimeoutBooting interval has elapsed since the first type4 instance was set up.

Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

lib/dispatchcloud/dispatcher_test.go

index 51c2c3d6a35543cf586f30daac39a0822ad5a90e..20185554b8b1828fc92e24b1c1f7ecbc8603b6fc 100644 (file)
@@ -207,6 +207,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
                finishContainer(ctr)
                return int(rand.Uint32() & 0x3)
        }
+       var type4BrokenUntil time.Time
        var countCapacityErrors int64
        vmCount := int32(0)
        s.stubDriver.Queue = queue
@@ -224,6 +225,17 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
                stubvm.CrashRunningContainer = finishContainer
                stubvm.ExtraCrunchRunArgs = "'--runtime-engine=stub' '--foo' '--extra='\\''args'\\'''"
                switch {
+               case stubvm.Instance().ProviderType() == test.InstanceType(4).ProviderType &&
+                       (type4BrokenUntil.IsZero() || time.Now().Before(type4BrokenUntil)):
+                       // Initially (at least 2*TimeoutBooting), all
+                       // instances of this type are completely
+                       // broken. This ensures the
+                       // boot_outcomes{outcome="failure"} metric is
+                       // not zero.
+                       stubvm.Broken = time.Now()
+                       if type4BrokenUntil.IsZero() {
+                               type4BrokenUntil = time.Now().Add(2 * s.cluster.Containers.CloudVMs.TimeoutBooting.Duration())
+                       }
                case n%7 == 0:
                        // some instances start out OK but then stop
                        // running any commands
@@ -235,11 +247,6 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) {
                        // some instances start out OK but then start
                        // reporting themselves as broken
                        stubvm.ReportBroken = time.Now().Add(time.Duration(rand.Int63n(200)) * time.Millisecond)
-               case n == 3:
-                       // 1 instance is completely broken, ensuring
-                       // the boot_outcomes{outcome="failure"} metric
-                       // is not zero
-                       stubvm.Broken = time.Now()
                default:
                        stubvm.CrunchRunCrashRate = 0.1
                        stubvm.ArvMountDeadlockRate = 0.1