X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/2f4429e623199b9820d2296cf3788300caa7394b..aba09e96f2bbb8e1e45c941a7e8c62bb772344e8:/lib/dispatchcloud/dispatcher_test.go diff --git a/lib/dispatchcloud/dispatcher_test.go b/lib/dispatchcloud/dispatcher_test.go index 07fdc67228..273a3836dc 100644 --- a/lib/dispatchcloud/dispatcher_test.go +++ b/lib/dispatchcloud/dispatcher_test.go @@ -6,19 +6,23 @@ package dispatchcloud import ( "context" + "crypto/tls" "encoding/json" "io/ioutil" "math/rand" "net/http" "net/http/httptest" + "net/url" "os" "sync" "time" - "git.curoverse.com/arvados.git/lib/dispatchcloud/test" - "git.curoverse.com/arvados.git/sdk/go/arvados" - "git.curoverse.com/arvados.git/sdk/go/arvadostest" - "git.curoverse.com/arvados.git/sdk/go/ctxlog" + "git.arvados.org/arvados.git/lib/config" + "git.arvados.org/arvados.git/lib/dispatchcloud/test" + "git.arvados.org/arvados.git/sdk/go/arvados" + "git.arvados.org/arvados.git/sdk/go/arvadostest" + "git.arvados.org/arvados.git/sdk/go/ctxlog" + "github.com/prometheus/client_golang/prometheus" "golang.org/x/crypto/ssh" check "gopkg.in/check.v1" ) @@ -26,11 +30,12 @@ import ( var _ = check.Suite(&DispatcherSuite{}) type DispatcherSuite struct { - ctx context.Context - cancel context.CancelFunc - cluster *arvados.Cluster - stubDriver *test.StubDriver - disp *dispatcher + ctx context.Context + cancel context.CancelFunc + cluster *arvados.Cluster + stubDriver *test.StubDriver + disp *dispatcher + error503Server *httptest.Server } func (s *DispatcherSuite) SetUpTest(c *check.C) { @@ -48,11 +53,22 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) { MinTimeBetweenCreateCalls: time.Millisecond, } + // We need the postgresql connection info from the integration + // test config. + cfg, err := config.NewLoader(nil, ctxlog.FromContext(s.ctx)).Load() + c.Assert(err, check.IsNil) + testcluster, err := cfg.GetCluster("") + c.Assert(err, check.IsNil) + s.cluster = &arvados.Cluster{ ManagementToken: "test-management-token", + PostgreSQL: testcluster.PostgreSQL, Containers: arvados.ContainersConfig{ - DispatchPrivateKey: string(dispatchprivraw), - StaleLockTimeout: arvados.Duration(5 * time.Millisecond), + CrunchRunCommand: "crunch-run", + CrunchRunArgumentsList: []string{"--foo", "--extra='args'"}, + DispatchPrivateKey: string(dispatchprivraw), + StaleLockTimeout: arvados.Duration(5 * time.Millisecond), + RuntimeEngine: "stub", CloudVMs: arvados.CloudVMsConfig{ Driver: "test", SyncInterval: arvados.Duration(10 * time.Millisecond), @@ -65,6 +81,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) { ProbeInterval: arvados.Duration(5 * time.Millisecond), MaxProbesPerSecond: 1000, TimeoutSignal: arvados.Duration(3 * time.Millisecond), + TimeoutStaleRunLock: arvados.Duration(3 * time.Millisecond), TimeoutTERM: arvados.Duration(20 * time.Millisecond), ResourceTags: map[string]string{"testtag": "test value"}, TagKeyPrefix: "test:", @@ -84,13 +101,26 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) { arvadostest.SetServiceURL(&s.cluster.Services.Controller, "https://"+os.Getenv("ARVADOS_API_HOST")+"/") arvClient, err := arvados.NewClientFromConfig(s.cluster) - c.Check(err, check.IsNil) + c.Assert(err, check.IsNil) + // Disable auto-retry + arvClient.Timeout = 0 + + s.error503Server = httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + c.Logf("503 stub: returning 503") + w.WriteHeader(http.StatusServiceUnavailable) + })) + arvClient.Client = &http.Client{ + Transport: &http.Transport{ + Proxy: s.arvClientProxy(c), + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true}}} s.disp = &dispatcher{ Cluster: s.cluster, Context: s.ctx, ArvClient: arvClient, AuthToken: arvadostest.AdminToken, + Registry: prometheus.NewRegistry(), } // Test cases can modify s.cluster before calling // initialize(), and then modify private state before calling @@ -100,6 +130,21 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) { func (s *DispatcherSuite) TearDownTest(c *check.C) { s.cancel() s.disp.Close() + s.error503Server.Close() +} + +// Intercept outgoing API requests for "/503" and respond HTTP +// 503. This lets us force (*arvados.Client)Last503() to return +// something. +func (s *DispatcherSuite) arvClientProxy(c *check.C) func(*http.Request) (*url.URL, error) { + return func(req *http.Request) (*url.URL, error) { + if req.URL.Path == "/503" { + c.Logf("arvClientProxy: proxying to 503 stub") + return url.Parse(s.error503Server.URL) + } else { + return nil, nil + } + } } // DispatchToStubDriver checks that the dispatcher wires everything @@ -107,12 +152,14 @@ func (s *DispatcherSuite) TearDownTest(c *check.C) { // a fake queue and cloud driver. The fake cloud driver injects // artificial errors in order to exercise a variety of code paths. func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) { - drivers["test"] = s.stubDriver + Drivers["test"] = s.stubDriver s.disp.setupOnce.Do(s.disp.initialize) queue := &test.Queue{ + MaxDispatchAttempts: 5, ChooseType: func(ctr *arvados.Container) (arvados.InstanceType, error) { return ChooseInstanceType(s.cluster, ctr) }, + Logger: ctxlog.TestLogger(c), } for i := 0; i < 200; i++ { queue.Containers = append(queue.Containers, arvados.Container{ @@ -141,6 +188,11 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) { return } delete(waiting, ctr.UUID) + if len(waiting) == 100 { + // trigger scheduler maxConcurrency limit + c.Logf("test: requesting 503 in order to trigger maxConcurrency limit") + s.disp.ArvClient.RequestAndDecode(nil, "GET", "503", nil, nil) + } if len(waiting) == 0 { close(done) } @@ -157,6 +209,7 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) { stubvm.CrunchRunDetachDelay = time.Duration(rand.Int63n(int64(10 * time.Millisecond))) stubvm.ExecuteContainer = executeContainer stubvm.CrashRunningContainer = finishContainer + stubvm.ExtraCrunchRunArgs = "'--runtime-engine=stub' '--foo' '--extra='\\''args'\\'''" switch n % 7 { case 0: stubvm.Broken = time.Now().Add(time.Duration(rand.Int63n(90)) * time.Millisecond) @@ -166,20 +219,28 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) { stubvm.ReportBroken = time.Now().Add(time.Duration(rand.Int63n(200)) * time.Millisecond) default: stubvm.CrunchRunCrashRate = 0.1 + stubvm.ArvMountDeadlockRate = 0.1 } } + s.stubDriver.Bugf = c.Errorf start := time.Now() go s.disp.run() err := s.disp.CheckHealth() c.Check(err, check.IsNil) - select { - case <-done: - c.Logf("containers finished (%s), waiting for instances to shutdown and queue to clear", time.Since(start)) - case <-time.After(10 * time.Second): - c.Fatalf("timed out; still waiting for %d containers: %q", len(waiting), waiting) + for len(waiting) > 0 { + waswaiting := len(waiting) + select { + case <-done: + // loop will end because len(waiting)==0 + case <-time.After(3 * time.Second): + if len(waiting) >= waswaiting { + c.Fatalf("timed out; no progress in 3s while waiting for %d containers: %q", len(waiting), waiting) + } + } } + c.Logf("containers finished (%s), waiting for instances to shutdown and queue to clear", time.Since(start)) deadline := time.Now().Add(5 * time.Second) for range time.NewTicker(10 * time.Millisecond).C { @@ -205,13 +266,33 @@ func (s *DispatcherSuite) TestDispatchToStubDriver(c *check.C) { c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="0",operation="Destroy"} [^0].*`) c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="1",operation="Create"} [^0].*`) c.Check(resp.Body.String(), check.Matches, `(?ms).*driver_operations{error="1",operation="List"} 0\n.*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*boot_outcomes{outcome="aborted"} [0-9]+\n.*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*boot_outcomes{outcome="disappeared"} [^0].*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*boot_outcomes{outcome="failure"} [^0].*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*boot_outcomes{outcome="success"} [^0].*`) c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="shutdown"} [^0].*`) c.Check(resp.Body.String(), check.Matches, `(?ms).*instances_disappeared{state="unknown"} 0\n.*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds{quantile="0.95"} [0-9.]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds_count [0-9]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ssh_seconds_sum [0-9.]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds{quantile="0.95"} [0-9.]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_count [0-9]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*time_to_ready_for_container_seconds_sum [0-9.]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_count [0-9]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_shutdown_request_to_disappearance_seconds_sum [0-9.]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_count [0-9]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*time_from_queue_to_crunch_run_seconds_sum [0-9e+.]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_count{outcome="success"} [0-9]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_sum{outcome="success"} [0-9e+.]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_count{outcome="fail"} [0-9]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*run_probe_duration_seconds_sum{outcome="fail"} [0-9e+.]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*last_503_time [1-9][0-9e+.]*`) + c.Check(resp.Body.String(), check.Matches, `(?ms).*max_concurrent_containers [1-9][0-9e+.]*`) } func (s *DispatcherSuite) TestAPIPermissions(c *check.C) { s.cluster.ManagementToken = "abcdefgh" - drivers["test"] = s.stubDriver + Drivers["test"] = s.stubDriver s.disp.setupOnce.Do(s.disp.initialize) s.disp.queue = &test.Queue{} go s.disp.run() @@ -233,7 +314,7 @@ func (s *DispatcherSuite) TestAPIPermissions(c *check.C) { func (s *DispatcherSuite) TestAPIDisabled(c *check.C) { s.cluster.ManagementToken = "" - drivers["test"] = s.stubDriver + Drivers["test"] = s.stubDriver s.disp.setupOnce.Do(s.disp.initialize) s.disp.queue = &test.Queue{} go s.disp.run() @@ -252,7 +333,7 @@ func (s *DispatcherSuite) TestAPIDisabled(c *check.C) { func (s *DispatcherSuite) TestInstancesAPI(c *check.C) { s.cluster.ManagementToken = "abcdefgh" s.cluster.Containers.CloudVMs.TimeoutBooting = arvados.Duration(time.Second) - drivers["test"] = s.stubDriver + Drivers["test"] = s.stubDriver s.disp.setupOnce.Do(s.disp.initialize) s.disp.queue = &test.Queue{} go s.disp.run() @@ -297,7 +378,7 @@ func (s *DispatcherSuite) TestInstancesAPI(c *check.C) { time.Sleep(time.Millisecond) } c.Assert(len(sr.Items), check.Equals, 1) - c.Check(sr.Items[0].Instance, check.Matches, "stub.*") + c.Check(sr.Items[0].Instance, check.Matches, "inst.*") c.Check(sr.Items[0].WorkerState, check.Equals, "booting") c.Check(sr.Items[0].Price, check.Equals, 0.123) c.Check(sr.Items[0].LastContainerUUID, check.Equals, "")