X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/bfe0ea9b824dc057f07355a928ccb64ab68b6c57..6a22686571417278c242a3fa1a7f1018594f8fc4:/lib/crunchrun/crunchrun_test.go diff --git a/lib/crunchrun/crunchrun_test.go b/lib/crunchrun/crunchrun_test.go index 786f9410a8..aa20104f33 100644 --- a/lib/crunchrun/crunchrun_test.go +++ b/lib/crunchrun/crunchrun_test.go @@ -6,6 +6,7 @@ package crunchrun import ( "bytes" + "context" "crypto/md5" "encoding/json" "errors" @@ -13,14 +14,20 @@ import ( "io" "io/ioutil" "log" + "math/rand" "net/http" "net/http/httptest" + "net/http/httputil" + "net/url" "os" "os/exec" + "path" "regexp" "runtime/pprof" + "strconv" "strings" "sync" + "sync/atomic" "syscall" "testing" "time" @@ -31,9 +38,10 @@ import ( "git.arvados.org/arvados.git/sdk/go/arvadosclient" "git.arvados.org/arvados.git/sdk/go/arvadostest" "git.arvados.org/arvados.git/sdk/go/manifest" - "golang.org/x/net/context" . "gopkg.in/check.v1" + git_client "gopkg.in/src-d/go-git.v4/plumbing/transport/client" + git_http "gopkg.in/src-d/go-git.v4/plumbing/transport/http" ) // Gocheck boilerplate @@ -41,6 +49,8 @@ func TestCrunchExec(t *testing.T) { TestingT(t) } +const logLineStart = `(?m)(.*\n)*\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d\.\d+Z ` + var _ = Suite(&TestSuite{}) type TestSuite struct { @@ -410,6 +420,67 @@ func (client *KeepTestClient) ManifestFileReader(m manifest.Manifest, filename s return nil, nil } +type apiStubServer struct { + server *httptest.Server + proxy *httputil.ReverseProxy + intercept func(http.ResponseWriter, *http.Request) bool + + container arvados.Container + logs map[string]string +} + +func apiStub() (*arvados.Client, *apiStubServer) { + client := arvados.NewClientFromEnv() + apistub := &apiStubServer{} + apistub.server = httptest.NewTLSServer(apistub) + apistub.proxy = httputil.NewSingleHostReverseProxy(&url.URL{Scheme: "https", Host: client.APIHost}) + if client.Insecure { + apistub.proxy.Transport = arvados.InsecureHTTPClient.Transport + } + client.APIHost = apistub.server.Listener.Addr().String() + return client, apistub +} + +func (apistub *apiStubServer) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if apistub.intercept != nil && apistub.intercept(w, r) { + return + } + if r.Method == "POST" && r.URL.Path == "/arvados/v1/logs" { + var body struct { + Log struct { + EventType string `json:"event_type"` + Properties struct { + Text string + } + } + } + json.NewDecoder(r.Body).Decode(&body) + apistub.logs[body.Log.EventType] += body.Log.Properties.Text + return + } + if r.Method == "GET" && r.URL.Path == "/arvados/v1/collections/"+hwPDH { + json.NewEncoder(w).Encode(arvados.Collection{ManifestText: hwManifest}) + return + } + if r.Method == "GET" && r.URL.Path == "/arvados/v1/collections/"+otherPDH { + json.NewEncoder(w).Encode(arvados.Collection{ManifestText: otherManifest}) + return + } + if r.Method == "GET" && r.URL.Path == "/arvados/v1/collections/"+normalizedWithSubdirsPDH { + json.NewEncoder(w).Encode(arvados.Collection{ManifestText: normalizedManifestWithSubdirs}) + return + } + if r.Method == "GET" && r.URL.Path == "/arvados/v1/collections/"+denormalizedWithSubdirsPDH { + json.NewEncoder(w).Encode(arvados.Collection{ManifestText: denormalizedManifestWithSubdirs}) + return + } + if r.Method == "GET" && r.URL.Path == "/arvados/v1/containers/"+apistub.container.UUID { + json.NewEncoder(w).Encode(apistub.container) + return + } + apistub.proxy.ServeHTTP(w, r) +} + func (s *TestSuite) TestLoadImage(c *C) { s.runner.Container.ContainerImage = arvadostest.DockerImage112PDH s.runner.Container.Mounts = map[string]arvados.Mount{ @@ -681,8 +752,9 @@ func (s *TestSuite) fullRunHelper(c *C, record string, extraMounts []string, fn } return d, err } + client, _ := apiStub() s.runner.MkArvClient = func(token string) (IArvadosClient, IKeepClient, *arvados.Client, error) { - return &ArvTestClient{secretMounts: secretMounts}, &s.testContainerKeepClient, nil, nil + return &ArvTestClient{secretMounts: secretMounts}, &s.testContainerKeepClient, client, nil } if extraMounts != nil && len(extraMounts) > 0 { @@ -777,38 +849,50 @@ func (s *TestSuite) TestRunAlreadyRunning(c *C) { c.Check(ran, Equals, false) } -func (s *TestSuite) TestSpotInterruptionNotice(c *C) { - var failedOnce bool - var stoptime time.Time - token := "fake-ec2-metadata-token" - stub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if !failedOnce { +func ec2MetadataServerStub(c *C, token *string, failureRate float64, stoptime *atomic.Value) *httptest.Server { + failedOnce := false + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !failedOnce || rand.Float64() < failureRate { w.WriteHeader(http.StatusServiceUnavailable) failedOnce = true return } switch r.URL.Path { case "/latest/api/token": - fmt.Fprintln(w, token) + fmt.Fprintln(w, *token) case "/latest/meta-data/spot/instance-action": - if r.Header.Get("X-aws-ec2-metadata-token") != token { + if r.Header.Get("X-aws-ec2-metadata-token") != *token { w.WriteHeader(http.StatusUnauthorized) - } else if stoptime.IsZero() { + } else if t, _ := stoptime.Load().(time.Time); t.IsZero() { w.WriteHeader(http.StatusNotFound) } else { - fmt.Fprintf(w, `{"action":"stop","time":"%s"}`, stoptime.Format(time.RFC3339)) + fmt.Fprintf(w, `{"action":"stop","time":"%s"}`, t.Format(time.RFC3339)) } default: w.WriteHeader(http.StatusNotFound) } })) +} + +func (s *TestSuite) TestSpotInterruptionNotice(c *C) { + s.testSpotInterruptionNotice(c, 0.1) +} + +func (s *TestSuite) TestSpotInterruptionNoticeNotAvailable(c *C) { + s.testSpotInterruptionNotice(c, 1) +} + +func (s *TestSuite) testSpotInterruptionNotice(c *C, failureRate float64) { + var stoptime atomic.Value + token := "fake-ec2-metadata-token" + stub := ec2MetadataServerStub(c, &token, failureRate, &stoptime) defer stub.Close() defer func(i time.Duration, u string) { spotInterruptionCheckInterval = i ec2MetadataBaseURL = u }(spotInterruptionCheckInterval, ec2MetadataBaseURL) - spotInterruptionCheckInterval = time.Second / 4 + spotInterruptionCheckInterval = time.Second / 8 ec2MetadataBaseURL = stub.URL go s.runner.checkSpotInterruptionNotices() @@ -824,13 +908,22 @@ func (s *TestSuite) TestSpotInterruptionNotice(c *C) { "state": "Locked" }`, nil, func() int { time.Sleep(time.Second) - stoptime = time.Now().Add(time.Minute).UTC() + stoptime.Store(time.Now().Add(time.Minute).UTC()) + token = "different-fake-ec2-metadata-token" time.Sleep(time.Second) return 0 }) - c.Check(s.api.Logs["crunch-run"].String(), Matches, `(?ms).*Checking for spot interruptions every 250ms using instance metadata at http://.*`) + c.Check(s.api.Logs["crunch-run"].String(), Matches, `(?ms).*Checking for spot interruptions every 125ms using instance metadata at http://.*`) c.Check(s.api.Logs["crunch-run"].String(), Matches, `(?ms).*Error checking spot interruptions: 503 Service Unavailable.*`) - c.Check(s.api.Logs["crunch-run"].String(), Matches, `(?ms).*Cloud provider indicates instance action "stop" scheduled for time "`+stoptime.Format(time.RFC3339)+`".*`) + if failureRate == 1 { + c.Check(s.api.Logs["crunch-run"].String(), Matches, `(?ms).*Giving up on checking spot interruptions after too many consecutive failures.*`) + } else { + text := `Cloud provider scheduled instance stop at ` + stoptime.Load().(time.Time).Format(time.RFC3339) + c.Check(s.api.Logs["crunch-run"].String(), Matches, `(?ms).*`+text+`.*`) + c.Check(s.api.CalledWith("container.runtime_status.warning", "preemption notice"), NotNil) + c.Check(s.api.CalledWith("container.runtime_status.warningDetail", text), NotNil) + c.Check(s.api.CalledWith("container.runtime_status.preemptionNotice", text), NotNil) + } } func (s *TestSuite) TestRunTimeExceeded(c *C) { @@ -962,6 +1055,70 @@ func (s *TestSuite) TestLogVersionAndRuntime(c *C) { c.Check(s.api.Logs["crunch-run"].String(), Matches, `(?ms).*Using container runtime: stub.*`) } +func (s *TestSuite) testLogRSSThresholds(c *C, ram int, expected []int, notExpected int) { + s.runner.cgroupRoot = "testdata/fakestat" + s.fullRunHelper(c, `{ + "command": ["true"], + "container_image": "`+arvadostest.DockerImage112PDH+`", + "cwd": ".", + "environment": {}, + "mounts": {"/tmp": {"kind": "tmp"} }, + "output_path": "/tmp", + "priority": 1, + "runtime_constraints": {"ram": `+strconv.Itoa(ram)+`}, + "state": "Locked" + }`, nil, func() int { return 0 }) + logs := s.api.Logs["crunch-run"].String() + pattern := logLineStart + `Container using over %d%% of memory \(rss 734003200/%d bytes\)` + var threshold int + for _, threshold = range expected { + c.Check(logs, Matches, fmt.Sprintf(pattern, threshold, ram)) + } + if notExpected > threshold { + c.Check(logs, Not(Matches), fmt.Sprintf(pattern, notExpected, ram)) + } +} + +func (s *TestSuite) TestLogNoRSSThresholds(c *C) { + s.testLogRSSThresholds(c, 7340032000, []int{}, 90) +} + +func (s *TestSuite) TestLogSomeRSSThresholds(c *C) { + onePercentRSS := 7340032 + s.testLogRSSThresholds(c, 102*onePercentRSS, []int{90, 95}, 99) +} + +func (s *TestSuite) TestLogAllRSSThresholds(c *C) { + s.testLogRSSThresholds(c, 734003299, []int{90, 95, 99}, 0) +} + +func (s *TestSuite) TestLogMaximaAfterRun(c *C) { + s.runner.cgroupRoot = "testdata/fakestat" + s.runner.parentTemp = c.MkDir() + s.fullRunHelper(c, `{ + "command": ["true"], + "container_image": "`+arvadostest.DockerImage112PDH+`", + "cwd": ".", + "environment": {}, + "mounts": {"/tmp": {"kind": "tmp"} }, + "output_path": "/tmp", + "priority": 1, + "runtime_constraints": {"ram": 7340032000}, + "state": "Locked" + }`, nil, func() int { return 0 }) + logs := s.api.Logs["crunch-run"].String() + for _, expected := range []string{ + `Maximum disk usage was \d+%, \d+/\d+ bytes`, + `Maximum container memory cache usage was 73400320 bytes`, + `Maximum container memory swap usage was 320 bytes`, + `Maximum container memory pgmajfault usage was 20 faults`, + `Maximum container memory rss usage was 10%, 734003200/7340032000 bytes`, + `Maximum crunch-run memory rss usage was \d+ bytes`, + } { + c.Check(logs, Matches, logLineStart+expected) + } +} + func (s *TestSuite) TestCommitNodeInfoBeforeStart(c *C) { var collection_create, container_update arvadosclient.Dict s.fullRunHelper(c, `{ @@ -1261,6 +1418,7 @@ func (s *TestSuite) TestSetupMounts(c *C) { cr := s.runner am := &ArvMountCmdLine{} cr.RunArvMount = am.ArvMountTest + cr.containerClient, _ = apiStub() cr.ContainerArvClient = &ArvTestClient{} cr.ContainerKeepClient = &KeepTestClient{} cr.Container.OutputStorageClasses = []string{"default"} @@ -1583,7 +1741,7 @@ func (s *TestSuite) TestSetupMounts(c *C) { { i = 0 cr.ArvMountPoint = "" - (*GitMountSuite)(nil).useTestGitServer(c) + git_client.InstallProtocol("https", git_http.NewClient(arvados.InsecureHTTPClient)) cr.token = arvadostest.ActiveToken cr.Container.Mounts = make(map[string]arvados.Mount) cr.Container.Mounts = map[string]arvados.Mount{ @@ -2260,6 +2418,80 @@ func (s *TestSuite) TestCalculateCost(c *C) { c.Check(logbuf.String(), Not(Matches), `(?ms).*changed to 2\.00 .* changed to 2\.00 .*`) } +func (s *TestSuite) TestSIGUSR2CostUpdate(c *C) { + pid := os.Getpid() + now := time.Now() + pricesJSON, err := json.Marshal([]cloud.InstancePrice{ + {StartTime: now.Add(-4 * time.Hour), Price: 2.4}, + {StartTime: now.Add(-2 * time.Hour), Price: 2.6}, + }) + c.Assert(err, IsNil) + + os.Setenv("InstanceType", `{"Price":2.2}`) + defer os.Unsetenv("InstanceType") + defer func(s string) { lockdir = s }(lockdir) + lockdir = c.MkDir() + + // We can't use s.api.CalledWith because timing differences will yield + // different cost values across runs. getCostUpdate iterates over API + // calls until it finds one that sets the cost, then writes that value + // to the next index of costUpdates. + deadline := now.Add(time.Second) + costUpdates := make([]float64, 2) + costIndex := 0 + apiIndex := 0 + getCostUpdate := func() { + for ; time.Now().Before(deadline); time.Sleep(time.Second / 10) { + for apiIndex < len(s.api.Content) { + update := s.api.Content[apiIndex] + apiIndex++ + var ok bool + var cost float64 + if update, ok = update["container"].(arvadosclient.Dict); !ok { + continue + } + if cost, ok = update["cost"].(float64); !ok { + continue + } + c.Logf("API call #%d updates cost to %v", apiIndex-1, cost) + costUpdates[costIndex] = cost + costIndex++ + return + } + } + } + + s.fullRunHelper(c, `{ + "command": ["true"], + "container_image": "`+arvadostest.DockerImage112PDH+`", + "cwd": ".", + "environment": {}, + "mounts": {"/tmp": {"kind": "tmp"} }, + "output_path": "/tmp", + "priority": 1, + "runtime_constraints": {}, + "state": "Locked", + "uuid": "zzzzz-dz642-20230320101530a" + }`, nil, func() int { + s.runner.costStartTime = now.Add(-3 * time.Hour) + err := syscall.Kill(pid, syscall.SIGUSR2) + c.Check(err, IsNil, Commentf("error sending first SIGUSR2 to runner")) + getCostUpdate() + + err = os.WriteFile(path.Join(lockdir, pricesfile), pricesJSON, 0o700) + c.Check(err, IsNil, Commentf("error writing JSON prices file")) + err = syscall.Kill(pid, syscall.SIGUSR2) + c.Check(err, IsNil, Commentf("error sending second SIGUSR2 to runner")) + getCostUpdate() + + return 0 + }) + // Comparing with format strings makes it easy to ignore minor variations + // in cost across runs while keeping diagnostics pretty. + c.Check(fmt.Sprintf("%.3f", costUpdates[0]), Equals, "6.600") + c.Check(fmt.Sprintf("%.3f", costUpdates[1]), Equals, "7.600") +} + type FakeProcess struct { cmdLine []string }