X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/565612fd40474044e2afaa4fcb993c8c0197ca8e..37516bc14fdfe634c78764c15f3a8eb3a09b403c:/lib/lsf/dispatch_test.go diff --git a/lib/lsf/dispatch_test.go b/lib/lsf/dispatch_test.go index c678a9a481..e1e0bcae31 100644 --- a/lib/lsf/dispatch_test.go +++ b/lib/lsf/dispatch_test.go @@ -32,7 +32,9 @@ var _ = check.Suite(&suite{}) type suite struct { disp *dispatcher crTooBig arvados.ContainerRequest + crPending arvados.ContainerRequest crCUDARequest arvados.ContainerRequest + crMaxRunTime arvados.ContainerRequest } func (s *suite) TearDownTest(c *check.C) { @@ -44,7 +46,16 @@ func (s *suite) SetUpTest(c *check.C) { c.Assert(err, check.IsNil) cluster, err := cfg.GetCluster("") c.Assert(err, check.IsNil) - cluster.Containers.CloudVMs.PollInterval = arvados.Duration(time.Second) + cluster.Containers.ReserveExtraRAM = 256 << 20 + cluster.Containers.CloudVMs.PollInterval = arvados.Duration(time.Second / 4) + cluster.Containers.MinRetryPeriod = arvados.Duration(time.Second / 4) + cluster.InstanceTypes = arvados.InstanceTypeMap{ + "biggest_available_node": arvados.InstanceType{ + RAM: 100 << 30, // 100 GiB + VCPUs: 4, + IncludedScratch: 100 << 30, + Scratch: 100 << 30, + }} s.disp = newHandler(context.Background(), cluster, arvadostest.Dispatch1Token, prometheus.NewRegistry()).(*dispatcher) s.disp.lsfcli.stubCommand = func(string, ...string) *exec.Cmd { return exec.Command("bash", "-c", "echo >&2 unimplemented stub; false") @@ -66,6 +77,24 @@ func (s *suite) SetUpTest(c *check.C) { }) c.Assert(err, check.IsNil) + err = arvados.NewClientFromEnv().RequestAndDecode(&s.crPending, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{ + "container_request": map[string]interface{}{ + "runtime_constraints": arvados.RuntimeConstraints{ + RAM: 100000000, + VCPUs: 2, + KeepCacheDisk: 8 << 30, + }, + "container_image": arvadostest.DockerImage112PDH, + "command": []string{"sleep", "1"}, + "mounts": map[string]arvados.Mount{"/mnt/out": {Kind: "tmp", Capacity: 1000}}, + "output_path": "/mnt/out", + "state": arvados.ContainerRequestStateCommitted, + "priority": 1, + "container_count_max": 1, + }, + }) + c.Assert(err, check.IsNil) + err = arvados.NewClientFromEnv().RequestAndDecode(&s.crCUDARequest, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{ "container_request": map[string]interface{}{ "runtime_constraints": arvados.RuntimeConstraints{ @@ -88,6 +117,25 @@ func (s *suite) SetUpTest(c *check.C) { }) c.Assert(err, check.IsNil) + err = arvados.NewClientFromEnv().RequestAndDecode(&s.crMaxRunTime, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{ + "container_request": map[string]interface{}{ + "runtime_constraints": arvados.RuntimeConstraints{ + RAM: 1000000, + VCPUs: 1, + }, + "scheduling_parameters": arvados.SchedulingParameters{ + MaxRunTime: 124, + }, + "container_image": arvadostest.DockerImage112PDH, + "command": []string{"sleep", "123"}, + "mounts": map[string]arvados.Mount{"/mnt/out": {Kind: "tmp", Capacity: 1000}}, + "output_path": "/mnt/out", + "state": arvados.ContainerRequestStateCommitted, + "priority": 1, + "container_count_max": 1, + }, + }) + c.Assert(err, check.IsNil) } type lsfstub struct { @@ -113,12 +161,7 @@ func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ... } switch prog { case "bsub": - defaultArgs := s.disp.Cluster.Containers.LSF.BsubArgumentsList - if args[5] == s.crCUDARequest.ContainerUUID { - c.Assert(len(args), check.Equals, len(defaultArgs)+len(s.disp.Cluster.Containers.LSF.BsubCUDAArguments)) - } else { - c.Assert(len(args), check.Equals, len(defaultArgs)) - } + c.Assert(len(args) > 5, check.Equals, true) // %%J must have been rewritten to %J c.Check(args[1], check.Equals, "/tmp/crunch-run.%J.out") args = args[4:] @@ -149,15 +192,15 @@ func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ... fakejobq[nextjobid] = args[1] nextjobid++ mtx.Unlock() - case s.crTooBig.ContainerUUID: + case s.crPending.ContainerUUID: c.Check(args, check.DeepEquals, []string{ - "-J", s.crTooBig.ContainerUUID, - "-n", "1", - "-D", "954187MB", - "-R", "rusage[mem=954187MB:tmp=256MB] span[hosts=1]", - "-R", "select[mem>=954187MB]", - "-R", "select[tmp>=256MB]", - "-R", "select[ncpus>=1]"}) + "-J", s.crPending.ContainerUUID, + "-n", "2", + "-D", "352MB", + "-R", "rusage[mem=352MB:tmp=8448MB] span[hosts=1]", + "-R", "select[mem>=352MB]", + "-R", "select[tmp>=8448MB]", + "-R", "select[ncpus>=2]"}) mtx.Lock() fakejobq[nextjobid] = args[1] nextjobid++ @@ -176,6 +219,21 @@ func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ... fakejobq[nextjobid] = args[1] nextjobid++ mtx.Unlock() + case s.crMaxRunTime.ContainerUUID: + c.Check(args, check.DeepEquals, []string{ + "-J", s.crMaxRunTime.ContainerUUID, + "-n", "1", + "-D", "257MB", + "-R", "rusage[mem=257MB:tmp=2304MB] span[hosts=1]", + "-R", "select[mem>=257MB]", + "-R", "select[tmp>=2304MB]", + "-R", "select[ncpus>=1]", + "-We", "8", // 124s + 5m overhead + roundup = 8m + }) + mtx.Lock() + fakejobq[nextjobid] = args[1] + nextjobid++ + mtx.Unlock() default: c.Errorf("unexpected uuid passed to bsub: args %q", args) return exec.Command("false") @@ -186,7 +244,7 @@ func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ... var records []map[string]interface{} for jobid, uuid := range fakejobq { stat, reason := "RUN", "" - if uuid == s.crTooBig.ContainerUUID { + if uuid == s.crPending.ContainerUUID { // The real bjobs output includes a trailing ';' here: stat, reason = "PEND", "There are no suitable hosts for the job;" } @@ -241,18 +299,26 @@ func (s *suite) TestSubmit(c *check.C) { c.Error("timed out") break } + // "crTooBig" should never be submitted to lsf because + // it is bigger than any configured instance type + if ent, ok := s.disp.lsfqueue.Lookup(s.crTooBig.ContainerUUID); ok { + c.Errorf("Lookup(crTooBig) == true, ent = %#v", ent) + break + } // "queuedcontainer" should be running if _, ok := s.disp.lsfqueue.Lookup(arvadostest.QueuedContainerUUID); !ok { + c.Log("Lookup(queuedcontainer) == false") continue } - // "lockedcontainer" should be cancelled because it - // has priority 0 (no matching container requests) - if _, ok := s.disp.lsfqueue.Lookup(arvadostest.LockedContainerUUID); ok { + // "crPending" should be pending + if ent, ok := s.disp.lsfqueue.Lookup(s.crPending.ContainerUUID); !ok { + c.Logf("Lookup(crPending) == false", ent) continue } - // "crTooBig" should be cancelled because lsf stub - // reports there is no suitable instance type - if _, ok := s.disp.lsfqueue.Lookup(s.crTooBig.ContainerUUID); ok { + // "lockedcontainer" should be cancelled because it + // has priority 0 (no matching container requests) + if ent, ok := s.disp.lsfqueue.Lookup(arvadostest.LockedContainerUUID); ok { + c.Logf("Lookup(lockedcontainer) == true, ent = %#v", ent) continue } var ctr arvados.Container @@ -271,7 +337,7 @@ func (s *suite) TestSubmit(c *check.C) { c.Logf("container %s is not in the LSF queue but its arvados record has not been updated to state==Cancelled (state is %q)", s.crTooBig.ContainerUUID, ctr.State) continue } else { - c.Check(ctr.RuntimeStatus["error"], check.Equals, "There are no suitable hosts for the job;") + c.Check(ctr.RuntimeStatus["error"], check.Equals, "constraints not satisfiable by any configured instance type") } c.Log("reached desired state") break