19418: LSF: use InstanceTypes to detect unsatisfiable constraints. 19418-lsf-unsatisfiable
author Tom Clegg <tom@curii.com>
Tue, 4 Oct 2022 15:30:40 +0000 (11:30 -0400)
committer Tom Clegg <tom@curii.com>
Tue, 4 Oct 2022 15:30:40 +0000 (11:30 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tom@curii.com>

doc/install/crunch2-lsf/install-dispatch.html.textile.liquid
lib/lsf/dispatch.go
lib/lsf/dispatch_test.go

index 37adffd18d4e9bef5162614b015a3155df3333a5..ded244046dde211ea2b18dab7779d5159ffc100e 100644 (file)
@@ -62,7 +62,7 @@ Alternatively, you can arrange for the arvados-dispatch-lsf process to run as an
 </notextile>
 
 
-h3(#SbatchArguments). Containers.LSF.BsubArgumentsList
+h3(#BsubArgumentsList). Containers.LSF.BsubArgumentsList
 
 When arvados-dispatch-lsf invokes @bsub@, you can add arguments to the command by specifying @BsubArgumentsList@.  You can use this to send the jobs to specific cluster partitions or add resource requests.  Set @BsubArgumentsList@ to an array of strings.
 
@@ -87,7 +87,7 @@ For example:
 
 Note that the default value for @BsubArgumentsList@ uses the @-o@ and @-e@ arguments to write stdout/stderr data to files in @/tmp@ on the compute nodes, which is helpful for troubleshooting installation/configuration problems. Ensure you have something in place to delete old files from @/tmp@, or adjust these arguments accordingly.
 
-h3(#SbatchArguments). Containers.LSF.BsubCUDAArguments
+h3(#BsubCUDAArguments). Containers.LSF.BsubCUDAArguments
 
 If the container requests access to GPUs (@runtime_constraints.cuda.device_count@ of the container request is greater than zero), the command line arguments in @BsubCUDAArguments@ will be added to the command line _after_ @BsubArgumentsList@.  This should consist of the additional @bsub@ flags your site requires to schedule the job on a node with GPU support.  Set @BsubCUDAArguments@ to an array of strings.  For example:
 
@@ -98,7 +98,7 @@ If the container requests access to GPUs (@runtime_constraints.cuda.device_count
 </pre>
 </notextile>
 
-h3(#PollPeriod). Containers.PollInterval
+h3(#PollInterval). Containers.PollInterval
 
 arvados-dispatch-lsf polls the API server periodically for new containers to run.  The @PollInterval@ option controls how often this poll happens.  Set this to a string of numbers suffixed with one of the time units @s@, @m@, or @h@.  For example:
 
@@ -122,7 +122,7 @@ Supports suffixes @KB@, @KiB@, @MB@, @MiB@, @GB@, @GiB@, @TB@, @TiB@, @PB@, @PiB
 </notextile>
 
 
-h3(#CrunchRunCommand-network). Containers.CrunchRunArgumentList: Using host networking for containers
+h3(#CrunchRunArgumentList). Containers.CrunchRunArgumentList: Using host networking for containers
 
 Older Linux kernels (prior to 3.18) have bugs in network namespace handling which can lead to compute node lockups.  This is indicated by blocked kernel tasks in "Workqueue: netns cleanup_net".   If you are experiencing this problem, as a workaround you can disable use of network namespaces by Docker across the cluster.  Be aware this reduces container isolation, which may be a security risk.
 
@@ -134,6 +134,37 @@ Older Linux kernels (prior to 3.18) have bugs in network namespace handling whic
 </pre>
 </notextile>
 
+
+h3(#InstanceTypes). InstanceTypes: Avoid submitting jobs with unsatisfiable resource constraints
+
+LSF does not provide feedback when a submitted job's RAM, CPU, or disk space constraints cannot be satisfied by any node: the job will wait in the queue indefinitely with "pending" status, reported by Arvados as "queued".
+
+As a workaround, you can configure @InstanceTypes@ with your LSF cluster's compute node sizes. Arvados will use these sizes to determine when a container is impossible to run, and cancel it instead of submitting an LSF job.
+
+Apart from detecting non-runnable containers, the configured instance types will not have any effect on scheduling.
+
+<notextile>
+<pre>    InstanceTypes:
+      most-ram:
+        VCPUs: 8
+        RAM: 640GiB
+        IncludedScratch: 640GB
+      most-cpus:
+        VCPUs: 32
+        RAM: 256GiB
+        IncludedScratch: 640GB
+      gpu:
+        VCPUs: 8
+        RAM: 256GiB
+        IncludedScratch: 640GB
+        CUDA:
+          DriverVersion: "11.4"
+          HardwareCapability: "7.5"
+          DeviceCount: 1
+</pre>
+</notextile>
+
+
 {% assign arvados_component = 'arvados-dispatch-lsf' %}
 
 {% include 'install_packages' %}
index e2348337e62992eb4463947690e809e1927bb232..d362f66d14b3ee12b9a4fb6b197b9a34747d944c 100644 (file)
@@ -170,6 +170,19 @@ func (disp *dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Contain
        if ctr.State != dispatch.Locked {
                // already started by prior invocation
        } else if _, ok := disp.lsfqueue.Lookup(ctr.UUID); !ok {
+               if _, err := dispatchcloud.ChooseInstanceType(disp.Cluster, &ctr); errors.As(err, &dispatchcloud.ConstraintsNotSatisfiableError{}) {
+                       err := disp.arvDispatcher.Arv.Update("containers", ctr.UUID, arvadosclient.Dict{
+                               "container": map[string]interface{}{
+                                       "runtime_status": map[string]string{
+                                               "error": err.Error(),
+                                       },
+                               },
+                       }, nil)
+                       if err != nil {
+                               return fmt.Errorf("error setting runtime_status on %s: %s", ctr.UUID, err)
+                       }
+                       return disp.arvDispatcher.UpdateState(ctr.UUID, dispatch.Cancelled)
+               }
                disp.logger.Printf("Submitting container %s to LSF", ctr.UUID)
                cmd := []string{disp.Cluster.Containers.CrunchRunCommand}
                cmd = append(cmd, "--runtime-engine="+disp.Cluster.Containers.RuntimeEngine)
@@ -184,9 +197,8 @@ func (disp *dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Contain
        defer disp.logger.Printf("Done monitoring container %s", ctr.UUID)
 
        go func(uuid string) {
-               cancelled := false
                for ctx.Err() == nil {
-                       qent, ok := disp.lsfqueue.Lookup(uuid)
+                       _, ok := disp.lsfqueue.Lookup(uuid)
                        if !ok {
                                // If the container disappears from
                                // the lsf queue, there is no point in
@@ -196,25 +208,6 @@ func (disp *dispatcher) runContainer(_ *dispatch.Dispatcher, ctr arvados.Contain
                                cancel()
                                return
                        }
-                       if !cancelled && qent.Stat == "PEND" && strings.Contains(qent.PendReason, "There are no suitable hosts for the job") {
-                               disp.logger.Printf("container %s: %s", uuid, qent.PendReason)
-                               err := disp.arvDispatcher.Arv.Update("containers", uuid, arvadosclient.Dict{
-                                       "container": map[string]interface{}{
-                                               "runtime_status": map[string]string{
-                                                       "error": qent.PendReason,
-                                               },
-                                       },
-                               }, nil)
-                               if err != nil {
-                                       disp.logger.Printf("error setting runtime_status on %s: %s", uuid, err)
-                                       continue // retry
-                               }
-                               err = disp.arvDispatcher.UpdateState(uuid, dispatch.Cancelled)
-                               if err != nil {
-                                       continue // retry (UpdateState() already logged the error)
-                               }
-                               cancelled = true
-                       }
                }
        }(ctr.UUID)
 
index a99983f34a8ae4163f9a91ba59c43ab9e57c3e00..e51e719066cbdf2b3f71d245eea9a7fc326fcbc3 100644 (file)
@@ -32,6 +32,7 @@ var _ = check.Suite(&suite{})
 type suite struct {
        disp          *dispatcher
        crTooBig      arvados.ContainerRequest
+       crPending     arvados.ContainerRequest
        crCUDARequest arvados.ContainerRequest
 }
 
@@ -46,6 +47,13 @@ func (s *suite) SetUpTest(c *check.C) {
        c.Assert(err, check.IsNil)
        cluster.Containers.CloudVMs.PollInterval = arvados.Duration(time.Second / 4)
        cluster.Containers.MinRetryPeriod = arvados.Duration(time.Second / 4)
+       cluster.InstanceTypes = arvados.InstanceTypeMap{
+               "biggest_available_node": arvados.InstanceType{
+                       RAM:             100 << 30, // 100 GiB
+                       VCPUs:           4,
+                       IncludedScratch: 100 << 30,
+                       Scratch:         100 << 30,
+               }}
        s.disp = newHandler(context.Background(), cluster, arvadostest.Dispatch1Token, prometheus.NewRegistry()).(*dispatcher)
        s.disp.lsfcli.stubCommand = func(string, ...string) *exec.Cmd {
                return exec.Command("bash", "-c", "echo >&2 unimplemented stub; false")
@@ -67,6 +75,23 @@ func (s *suite) SetUpTest(c *check.C) {
        })
        c.Assert(err, check.IsNil)
 
+       err = arvados.NewClientFromEnv().RequestAndDecode(&s.crPending, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{
+               "container_request": map[string]interface{}{
+                       "runtime_constraints": arvados.RuntimeConstraints{
+                               RAM:   100000000,
+                               VCPUs: 2,
+                       },
+                       "container_image":     arvadostest.DockerImage112PDH,
+                       "command":             []string{"sleep", "1"},
+                       "mounts":              map[string]arvados.Mount{"/mnt/out": {Kind: "tmp", Capacity: 1000}},
+                       "output_path":         "/mnt/out",
+                       "state":               arvados.ContainerRequestStateCommitted,
+                       "priority":            1,
+                       "container_count_max": 1,
+               },
+       })
+       c.Assert(err, check.IsNil)
+
        err = arvados.NewClientFromEnv().RequestAndDecode(&s.crCUDARequest, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{
                "container_request": map[string]interface{}{
                        "runtime_constraints": arvados.RuntimeConstraints{
@@ -150,15 +175,15 @@ func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ...
                                fakejobq[nextjobid] = args[1]
                                nextjobid++
                                mtx.Unlock()
-                       case s.crTooBig.ContainerUUID:
+                       case s.crPending.ContainerUUID:
                                c.Check(args, check.DeepEquals, []string{
-                                       "-J", s.crTooBig.ContainerUUID,
-                                       "-n", "1",
-                                       "-D", "954187MB",
-                                       "-R", "rusage[mem=954187MB:tmp=256MB] span[hosts=1]",
-                                       "-R", "select[mem>=954187MB]",
+                                       "-J", s.crPending.ContainerUUID,
+                                       "-n", "2",
+                                       "-D", "608MB",
+                                       "-R", "rusage[mem=608MB:tmp=256MB] span[hosts=1]",
+                                       "-R", "select[mem>=608MB]",
                                        "-R", "select[tmp>=256MB]",
-                                       "-R", "select[ncpus>=1]"})
+                                       "-R", "select[ncpus>=2]"})
                                mtx.Lock()
                                fakejobq[nextjobid] = args[1]
                                nextjobid++
@@ -187,7 +212,7 @@ func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ...
                        var records []map[string]interface{}
                        for jobid, uuid := range fakejobq {
                                stat, reason := "RUN", ""
-                               if uuid == s.crTooBig.ContainerUUID {
+                               if uuid == s.crPending.ContainerUUID {
                                        // The real bjobs output includes a trailing ';' here:
                                        stat, reason = "PEND", "There are no suitable hosts for the job;"
                                }
@@ -242,23 +267,28 @@ func (s *suite) TestSubmit(c *check.C) {
                        c.Error("timed out")
                        break
                }
+               // "crTooBig" should never be submitted to lsf because
+               // it is bigger than any configured instance type
+               if ent, ok := s.disp.lsfqueue.Lookup(s.crTooBig.ContainerUUID); ok {
+                       c.Errorf("Lookup(crTooBig) == true, ent = %#v", ent)
+                       break
+               }
                // "queuedcontainer" should be running
                if _, ok := s.disp.lsfqueue.Lookup(arvadostest.QueuedContainerUUID); !ok {
                        c.Log("Lookup(queuedcontainer) == false")
                        continue
                }
+               // "crPending" should be pending
+               if ent, ok := s.disp.lsfqueue.Lookup(s.crPending.ContainerUUID); !ok {
+                       c.Logf("Lookup(crPending) == false", ent)
+                       continue
+               }
                // "lockedcontainer" should be cancelled because it
                // has priority 0 (no matching container requests)
                if ent, ok := s.disp.lsfqueue.Lookup(arvadostest.LockedContainerUUID); ok {
                        c.Logf("Lookup(lockedcontainer) == true, ent = %#v", ent)
                        continue
                }
-               // "crTooBig" should be cancelled because lsf stub
-               // reports there is no suitable instance type
-               if ent, ok := s.disp.lsfqueue.Lookup(s.crTooBig.ContainerUUID); ok {
-                       c.Logf("Lookup(crTooBig) == true, ent = %#v", ent)
-                       continue
-               }
                var ctr arvados.Container
                if err := s.disp.arvDispatcher.Arv.Get("containers", arvadostest.LockedContainerUUID, nil, &ctr); err != nil {
                        c.Logf("error getting container state for %s: %s", arvadostest.LockedContainerUUID, err)
@@ -275,7 +305,7 @@ func (s *suite) TestSubmit(c *check.C) {
                        c.Logf("container %s is not in the LSF queue but its arvados record has not been updated to state==Cancelled (state is %q)", s.crTooBig.ContainerUUID, ctr.State)
                        continue
                } else {
-                       c.Check(ctr.RuntimeStatus["error"], check.Equals, "There are no suitable hosts for the job;")
+                       c.Check(ctr.RuntimeStatus["error"], check.Equals, "constraints not satisfiable by any configured instance type")
                }
                c.Log("reached desired state")
                break