Merge branch 'main' into 18324-lsf-gpu
authorPeter Amstutz <peter.amstutz@curii.com>
Fri, 7 Jan 2022 22:19:54 +0000 (17:19 -0500)
committerPeter Amstutz <peter.amstutz@curii.com>
Fri, 7 Jan 2022 23:14:33 +0000 (18:14 -0500)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

lib/config/config.default.yml
lib/lsf/dispatch.go
lib/lsf/dispatch_test.go
sdk/go/arvados/config.go

index 21e39a8a873403e801aa02504175f83a8aa6521f..17bba5410bb4f31efceb8b1b6ed74eb372d183b8 100644 (file)
@@ -1099,6 +1099,7 @@ Clusters:
         # %C number of VCPUs
         # %M memory in MB
         # %T tmp in MB
+        # %G number of GPU devices (runtime_constraints.cuda.device_count)
         #
         # Use %% to express a literal %. The %%J in the default will be changed
         # to %J, which is interpreted by bsub itself.
@@ -1109,6 +1110,11 @@ Clusters:
         # from /tmp, or adjust the "-o" and "-e" arguments accordingly.
         BsubArgumentsList: ["-o", "/tmp/crunch-run.%%J.out", "-e", "/tmp/crunch-run.%%J.err", "-J", "%U", "-n", "%C", "-D", "%MMB", "-R", "rusage[mem=%MMB:tmp=%TMB] span[hosts=1]", "-R", "select[mem>=%MMB]", "-R", "select[tmp>=%TMB]", "-R", "select[ncpus>=%C]"]
 
+        # Extra arguments appended to the bsub command line when a
+        # container has runtime_constraints.cuda.device_count > 0.
+        # The % substitutions listed above (e.g., %G) apply here too.
+        BsubCUDAArguments: ["-gpu", "num=%G"]
+
         # Use sudo to switch to this user account when submitting LSF
         # jobs.
         #
index 537d52a072d6a503262b1a228c868afc8f28b151..c9ed5582bc0a006c00ab1d94b7161fd455b65382 100644 (file)
@@ -306,11 +306,16 @@ func (disp *dispatcher) bsubArgs(container arvados.Container) ([]string, error)
                "%M": fmt.Sprintf("%d", mem),
                "%T": fmt.Sprintf("%d", tmp),
                "%U": container.UUID,
+               "%G": fmt.Sprintf("%d", container.RuntimeConstraints.CUDA.DeviceCount),
        }
 
        re := regexp.MustCompile(`%.`)
        var substitutionErrors string
-       for _, a := range disp.Cluster.Containers.LSF.BsubArgumentsList {
+       argumentTemplate := disp.Cluster.Containers.LSF.BsubArgumentsList
+       if container.RuntimeConstraints.CUDA.DeviceCount > 0 {
+               argumentTemplate = append(argumentTemplate, disp.Cluster.Containers.LSF.BsubCUDAArguments...)
+       }
+       for _, a := range argumentTemplate {
                args = append(args, re.ReplaceAllStringFunc(a, func(s string) string {
                        subst := repl[s]
                        if len(subst) == 0 {
index c044df09f65d42f5f4aad7903b60e27160d5ec98..c678a9a4815f951f3cdf499c9c4a97c6c6deaa22 100644 (file)
@@ -30,8 +30,9 @@ func Test(t *testing.T) {
 var _ = check.Suite(&suite{})
 
 type suite struct {
-       disp     *dispatcher
-       crTooBig arvados.ContainerRequest
+       disp          *dispatcher
+       crTooBig      arvados.ContainerRequest
+       crCUDARequest arvados.ContainerRequest
 }
 
 func (s *suite) TearDownTest(c *check.C) {
@@ -64,6 +65,29 @@ func (s *suite) SetUpTest(c *check.C) {
                },
        })
        c.Assert(err, check.IsNil)
+
+       err = arvados.NewClientFromEnv().RequestAndDecode(&s.crCUDARequest, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{
+               "container_request": map[string]interface{}{
+                       "runtime_constraints": arvados.RuntimeConstraints{
+                               RAM:   16000000,
+                               VCPUs: 1,
+                               CUDA: arvados.CUDARuntimeConstraints{
+                                       DeviceCount:        1,
+                                       DriverVersion:      "11.0",
+                                       HardwareCapability: "8.0",
+                               },
+                       },
+                       "container_image":     arvadostest.DockerImage112PDH,
+                       "command":             []string{"sleep", "1"},
+                       "mounts":              map[string]arvados.Mount{"/mnt/out": {Kind: "tmp", Capacity: 1000}},
+                       "output_path":         "/mnt/out",
+                       "state":               arvados.ContainerRequestStateCommitted,
+                       "priority":            1,
+                       "container_count_max": 1,
+               },
+       })
+       c.Assert(err, check.IsNil)
+
 }
 
 type lsfstub struct {
@@ -90,7 +114,11 @@ func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ...
                switch prog {
                case "bsub":
                        defaultArgs := s.disp.Cluster.Containers.LSF.BsubArgumentsList
-                       c.Assert(len(args), check.Equals, len(defaultArgs))
+                       if args[5] == s.crCUDARequest.ContainerUUID {
+                               c.Assert(len(args), check.Equals, len(defaultArgs)+len(s.disp.Cluster.Containers.LSF.BsubCUDAArguments))
+                       } else {
+                               c.Assert(len(args), check.Equals, len(defaultArgs))
+                       }
                        // %%J must have been rewritten to %J
                        c.Check(args[1], check.Equals, "/tmp/crunch-run.%J.out")
                        args = args[4:]
@@ -134,6 +162,20 @@ func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ...
                                fakejobq[nextjobid] = args[1]
                                nextjobid++
                                mtx.Unlock()
+                       case s.crCUDARequest.ContainerUUID:
+                               c.Check(args, check.DeepEquals, []string{
+                                       "-J", s.crCUDARequest.ContainerUUID,
+                                       "-n", "1",
+                                       "-D", "528MB",
+                                       "-R", "rusage[mem=528MB:tmp=256MB] span[hosts=1]",
+                                       "-R", "select[mem>=528MB]",
+                                       "-R", "select[tmp>=256MB]",
+                                       "-R", "select[ncpus>=1]",
+                                       "-gpu", "num=1"})
+                               mtx.Lock()
+                               fakejobq[nextjobid] = args[1]
+                               nextjobid++
+                               mtx.Unlock()
                        default:
                                c.Errorf("unexpected uuid passed to bsub: args %q", args)
                                return exec.Command("false")
index a1ab713e4a1eb43864922afecb11ee41139c0252..b8c8269f12acba74feb00edc07ec7949e0db5fc4 100644 (file)
@@ -486,6 +486,7 @@ type ContainersConfig struct {
        LSF struct {
                BsubSudoUser      string
                BsubArgumentsList []string
+               BsubCUDAArguments []string
        }
 }