12630: first try at adding GPU support, no tests yet
author Peter Amstutz <peter.amstutz@curii.com>
Mon, 22 Nov 2021 17:47:03 +0000 (12:47 -0500)
committer Peter Amstutz <peter.amstutz@curii.com>
Fri, 10 Dec 2021 16:24:50 +0000 (11:24 -0500)
Arvados-DCO-1.1-Signed-off-by: Peter Amstutz <peter.amstutz@curii.com>

lib/crunchrun/docker.go
lib/crunchrun/executor.go
lib/crunchrun/singularity.go
sdk/go/arvados/container.go

index 07f79bbcc2d11f0239a6231288a94d84a89f87fb..573df7faf0b66ac0fa3ac97195a14b8ed5cde0cc 100644 (file)
@@ -106,6 +106,13 @@ func (e *dockerExecutor) Create(spec containerSpec) error {
                        KernelMemory: spec.RAM, // kernel portion
                },
        }
+       if spec.EnableCUDA {
+               hostCfg.Resources.DeviceRequests = append(hostCfg.Resources.DeviceRequests, dockercontainer.DeviceRequest{
+                       Driver:       "nvidia",
+                       Count:        -1,
+                       Capabilities: [][]string{[]string{"gpu", "nvidia", "compute"}},
+               })
+       }
        for path, mount := range spec.BindMounts {
                bind := mount.HostPath + ":" + path
                if mount.ReadOnly {
index b7c341f3186b1af319780d19777d892be082a1cc..bffd701bcd4d0246841a2dfbb4e2d9cde5b9c2ab 100644 (file)
@@ -24,6 +24,7 @@ type containerSpec struct {
        BindMounts    map[string]bindmount
        Command       []string
        EnableNetwork bool
+       EnableCUDA    bool
        NetworkMode   string // docker network mode, normally "default"
        CgroupParent  string
        Stdin         io.Reader
index 5af023a83dc2dc61506818c88ccfadc5b0c22514..5637a9b4d924023688fc153e1f5a9ea57d199794 100644 (file)
@@ -246,6 +246,11 @@ func (e *singularityExecutor) Start() error {
        if !e.spec.EnableNetwork {
                args = append(args, "--net", "--network=none")
        }
+
+       if e.spec.EnableCUDA {
+               args = append(args, "--nv")
+       }
+
        readonlyflag := map[bool]string{
                false: "rw",
                true:  "ro",
index 7c68bdb20222f59067b5c5f1d89bad8ea6fef5fe..014fd6c2bdd27512849e37abf31ad2d2a2c5b5d0 100644 (file)
@@ -96,10 +96,14 @@ type Mount struct {
 // RuntimeConstraints specify a container's compute resources (RAM,
 // CPU) and network connectivity.
 type RuntimeConstraints struct {
-       API          bool  `json:"API"`
-       RAM          int64 `json:"ram"`
-       VCPUs        int   `json:"vcpus"`
-       KeepCacheRAM int64 `json:"keep_cache_ram"`
+       API                         bool     `json:"API"`
+       RAM                         int64    `json:"ram"`
+       VCPUs                       int      `json:"vcpus"`
+       KeepCacheRAM                int64    `json:"keep_cache_ram"`
+       CUDADriverVersion           string   `json:"cuda_driver_version"` // NOTE(review): presumably the minimum CUDA driver version required; expected string format is not shown here -- confirm
+       CUDACubinHardwareCapability []string `json:"cuda_cubin_hardware_capability"` // NOTE(review): presumably the cubin hardware capabilities acceptable to the container -- confirm
+       CUDAPTXHardwardCapability   string   `json:"cuda_ptx_hardware_capability"` // NOTE(review): "Hardward" looks like a typo for "Hardware" (the JSON key spells it "hardware")
+       CUDADeviceCount             int      `json:"cuda_device_count"` // NOTE(review): presumably the number of GPUs requested; confirm how zero is treated
 }
 
 // SchedulingParameters specify a container's scheduling parameters