# %C number of VCPUs
# %M memory in MB
# %T tmp in MB
+ # %G number of GPU devices (runtime_constraints.cuda.device_count)
#
# Use %% to express a literal %. The %%J in the default will be changed
# to %J, which is interpreted by bsub itself.
# from /tmp, or adjust the "-o" and "-e" arguments accordingly.
BsubArgumentsList: ["-o", "/tmp/crunch-run.%%J.out", "-e", "/tmp/crunch-run.%%J.err", "-J", "%U", "-n", "%C", "-D", "%MMB", "-R", "rusage[mem=%MMB:tmp=%TMB] span[hosts=1]", "-R", "select[mem>=%MMB]", "-R", "select[tmp>=%TMB]", "-R", "select[ncpus>=%C]"]
+ # Arguments that will be appended to the bsub command line
+ # when submitting Arvados containers as LSF jobs with
+ # runtime_constraints.cuda.device_count > 0. The % substitutions
+ # listed above (e.g., %G) are applied to these arguments too.
+ BsubCUDAArguments: ["-gpu", "num=%G"]
+
# Use sudo to switch to this user account when submitting LSF
# jobs.
#
"%M": fmt.Sprintf("%d", mem),
"%T": fmt.Sprintf("%d", tmp),
"%U": container.UUID,
+ "%G": fmt.Sprintf("%d", container.RuntimeConstraints.CUDA.DeviceCount),
}
re := regexp.MustCompile(`%.`)
var substitutionErrors string
- for _, a := range disp.Cluster.Containers.LSF.BsubArgumentsList {
+ argumentTemplate := disp.Cluster.Containers.LSF.BsubArgumentsList
+ if container.RuntimeConstraints.CUDA.DeviceCount > 0 {
+ argumentTemplate = append(argumentTemplate, disp.Cluster.Containers.LSF.BsubCUDAArguments...)
+ }
+ for _, a := range argumentTemplate {
args = append(args, re.ReplaceAllStringFunc(a, func(s string) string {
subst := repl[s]
if len(subst) == 0 {
var _ = check.Suite(&suite{})
type suite struct {
- disp *dispatcher
- crTooBig arvados.ContainerRequest
+ disp *dispatcher // dispatcher whose Cluster.Containers.LSF settings the bsub stub compares against
+ crTooBig arvados.ContainerRequest // NOTE(review): presumably a request too large to be scheduled — confirm against the setup code
+ crCUDARequest arvados.ContainerRequest // filled in by the container_requests POST that sets CUDA.DeviceCount to 1
}
func (s *suite) TearDownTest(c *check.C) {
},
})
c.Assert(err, check.IsNil)
+
+ err = arvados.NewClientFromEnv().RequestAndDecode(&s.crCUDARequest, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{
+ "container_request": map[string]interface{}{
+ "runtime_constraints": arvados.RuntimeConstraints{
+ RAM: 16000000,
+ VCPUs: 1,
+ CUDA: arvados.CUDARuntimeConstraints{
+ DeviceCount: 1,
+ DriverVersion: "11.0",
+ HardwareCapability: "8.0",
+ },
+ },
+ "container_image": arvadostest.DockerImage112PDH,
+ "command": []string{"sleep", "1"},
+ "mounts": map[string]arvados.Mount{"/mnt/out": {Kind: "tmp", Capacity: 1000}},
+ "output_path": "/mnt/out",
+ "state": arvados.ContainerRequestStateCommitted,
+ "priority": 1,
+ "container_count_max": 1,
+ },
+ })
+ c.Assert(err, check.IsNil)
+
}
type lsfstub struct {
switch prog {
case "bsub":
defaultArgs := s.disp.Cluster.Containers.LSF.BsubArgumentsList
- c.Assert(len(args), check.Equals, len(defaultArgs))
+ if args[5] == s.crCUDARequest.ContainerUUID {
+ c.Assert(len(args), check.Equals, len(defaultArgs)+len(s.disp.Cluster.Containers.LSF.BsubCUDAArguments))
+ } else {
+ c.Assert(len(args), check.Equals, len(defaultArgs))
+ }
// %%J must have been rewritten to %J
c.Check(args[1], check.Equals, "/tmp/crunch-run.%J.out")
args = args[4:]
fakejobq[nextjobid] = args[1]
nextjobid++
mtx.Unlock()
+ case s.crCUDARequest.ContainerUUID:
+ c.Check(args, check.DeepEquals, []string{
+ "-J", s.crCUDARequest.ContainerUUID,
+ "-n", "1",
+ "-D", "528MB",
+ "-R", "rusage[mem=528MB:tmp=256MB] span[hosts=1]",
+ "-R", "select[mem>=528MB]",
+ "-R", "select[tmp>=256MB]",
+ "-R", "select[ncpus>=1]",
+ "-gpu", "num=1"})
+ mtx.Lock()
+ fakejobq[nextjobid] = args[1]
+ nextjobid++
+ mtx.Unlock()
default:
c.Errorf("unexpected uuid passed to bsub: args %q", args)
return exec.Command("false")