From 39464fc833e3ee2fb771f83dce9f94e3856c1075 Mon Sep 17 00:00:00 2001 From: Ward Vandewege Date: Sat, 23 Oct 2021 11:42:14 -0400 Subject: [PATCH] 18290: LSF: make the bsub arguments completely configurable. Arvados-DCO-1.1-Signed-off-by: Ward Vandewege --- lib/config/config.default.yml | 17 +++++++++--- lib/config/generated_config.go | 17 +++++++++--- lib/lsf/dispatch.go | 49 ++++++++++++++++++++++++---------- lib/lsf/dispatch_test.go | 9 +++---- 4 files changed, 65 insertions(+), 27 deletions(-) diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml index 4e2a0e26d4..8b51a85d9a 100644 --- a/lib/config/config.default.yml +++ b/lib/config/config.default.yml @@ -1021,14 +1021,23 @@ Clusters: AssignNodeHostname: "compute%d" LSF: - # Additional arguments to bsub when submitting Arvados - # containers as LSF jobs. + # Arguments to bsub when submitting Arvados containers as LSF jobs. + # + # Template variables starting with % will be substituted as follows: + # + # %U uuid + # %C number of cpus + # %M memory in MB + # %T tmp in MB + # + # Use %% to express a literal %. The %%J in the default will be changed + # to %J, which is interpreted by bsub itself. # # Note that the default arguments cause LSF to write two files # in /tmp on the compute node each time an Arvados container # runs. Ensure you have something in place to delete old files - # from /tmp, or adjust these arguments accordingly. - BsubArgumentsList: ["-o", "/tmp/crunch-run.%J.out", "-e", "/tmp/crunch-run.%J.err"] + # from /tmp, or adjust the "-o" and "-e" arguments accordingly. + BsubArgumentsList: ["-o", "/tmp/crunch-run.%%J.out", "-e", "/tmp/crunch-run.%%J.err", "-J", "%U", "-n", "%C", "-D", "%MMB", "-R", "rusage[mem=%MMB:tmp=%TMB] span[hosts=1]"] # Use sudo to switch to this user account when submitting LSF # jobs. diff --git a/lib/config/generated_config.go b/lib/config/generated_config.go index 875939a3e1..519d1a8e5f 100644 --- a/lib/config/generated_config.go +++ b/lib/config/generated_config.go @@ -1027,14 +1027,23 @@ Clusters: AssignNodeHostname: "compute%d" LSF: - # Additional arguments to bsub when submitting Arvados - # containers as LSF jobs. + # Arguments to bsub when submitting Arvados containers as LSF jobs. + # + # Template variables starting with % will be substituted as follows: + # + # %U uuid + # %C number of cpus + # %M memory in MB + # %T tmp in MB + # + # Use %% to express a literal %. The %%J in the default will be changed + # to %J, which is interpreted by bsub itself. # # Note that the default arguments cause LSF to write two files # in /tmp on the compute node each time an Arvados container # runs. Ensure you have something in place to delete old files - # from /tmp, or adjust these arguments accordingly. - BsubArgumentsList: ["-o", "/tmp/crunch-run.%J.out", "-e", "/tmp/crunch-run.%J.err"] + # from /tmp, or adjust the "-o" and "-e" arguments accordingly. + BsubArgumentsList: ["-o", "/tmp/crunch-run.%%J.out", "-e", "/tmp/crunch-run.%%J.err", "-J", "%U", "-n", "%C", "-D", "%MMB", "-R", "rusage[mem=%MMB:tmp=%TMB] span[hosts=1]"] # Use sudo to switch to this user account when submitting LSF # jobs. diff --git a/lib/lsf/dispatch.go b/lib/lsf/dispatch.go index d3ba605aba..d17c458e80 100644 --- a/lib/lsf/dispatch.go +++ b/lib/lsf/dispatch.go @@ -270,28 +270,49 @@ func (disp *dispatcher) bkill(ctr arvados.Container) { } func (disp *dispatcher) bsubArgs(container arvados.Container) ([]string, error) { + tmpArgs := []string{} args := []string{"bsub"} - args = append(args, disp.Cluster.Containers.LSF.BsubArgumentsList...) - args = append(args, "-J", container.UUID) - args = append(args, disp.bsubConstraintArgs(container)...) - if u := disp.Cluster.Containers.LSF.BsubSudoUser; u != "" { - args = append([]string{"sudo", "-E", "-u", u}, args...) - } - return args, nil -} + tmpArgs = append(tmpArgs, disp.Cluster.Containers.LSF.BsubArgumentsList...) -func (disp *dispatcher) bsubConstraintArgs(container arvados.Container) []string { - // TODO: propagate container.SchedulingParameters.Partitions tmp := int64(math.Ceil(float64(dispatchcloud.EstimateScratchSpace(&container)) / 1048576)) vcpus := container.RuntimeConstraints.VCPUs mem := int64(math.Ceil(float64(container.RuntimeConstraints.RAM+ container.RuntimeConstraints.KeepCacheRAM+ int64(disp.Cluster.Containers.ReserveExtraRAM)) / 1048576)) - return []string{ - "-n", fmt.Sprintf("%d", vcpus), - "-D", fmt.Sprintf("%dMB", mem), // ulimit -d (note this doesn't limit the total container memory usage) - "-R", fmt.Sprintf("rusage[mem=%dMB:tmp=%dMB] span[hosts=1]", mem, tmp), + + r := regexp.MustCompile(`([^%]|^)%([^%])`) + undoubleRE := regexp.MustCompile(`%%`) + for _, a := range tmpArgs { + tmp := r.ReplaceAllStringFunc(a, func(m string) string { + parts := r.FindStringSubmatch(m) + return parts[1] + disp.substitute(parts[2], container.UUID, vcpus, mem, tmp) + }) + // handle escaped literal % symbols + tmp = undoubleRE.ReplaceAllString(tmp, "%") + args = append(args, tmp) + } + + if u := disp.Cluster.Containers.LSF.BsubSudoUser; u != "" { + args = append([]string{"sudo", "-E", "-u", u}, args...) + } + return args, nil +} + +func (disp *dispatcher) substitute(l string, uuid string, vcpus int, mem, tmp int64) string { + var arg string + switch l { + case "C": + arg = fmt.Sprintf("%d", vcpus) + case "T": + arg = fmt.Sprintf("%d", tmp) + case "M": + arg = fmt.Sprintf("%d", mem) + case "U": + arg = uuid + default: + arg = "%" + l } + return arg } // Check the next bjobs report, and invoke TrackContainer for all the diff --git a/lib/lsf/dispatch_test.go b/lib/lsf/dispatch_test.go index 44a1a3d8cb..641453e548 100644 --- a/lib/lsf/dispatch_test.go +++ b/lib/lsf/dispatch_test.go @@ -72,11 +72,10 @@ func (stub lsfstub) stubCommand(s *suite, c *check.C) func(prog string, args ... switch prog { case "bsub": defaultArgs := s.disp.Cluster.Containers.LSF.BsubArgumentsList - c.Assert(len(args) > len(defaultArgs), check.Equals, true) - c.Check(args[:len(defaultArgs)], check.DeepEquals, defaultArgs) - args = args[len(defaultArgs):] - - c.Check(args[0], check.Equals, "-J") + c.Assert(len(args), check.Equals, len(defaultArgs)) + // %%J must have been rewritten to %J + c.Check(args[1], check.Equals, "/tmp/crunch-run.%J.out") + args = args[4:] switch args[1] { case arvadostest.LockedContainerUUID: c.Check(args, check.DeepEquals, []string{ -- 2.30.2