X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/055d9f503f719bf9aad95c57a2fa435f83537318..7f0f12c40238f3eb12a51877a755cf22357e0767:/lib/lsf/dispatch.go diff --git a/lib/lsf/dispatch.go b/lib/lsf/dispatch.go index d362f66d14..897e5803f2 100644 --- a/lib/lsf/dispatch.go +++ b/lib/lsf/dispatch.go @@ -18,6 +18,8 @@ import ( "time" "git.arvados.org/arvados.git/lib/cmd" + "git.arvados.org/arvados.git/lib/controller/dblock" + "git.arvados.org/arvados.git/lib/ctrlctx" "git.arvados.org/arvados.git/lib/dispatchcloud" "git.arvados.org/arvados.git/lib/service" "git.arvados.org/arvados.git/sdk/go/arvados" @@ -58,6 +60,7 @@ type dispatcher struct { Registry *prometheus.Registry logger logrus.FieldLogger + dbConnector ctrlctx.DBConnector lsfcli lsfcli lsfqueue lsfqueue arvDispatcher *dispatch.Dispatcher @@ -73,7 +76,9 @@ type dispatcher struct { func (disp *dispatcher) Start() { disp.initOnce.Do(func() { disp.init() + dblock.Dispatch.Lock(context.Background(), disp.dbConnector.GetDB) go func() { + defer dblock.Dispatch.Unlock() disp.checkLsfQueueForOrphans() err := disp.arvDispatcher.Run(disp.Context) if err != nil { @@ -125,6 +130,7 @@ func (disp *dispatcher) init() { lsfcli: &disp.lsfcli, } disp.ArvClient.AuthToken = disp.AuthToken + disp.dbConnector = ctrlctx.DBConnector{PostgreSQL: disp.Cluster.PostgreSQL} disp.stop = make(chan struct{}, 1) disp.stopped = make(chan struct{}) @@ -300,6 +306,15 @@ func (disp *dispatcher) bsubArgs(container arvados.Container) ([]string, error) container.RuntimeConstraints.KeepCacheRAM+ int64(disp.Cluster.Containers.ReserveExtraRAM)) / 1048576)) + maxruntime := time.Duration(container.SchedulingParameters.MaxRunTime) * time.Second + if maxruntime == 0 { + maxruntime = disp.Cluster.Containers.LSF.MaxRunTimeDefault.Duration() + } + if maxruntime > 0 { + maxruntime += disp.Cluster.Containers.LSF.MaxRunTimeOverhead.Duration() + } + maxrunminutes := int64(math.Ceil(float64(maxruntime.Seconds()) / 60)) + repl := map[string]string{ "%%": "%", "%C": fmt.Sprintf("%d", vcpus), @@ -307,6 +322,7 @@ func (disp *dispatcher) bsubArgs(container arvados.Container) ([]string, error) "%T": fmt.Sprintf("%d", tmp), "%U": container.UUID, "%G": fmt.Sprintf("%d", container.RuntimeConstraints.CUDA.DeviceCount), + "%W": fmt.Sprintf("%d", maxrunminutes), } re := regexp.MustCompile(`%.`) @@ -315,7 +331,16 @@ func (disp *dispatcher) bsubArgs(container arvados.Container) ([]string, error) if container.RuntimeConstraints.CUDA.DeviceCount > 0 { argumentTemplate = append(argumentTemplate, disp.Cluster.Containers.LSF.BsubCUDAArguments...) } - for _, a := range argumentTemplate { + for idx, a := range argumentTemplate { + if idx > 0 && (argumentTemplate[idx-1] == "-W" || argumentTemplate[idx-1] == "-We") && a == "%W" && maxrunminutes == 0 { + // LSF docs don't specify an argument to "-W" + // or "-We" that indicates "unknown", so + // instead we drop the "-W %W" part of the + // command line entirely when max runtime is + // unknown. + args = args[:len(args)-1] + continue + } args = append(args, re.ReplaceAllStringFunc(a, func(s string) string { subst := repl[s] if len(subst) == 0 {