import (
"context"
+ "crypto/hmac"
+ "crypto/sha256"
"errors"
"fmt"
"math"
"time"
"git.arvados.org/arvados.git/lib/cmd"
+ "git.arvados.org/arvados.git/lib/controller/dblock"
+ "git.arvados.org/arvados.git/lib/ctrlctx"
"git.arvados.org/arvados.git/lib/dispatchcloud"
"git.arvados.org/arvados.git/lib/service"
"git.arvados.org/arvados.git/sdk/go/arvados"
Registry *prometheus.Registry
logger logrus.FieldLogger
+ dbConnector ctrlctx.DBConnector
lsfcli lsfcli
lsfqueue lsfqueue
arvDispatcher *dispatch.Dispatcher
func (disp *dispatcher) Start() {
disp.initOnce.Do(func() {
disp.init()
+ dblock.Dispatch.Lock(context.Background(), disp.dbConnector.GetDB)
go func() {
+ defer dblock.Dispatch.Unlock()
disp.checkLsfQueueForOrphans()
err := disp.arvDispatcher.Run(disp.Context)
if err != nil {
disp.lsfcli.logger = disp.logger
disp.lsfqueue = lsfqueue{
logger: disp.logger,
- period: time.Duration(disp.Cluster.Containers.CloudVMs.PollInterval),
+ period: disp.Cluster.Containers.CloudVMs.PollInterval.Duration(),
lsfcli: &disp.lsfcli,
}
disp.ArvClient.AuthToken = disp.AuthToken
+ disp.dbConnector = ctrlctx.DBConnector{PostgreSQL: disp.Cluster.PostgreSQL}
disp.stop = make(chan struct{}, 1)
disp.stopped = make(chan struct{})
if ctr.State != dispatch.Locked {
// already started by prior invocation
- } else if _, ok := disp.lsfqueue.JobID(ctr.UUID); !ok {
+ } else if _, ok := disp.lsfqueue.Lookup(ctr.UUID); !ok {
+ if _, err := dispatchcloud.ChooseInstanceType(disp.Cluster, &ctr); errors.As(err, &dispatchcloud.ConstraintsNotSatisfiableError{}) {
+ err := disp.arvDispatcher.Arv.Update("containers", ctr.UUID, arvadosclient.Dict{
+ "container": map[string]interface{}{
+ "runtime_status": map[string]string{
+ "error": err.Error(),
+ },
+ },
+ }, nil)
+ if err != nil {
+ return fmt.Errorf("error setting runtime_status on %s: %s", ctr.UUID, err)
+ }
+ return disp.arvDispatcher.UpdateState(ctr.UUID, dispatch.Cancelled)
+ }
disp.logger.Printf("Submitting container %s to LSF", ctr.UUID)
cmd := []string{disp.Cluster.Containers.CrunchRunCommand}
cmd = append(cmd, "--runtime-engine="+disp.Cluster.Containers.RuntimeEngine)
disp.logger.Printf("Start monitoring container %v in state %q", ctr.UUID, ctr.State)
defer disp.logger.Printf("Done monitoring container %s", ctr.UUID)
- // If the container disappears from the lsf queue, there is
- // no point in waiting for further dispatch updates: just
- // clean up and return.
go func(uuid string) {
for ctx.Err() == nil {
- if _, ok := disp.lsfqueue.JobID(uuid); !ok {
+ _, ok := disp.lsfqueue.Lookup(uuid)
+ if !ok {
+ // If the container disappears from
+ // the lsf queue, there is no point in
+ // waiting for further dispatch
+ // updates: just clean up and return.
disp.logger.Printf("container %s job disappeared from LSF queue", uuid)
cancel()
return
// Try "bkill" every few seconds until the LSF job disappears
// from the queue.
- ticker := time.NewTicker(5 * time.Second)
+ ticker := time.NewTicker(disp.Cluster.Containers.CloudVMs.PollInterval.Duration() / 2)
defer ticker.Stop()
- for jobid, ok := disp.lsfqueue.JobID(ctr.UUID); ok; _, ok = disp.lsfqueue.JobID(ctr.UUID) {
- err := disp.lsfcli.Bkill(jobid)
+ for qent, ok := disp.lsfqueue.Lookup(ctr.UUID); ok; _, ok = disp.lsfqueue.Lookup(ctr.UUID) {
+ err := disp.lsfcli.Bkill(qent.ID)
if err != nil {
- disp.logger.Warnf("%s: bkill(%d): %s", ctr.UUID, jobid, err)
+ disp.logger.Warnf("%s: bkill(%s): %s", ctr.UUID, qent.ID, err)
}
<-ticker.C
}
var crArgs []string
crArgs = append(crArgs, crunchRunCommand...)
crArgs = append(crArgs, container.UUID)
- crScript := execScript(crArgs)
+
+ h := hmac.New(sha256.New, []byte(disp.Cluster.SystemRootToken))
+ fmt.Fprint(h, container.UUID)
+ authsecret := fmt.Sprintf("%x", h.Sum(nil))
+
+ crScript := execScript(crArgs, map[string]string{"GatewayAuthSecret": authsecret})
bsubArgs, err := disp.bsubArgs(container)
if err != nil {
}
func (disp *dispatcher) bkill(ctr arvados.Container) {
- if jobid, ok := disp.lsfqueue.JobID(ctr.UUID); !ok {
+ if qent, ok := disp.lsfqueue.Lookup(ctr.UUID); !ok {
disp.logger.Debugf("bkill(%s): redundant, job not in queue", ctr.UUID)
- } else if err := disp.lsfcli.Bkill(jobid); err != nil {
- disp.logger.Warnf("%s: bkill(%d): %s", ctr.UUID, jobid, err)
+ } else if err := disp.lsfcli.Bkill(qent.ID); err != nil {
+ disp.logger.Warnf("%s: bkill(%s): %s", ctr.UUID, qent.ID, err)
}
}
func (disp *dispatcher) bsubArgs(container arvados.Container) ([]string, error) {
args := []string{"bsub"}
- args = append(args, disp.Cluster.Containers.LSF.BsubArgumentsList...)
- args = append(args, "-J", container.UUID)
- args = append(args, disp.bsubConstraintArgs(container)...)
- if u := disp.Cluster.Containers.LSF.BsubSudoUser; u != "" {
- args = append([]string{"sudo", "-E", "-u", u}, args...)
- }
- return args, nil
-}
-func (disp *dispatcher) bsubConstraintArgs(container arvados.Container) []string {
- // TODO: propagate container.SchedulingParameters.Partitions
tmp := int64(math.Ceil(float64(dispatchcloud.EstimateScratchSpace(&container)) / 1048576))
vcpus := container.RuntimeConstraints.VCPUs
mem := int64(math.Ceil(float64(container.RuntimeConstraints.RAM+
container.RuntimeConstraints.KeepCacheRAM+
int64(disp.Cluster.Containers.ReserveExtraRAM)) / 1048576))
- return []string{
- "-n", fmt.Sprintf("%d", vcpus),
- "-D", fmt.Sprintf("%dMB", mem), // ulimit -d (note this doesn't limit the total container memory usage)
- "-R", fmt.Sprintf("rusage[mem=%dMB:tmp=%dMB] span[hosts=1]", mem, tmp),
+
+ repl := map[string]string{
+ "%%": "%",
+ "%C": fmt.Sprintf("%d", vcpus),
+ "%M": fmt.Sprintf("%d", mem),
+ "%T": fmt.Sprintf("%d", tmp),
+ "%U": container.UUID,
+ "%G": fmt.Sprintf("%d", container.RuntimeConstraints.CUDA.DeviceCount),
}
+
+ re := regexp.MustCompile(`%.`)
+ var substitutionErrors string
+ argumentTemplate := disp.Cluster.Containers.LSF.BsubArgumentsList
+ if container.RuntimeConstraints.CUDA.DeviceCount > 0 {
+ argumentTemplate = append(argumentTemplate, disp.Cluster.Containers.LSF.BsubCUDAArguments...)
+ }
+ for _, a := range argumentTemplate {
+ args = append(args, re.ReplaceAllStringFunc(a, func(s string) string {
+ subst := repl[s]
+ if len(subst) == 0 {
+ substitutionErrors += fmt.Sprintf("Unknown substitution parameter %s in BsubArgumentsList, ", s)
+ }
+ return subst
+ }))
+ }
+ if len(substitutionErrors) != 0 {
+ return nil, fmt.Errorf("%s", substitutionErrors[:len(substitutionErrors)-2])
+ }
+
+ if u := disp.Cluster.Containers.LSF.BsubSudoUser; u != "" {
+ args = append([]string{"sudo", "-E", "-u", u}, args...)
+ }
+ return args, nil
}
// Check the next bjobs report, and invoke TrackContainer for all the
}
}
-func execScript(args []string) []byte {
- s := "#!/bin/sh\nexec"
+func execScript(args []string, env map[string]string) []byte {
+ s := "#!/bin/sh\n"
+ for k, v := range env {
+ s += k + `='`
+ s += strings.Replace(v, `'`, `'\''`, -1)
+ s += `' `
+ }
+ s += `exec`
for _, w := range args {
s += ` '`
s += strings.Replace(w, `'`, `'\''`, -1)