- log.Printf("Monitoring container %v started", uuid)
-
- // periodically check squeue
- doneSqueue := make(chan struct{})
- go func() {
- squeueUpdater.CheckSqueue(container.UUID, true)
- ticker := time.NewTicker(dispatcher.PollInterval)
- for {
- select {
- case <-ticker.C:
- if inQ, err := squeueUpdater.CheckSqueue(container.UUID, false); err != nil {
- log.Printf("Error running squeue: %v", err)
- // don't cancel, just leave it the way it is
- } else if !inQ {
- var con dispatch.Container
- err := dispatcher.Arv.Get("containers", uuid, nil, &con)
- if err != nil {
- log.Printf("Error getting final container state: %v", err)
- }
-
- var st string
- switch con.State {
- case dispatch.Locked:
- st = dispatch.Queued
- case dispatch.Running:
- st = dispatch.Cancelled
- default:
- st = ""
- }
-
- if st != "" {
- log.Printf("Container %s in state %v but missing from slurm queue, changing to %v.",
- uuid, con.State, st)
- dispatcher.UpdateState(uuid, st)
- }
- }
- case <-doneSqueue:
- close(doneSqueue)
- ticker.Stop()
- return
- }
- }
- }()
-
- for container = range status {
- if container.State == dispatch.Locked || container.State == dispatch.Running {
- if container.Priority == 0 {
- log.Printf("Canceling container %s", container.UUID)
-
- err := exec.Command("scancel", "--name="+container.UUID).Run()
- if err != nil {
- log.Printf("Error stopping container %s with scancel: %v",
- container.UUID, err)
- if inQ, err := squeueUpdater.CheckSqueue(container.UUID, true); err != nil {
- log.Printf("Error running squeue: %v", err)
- continue
- } else if inQ {
- log.Printf("Container %s is still in squeue after scancel.",
- container.UUID)
- continue
- }
- }
-
- err = dispatcher.UpdateState(container.UUID, dispatch.Cancelled)
- }
- }
+ if err != nil {
+ log.Printf("%q %q: %s %q", cmd.Path, cmd.Args, err, msg)
+ time.Sleep(time.Second)
+ } else if sqCheck.HasUUID(ctr.UUID) {
+ log.Printf("container %s is still in squeue after scancel", ctr.UUID)
+ time.Sleep(time.Second)