19320: Account for AddedScratch in spot instance cost estimates.
[arvados.git] / lib / dispatchcloud / worker / worker.go
index 9e89d7daafc01d05b770fb065f88049dea231a7e..b2ed6c2bff5b039435944b851a9fe3646c922001 100644 (file)
@@ -6,7 +6,9 @@ package worker
 
 import (
        "bytes"
+       "encoding/json"
        "fmt"
+       "io"
        "path/filepath"
        "strings"
        "sync"
@@ -313,6 +315,10 @@ func (wkr *worker) probeAndUpdate() {
                // not yet running when ctrUUIDs was generated. Leave
                // wkr.running alone and wait for the next probe to
                // catch up on any changes.
+               logger.WithFields(logrus.Fields{
+                       "updated":     updated,
+                       "wkr.updated": wkr.updated,
+               }).Debug("skipping worker state update due to probe/sync race")
                return
        }
 
@@ -377,7 +383,12 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
                cmd = "sudo " + cmd
        }
        before := time.Now()
-       stdout, stderr, err := wkr.executor.Execute(nil, cmd, nil)
+       var stdin io.Reader
+       if prices := wkr.instance.PriceHistory(wkr.instType); len(prices) > 0 {
+               j, _ := json.Marshal(prices)
+               stdin = bytes.NewReader(j)
+       }
+       stdout, stderr, err := wkr.executor.Execute(nil, cmd, stdin)
        if err != nil {
                wkr.logger.WithFields(logrus.Fields{
                        "Command": cmd,
@@ -387,6 +398,11 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
                wkr.wp.mRunProbeDuration.WithLabelValues("fail").Observe(time.Now().Sub(before).Seconds())
                return
        }
+       wkr.logger.WithFields(logrus.Fields{
+               "Command": cmd,
+               "stdout":  string(stdout),
+               "stderr":  string(stderr),
+       }).Debug("probe succeeded")
        wkr.wp.mRunProbeDuration.WithLabelValues("success").Observe(time.Now().Sub(before).Seconds())
        ok = true
 
@@ -409,6 +425,12 @@ func (wkr *worker) probeRunning() (running []string, reportsBroken, ok bool) {
                        // empty string following final newline
                } else if s == "broken" {
                        reportsBroken = true
+               } else if !strings.HasPrefix(s, wkr.wp.cluster.ClusterID) {
+                       // Ignore crunch-run processes that belong to
+                       // a different cluster (e.g., a single host
+                       // running multiple clusters with the loopback
+                       // driver)
+                       continue
                } else if toks := strings.Split(s, " "); len(toks) == 1 {
                        running = append(running, s)
                } else if toks[1] == "stale" {