"syscall"
"time"
+ "git.arvados.org/arvados.git/lib/cloud"
"git.arvados.org/arvados.git/lib/cmd"
"git.arvados.org/arvados.git/lib/config"
"git.arvados.org/arvados.git/lib/crunchstat"
MkArvClient func(token string) (IArvadosClient, IKeepClient, *arvados.Client, error)
finalState string
parentTemp string
+ costStartTime time.Time
+ keepstore *exec.Cmd
keepstoreLogger io.WriteCloser
keepstoreLogbuf *bufThenWrite
statLogger io.WriteCloser
containerWatchdogInterval time.Duration
gateway Gateway
+
+ prices []cloud.InstancePrice
+ pricesLock sync.Mutex
}
// setupSignals sets up signal handling to gracefully terminate the
arvMountCmd = append(arvMountCmd, "--allow-other")
}
- if runner.Container.RuntimeConstraints.KeepCacheRAM > 0 {
- arvMountCmd = append(arvMountCmd, "--file-cache", fmt.Sprintf("%d", runner.Container.RuntimeConstraints.KeepCacheRAM))
+ if runner.Container.RuntimeConstraints.KeepCacheDisk > 0 {
+ keepcachedir, err := runner.MkTempDir(runner.parentTemp, "keepcache")
+ if err != nil {
+ return nil, fmt.Errorf("while creating keep cache temp dir: %v", err)
+ }
+ arvMountCmd = append(arvMountCmd, "--disk-cache", "--disk-cache-dir", keepcachedir, "--file-cache", fmt.Sprintf("%d", runner.Container.RuntimeConstraints.KeepCacheDisk))
+ } else if runner.Container.RuntimeConstraints.KeepCacheRAM > 0 {
+ arvMountCmd = append(arvMountCmd, "--ram-cache", "--file-cache", fmt.Sprintf("%d", runner.Container.RuntimeConstraints.KeepCacheRAM))
}
collectionPaths := []string{}
if err != nil {
return nil, fmt.Errorf("while trying to start arv-mount: %v", err)
}
+ if runner.hoststatReporter != nil && runner.ArvMount != nil {
+ runner.hoststatReporter.ReportPID("arv-mount", runner.ArvMount.Process.Pid)
+ }
for _, p := range collectionPaths {
_, err = os.Stat(p)
PollPeriod: runner.statInterval,
}
runner.hoststatReporter.Start()
+ runner.hoststatReporter.ReportPID("crunch-run", os.Getpid())
return nil
}
env["ARVADOS_API_TOKEN"] = tok
env["ARVADOS_API_HOST"] = os.Getenv("ARVADOS_API_HOST")
env["ARVADOS_API_HOST_INSECURE"] = os.Getenv("ARVADOS_API_HOST_INSECURE")
+ env["ARVADOS_KEEP_SERVICES"] = os.Getenv("ARVADOS_KEEP_SERVICES")
}
workdir := runner.Container.Cwd
if workdir == "." {
}
// UpdateContainerRunning updates the container state to "Running"
-func (runner *ContainerRunner) UpdateContainerRunning() error {
+// UpdateContainerRunning updates the container record's state to
+// "Running" and advertises runner.gateway.Address in the
+// gateway_address field. If logId is non-empty it is also stored in
+// the container's log field, so clients can find the initial log
+// collection while the container is still running.
+// Returns ErrCancelled (without calling the API) if the container has
+// already been cancelled.
+func (runner *ContainerRunner) UpdateContainerRunning(logId string) error {
	runner.cStateLock.Lock()
	defer runner.cStateLock.Unlock()
	if runner.cCancelled {
		return ErrCancelled
	}
-	return runner.DispatcherArvClient.Update("containers", runner.Container.UUID,
-		arvadosclient.Dict{"container": arvadosclient.Dict{"state": "Running", "gateway_address": runner.gateway.Address}}, nil)
+	updates := arvadosclient.Dict{
+		"gateway_address": runner.gateway.Address,
+		"state":           "Running",
+	}
+	// Only set the log field when the caller actually has a log
+	// collection PDH to report; otherwise leave it untouched.
+	if logId != "" {
+		updates["log"] = logId
+	}
+	return runner.DispatcherArvClient.Update(
+		"containers",
+		runner.Container.UUID,
+		arvadosclient.Dict{"container": updates},
+		nil,
+	)
}
// ContainerToken returns the api_token the container (and any
if runner.finalState == "Complete" && runner.OutputPDH != nil {
update["output"] = *runner.OutputPDH
}
+ update["cost"] = runner.calculateCost(time.Now())
return runner.DispatcherArvClient.Update("containers", runner.Container.UUID, arvadosclient.Dict{"container": update}, nil)
}
runner.CrunchLog.Printf("Using FUSE mount: %s", v)
runner.CrunchLog.Printf("Using container runtime: %s", runner.executor.Runtime())
runner.CrunchLog.Printf("Executing container: %s", runner.Container.UUID)
+ runner.costStartTime = time.Now()
hostname, hosterr := os.Hostname()
if hosterr != nil {
runner.CrunchLog.Printf("Executing on host '%s'", hostname)
}
+ sigusr2 := make(chan os.Signal, 1)
+ signal.Notify(sigusr2, syscall.SIGUSR2)
+ defer signal.Stop(sigusr2)
+ runner.loadPrices()
+ go func() {
+ for range sigusr2 {
+ runner.loadPrices()
+ }
+ }()
+
runner.finalState = "Queued"
defer func() {
if err != nil {
return
}
+ if runner.keepstore != nil {
+ runner.hoststatReporter.ReportPID("keepstore", runner.keepstore.Process.Pid)
+ }
// set up FUSE mount and binds
bindmounts, err = runner.SetupMounts()
return
}
- err = runner.UpdateContainerRunning()
+ logCollection, err := runner.saveLogCollection(false)
+ var logId string
+ if err == nil {
+ logId = logCollection.PortableDataHash
+ } else {
+ runner.CrunchLog.Printf("Error committing initial log collection: %v", err)
+ }
+ err = runner.UpdateContainerRunning(logId)
if err != nil {
return
}
configFile := flags.String("config", arvados.DefaultConfigFile, "filename of cluster config file to try loading if -stdin-config=false (default is $ARVADOS_CONFIG)")
sleep := flags.Duration("sleep", 0, "Delay before starting (testing use only)")
kill := flags.Int("kill", -1, "Send signal to an existing crunch-run process for given UUID")
- list := flags.Bool("list", false, "List UUIDs of existing crunch-run processes")
+ list := flags.Bool("list", false, "List UUIDs of existing crunch-run processes (and notify them to use price data passed on stdin)")
enableMemoryLimit := flags.Bool("enable-memory-limit", true, "tell container runtime to limit container's memory usage")
enableNetwork := flags.String("container-enable-networking", "default", "enable networking \"always\" (for all containers) or \"default\" (for containers that request it)")
networkMode := flags.String("container-network-mode", "default", `Docker network mode for container (use any argument valid for docker --net)`)
runtimeEngine := flags.String("runtime-engine", "docker", "container runtime: docker or singularity")
brokenNodeHook := flags.String("broken-node-hook", "", "script to run if node is detected to be broken (for example, Docker daemon is not running)")
flags.Duration("check-containerd", 0, "Ignored. Exists for compatibility with older versions.")
+ version := flags.Bool("version", false, "Write version information to stdout and exit 0.")
ignoreDetachFlag := false
if len(args) > 0 && args[0] == "-no-detach" {
if ok, code := cmd.ParseFlags(flags, prog, args, "container-uuid", stderr); !ok {
return code
+ } else if *version {
+ fmt.Fprintln(stdout, prog, cmd.Version.String())
+ return 0
} else if !*list && flags.NArg() != 1 {
fmt.Fprintf(stderr, "missing required argument: container-uuid (try -help)\n")
return 2
switch {
case *detach && !ignoreDetachFlag:
- return Detach(containerUUID, prog, args, os.Stdin, os.Stdout, os.Stderr)
+ return Detach(containerUUID, prog, args, stdin, stdout, stderr)
case *kill >= 0:
- return KillProcess(containerUUID, syscall.Signal(*kill), os.Stdout, os.Stderr)
+ return KillProcess(containerUUID, syscall.Signal(*kill), stdout, stderr)
case *list:
- return ListProcesses(os.Stdout, os.Stderr)
+ return ListProcesses(stdin, stdout, stderr)
}
if len(containerUUID) != 27 {
return 1
}
+ cr.keepstore = keepstore
if keepstore == nil {
// Log explanation (if any) for why we're not running
// a local keepstore.
// not safe to run a gateway service without an auth
// secret
cr.CrunchLog.Printf("Not starting a gateway server (GatewayAuthSecret was not provided by dispatcher)")
- } else if gwListen := os.Getenv("GatewayAddress"); gwListen == "" {
- // dispatcher did not tell us which external IP
- // address to advertise --> no gateway service
- cr.CrunchLog.Printf("Not starting a gateway server (GatewayAddress was not provided by dispatcher)")
} else {
+ gwListen := os.Getenv("GatewayAddress")
cr.gateway = Gateway{
Address: gwListen,
AuthSecret: gwAuthSecret,
Target: cr.executor,
Log: cr.CrunchLog,
}
+ if gwListen == "" {
+ // Direct connection won't work, so we use the
+ // gateway_address field to indicate the
+ // internalURL of the controller process that
+ // has the current tunnel connection.
+ cr.gateway.ArvadosClient = cr.dispatcherClient
+ cr.gateway.UpdateTunnelURL = func(url string) {
+ cr.gateway.Address = "tunnel " + url
+ cr.DispatcherArvClient.Update("containers", containerUUID,
+ arvadosclient.Dict{"container": arvadosclient.Dict{"gateway_address": cr.gateway.Address}}, nil)
+ }
+ }
err = cr.gateway.Start()
if err != nil {
log.Printf("error starting gateway server: %s", err)
// modify the cluster configuration that we feed it on stdin.
configData.Cluster.API.MaxKeepBlobBuffers = configData.KeepBuffers
- ln, err := net.Listen("tcp", "localhost:0")
+ localaddr := localKeepstoreAddr()
+ ln, err := net.Listen("tcp", net.JoinHostPort(localaddr, "0"))
if err != nil {
return nil, err
}
return nil, err
}
ln.Close()
- url := "http://localhost:" + port
+ url := "http://" + net.JoinHostPort(localaddr, port)
fmt.Fprintf(logbuf, "starting keepstore on %s\n", url)
}
return s
}
+
+// localKeepstoreAddr returns a suitable local interface address for a
+// local keepstore service to listen on. Currently this is the
+// numerically lowest non-loopback ipv4 address assigned to a local
+// interface that is not in any of the link-local/vpn/loopback ranges
+// 169.254/16, 100.64/10, or 127/8. If no suitable address is found,
+// it falls back to "0.0.0.0" (all interfaces).
+func localKeepstoreAddr() string {
+	var ips []net.IP
+	// Ignore error (proceed with zero IPs)
+	addrs, _ := processIPs(os.Getpid())
+	for addr := range addrs {
+		ip := net.ParseIP(addr)
+		if ip == nil {
+			// invalid
+			continue
+		}
+		// NOTE(review): these masks only match IPv4 addresses;
+		// a non-IPv4 address reported by processIPs would pass
+		// through this filter and be considered below — confirm
+		// processIPs only yields IPv4 addresses.
+		if ip.Mask(net.CIDRMask(8, 32)).Equal(net.IPv4(127, 0, 0, 0)) ||
+			ip.Mask(net.CIDRMask(10, 32)).Equal(net.IPv4(100, 64, 0, 0)) ||
+			ip.Mask(net.CIDRMask(16, 32)).Equal(net.IPv4(169, 254, 0, 0)) {
+			// unsuitable
+			continue
+		}
+		ips = append(ips, ip)
+	}
+	if len(ips) == 0 {
+		return "0.0.0.0"
+	}
+	// Sort addresses bytewise (shorter representations first) so the
+	// choice is deterministic across calls, and return the lowest.
+	sort.Slice(ips, func(ii, jj int) bool {
+		i, j := ips[ii], ips[jj]
+		if len(i) != len(j) {
+			return len(i) < len(j)
+		}
+		for x := range i {
+			if i[x] != j[x] {
+				return i[x] < j[x]
+			}
+		}
+		return false
+	})
+	return ips[0].String()
+}
+
+// loadPrices reads a JSON-encoded []cloud.InstancePrice from the
+// prices file in lockdir (presumably written by a supervisor invoking
+// "crunch-run -list" with price data on stdin — see the -list flag
+// help), merges it into cr.prices, and logs any price changes that
+// are newer than the latest previously known price. A missing file is
+// silently ignored; read/decode errors are logged and leave cr.prices
+// unchanged. Safe for concurrent use (guarded by cr.pricesLock).
+func (cr *ContainerRunner) loadPrices() {
+	buf, err := os.ReadFile(filepath.Join(lockdir, pricesfile))
+	if err != nil {
+		if !os.IsNotExist(err) {
+			cr.CrunchLog.Printf("loadPrices: read: %s", err)
+		}
+		return
+	}
+	var prices []cloud.InstancePrice
+	err = json.Unmarshal(buf, &prices)
+	if err != nil {
+		cr.CrunchLog.Printf("loadPrices: decode: %s", err)
+		return
+	}
+	cr.pricesLock.Lock()
+	defer cr.pricesLock.Unlock()
+	// Remember the most recent price we already knew about, so we
+	// only log entries that are genuinely new.
+	var lastKnown time.Time
+	if len(cr.prices) > 0 {
+		lastKnown = cr.prices[0].StartTime
+	}
+	cr.prices = cloud.NormalizePriceHistory(append(prices, cr.prices...))
+	// Walk oldest-to-newest so changes are logged in chronological
+	// order.
+	for i := len(cr.prices) - 1; i >= 0; i-- {
+		price := cr.prices[i]
+		if price.StartTime.After(lastKnown) {
+			cr.CrunchLog.Printf("Instance price changed to %#.3g at %s", price.Price, price.StartTime.UTC())
+		}
+	}
+}
+
+// calculateCost returns the estimated cost of running this container
+// from cr.costStartTime until now, integrating the hourly instance
+// price over the price-change history in cr.prices (which, given the
+// loop below, is assumed to be ordered newest-first — as produced by
+// loadPrices via cloud.NormalizePriceHistory).
+func (cr *ContainerRunner) calculateCost(now time.Time) float64 {
+	cr.pricesLock.Lock()
+	defer cr.pricesLock.Unlock()
+
+	// First, make a "prices" slice with the real data as far back
+	// as it goes, and (if needed) a "since the beginning of time"
+	// placeholder containing a reasonable guess about what the
+	// price was between cr.costStartTime and the earliest real
+	// data point.
+	prices := cr.prices
+	if len(prices) == 0 {
+		// use price info in InstanceType record initially
+		// provided by cloud dispatcher
+		var p float64
+		var it arvados.InstanceType
+		if j := os.Getenv("InstanceType"); j != "" && json.Unmarshal([]byte(j), &it) == nil && it.Price > 0 {
+			p = it.Price
+		}
+		// p stays 0 if no usable InstanceType env var, i.e.,
+		// cost is reported as 0 when no price data exists.
+		prices = []cloud.InstancePrice{{Price: p}}
+	} else if prices[len(prices)-1].StartTime.After(cr.costStartTime) {
+		// guess earlier pricing was the same as the earliest
+		// price we know about
+		filler := prices[len(prices)-1]
+		filler.StartTime = time.Time{}
+		prices = append(prices, filler)
+	}
+
+	// Now that our history of price changes goes back at least as
+	// far as cr.costStartTime, add up the costs for each
+	// interval, walking backward in time from "now".
+	cost := 0.0
+	spanEnd := now
+	for _, ip := range prices {
+		spanStart := ip.StartTime
+		if spanStart.After(now) {
+			// pricing information from the future -- not
+			// expected from AWS, but possible in
+			// principle, and exercised by tests.
+			continue
+		}
+		last := false
+		if spanStart.Before(cr.costStartTime) {
+			// Clamp the earliest interval to the start of
+			// the billing window; nothing before it counts.
+			spanStart = cr.costStartTime
+			last = true
+		}
+		// Price is per hour; convert the span to hours.
+		cost += ip.Price * spanEnd.Sub(spanStart).Seconds() / 3600
+		if last {
+			break
+		}
+		spanEnd = spanStart
+	}
+
+	return cost
+}