return nil
}
runner.hoststatReporter.Stop()
+ runner.hoststatReporter.LogProcessMemMax(runner.CrunchLog)
err := runner.hoststatLogger.Close()
if err != nil {
return fmt.Errorf("error closing hoststat logs: %v", err)
}
runner.statLogger = NewThrottledLogger(w)
runner.statReporter = &crunchstat.Reporter{
- CID: runner.executor.CgroupID(),
- Logger: log.New(runner.statLogger, "", 0),
CgroupParent: runner.expectCgroupParent,
CgroupRoot: runner.cgroupRoot,
- PollPeriod: runner.statInterval,
- TempDir: runner.parentTemp,
+ CID: runner.executor.CgroupID(),
+ Logger: log.New(runner.statLogger, "", 0),
+ MemThresholds: map[string][]crunchstat.Threshold{
+ "rss": crunchstat.NewThresholdsFromPercentages(runner.Container.RuntimeConstraints.RAM, []int64{90, 95, 99}),
+ },
+ PollPeriod: runner.statInterval,
+ TempDir: runner.parentTemp,
+ ThresholdLogger: runner.CrunchLog,
}
runner.statReporter.Start()
return nil
if runner.statReporter != nil {
runner.statReporter.Stop()
+ runner.statReporter.LogMaxima(runner.CrunchLog, map[string]int64{
+ "rss": runner.Container.RuntimeConstraints.RAM,
+ })
err = runner.statLogger.Close()
if err != nil {
runner.CrunchLog.Printf("error closing crunchstat logs: %v", err)
failures = 0
if metadata != lastmetadata {
lastmetadata = metadata
- text := fmt.Sprintf("Cloud provider indicates instance action %q scheduled for time %q", metadata.Action, metadata.Time.UTC().Format(time.RFC3339))
+ text := fmt.Sprintf("Cloud provider scheduled instance %s at %s", metadata.Action, metadata.Time.UTC().Format(time.RFC3339))
runner.CrunchLog.Printf("%s", text)
runner.updateRuntimeStatus(arvadosclient.Dict{
"warning": "preemption notice",
if final {
updates["is_trashed"] = true
} else {
- exp := time.Now().Add(crunchLogUpdatePeriod * 24)
+ // We set trash_at so this collection gets
+ // automatically cleaned up eventually. It used to be
+ // 12 hours but we had a situation where the API
+ // server was down over a weekend but the containers
+ // kept running such that the log collection got
+ // trashed, so now we make it 2 weeks. refs #20378
+ exp := time.Now().Add(time.Duration(24*14) * time.Hour)
updates["trash_at"] = exp
updates["delete_at"] = exp
}
signal.Notify(sigusr2, syscall.SIGUSR2)
defer signal.Stop(sigusr2)
runner.loadPrices()
- go func() {
- for range sigusr2 {
- runner.loadPrices()
- }
- }()
+ go runner.handleSIGUSR2(sigusr2)
runner.finalState = "Queued"
ContainerUUID: containerUUID,
Target: cr.executor,
Log: cr.CrunchLog,
+ LogCollection: cr.LogCollection,
}
if gwListen == "" {
// Direct connection won't work, so we use the
return cost
}
+
+func (runner *ContainerRunner) handleSIGUSR2(sigchan chan os.Signal) {
+ for range sigchan {
+ runner.loadPrices()
+ update := arvadosclient.Dict{
+ "container": arvadosclient.Dict{
+ "cost": runner.calculateCost(time.Now()),
+ },
+ }
+ runner.DispatcherArvClient.Update("containers", runner.Container.UUID, update, nil)
+ }
+}