}
if sis.allowCreateCall.After(time.Now()) {
return nil, RateLimitError{sis.allowCreateCall}
- } else {
- sis.allowCreateCall = time.Now().Add(sis.driver.MinTimeBetweenCreateCalls)
}
-
+ sis.allowCreateCall = time.Now().Add(sis.driver.MinTimeBetweenCreateCalls)
ak := sis.driver.AuthorizedKeys
if authKey != nil {
ak = append([]ssh.PublicKey{authKey}, ak...)
tags: copyTags(tags),
providerType: it.ProviderType,
initCommand: cmd,
- running: map[string]int64{},
+ running: map[string]stubProcess{},
killing: map[string]bool{},
}
svm.SSHService = SSHService{
defer sis.mtx.RUnlock()
if sis.allowInstancesCall.After(time.Now()) {
return nil, RateLimitError{sis.allowInstancesCall}
- } else {
- sis.allowInstancesCall = time.Now().Add(sis.driver.MinTimeBetweenInstancesCalls)
}
+ sis.allowInstancesCall = time.Now().Add(sis.driver.MinTimeBetweenInstancesCalls)
var r []cloud.Instance
for _, ss := range sis.servers {
r = append(r, ss.Instance())
CrunchRunMissing bool
CrunchRunCrashRate float64
CrunchRunDetachDelay time.Duration
+ ArvMountMaxExitLag time.Duration
+ ArvMountDeadlockRate float64
ExecuteContainer func(arvados.Container) int
CrashRunningContainer func(arvados.Container)
initCommand cloud.InitCommand
providerType string
SSHService SSHService
- running map[string]int64
+ running map[string]stubProcess
killing map[string]bool
lastPID int64
+ deadlocked string
sync.Mutex
}
+type stubProcess struct {
+ pid int64 // copied from the VM's lastPID counter when the stub process is started
+
+ // exited is set when crunch-run has exited, but an arv-mount process (or something)
+ // still holds lock in /var/run/; "crunch-run --list" reports such entries as "stale"
+ exited bool
+}
+
func (svm *StubVM) Instance() stubInstance {
svm.Lock()
defer svm.Unlock()
svm.Lock()
svm.lastPID++
pid := svm.lastPID
- svm.running[uuid] = pid
+ svm.running[uuid] = stubProcess{pid: pid}
svm.Unlock()
time.Sleep(svm.CrunchRunDetachDelay)
fmt.Fprintf(stderr, "starting %s\n", uuid)
logger.Print("[test] exiting crunch-run stub")
svm.Lock()
defer svm.Unlock()
- if svm.running[uuid] != pid {
- if !completed {
- bugf := svm.sis.driver.Bugf
- if bugf == nil {
- bugf = logger.Warnf
- }
- bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s]==%d", pid, uuid, svm.running[uuid])
+ if svm.running[uuid].pid != pid {
+ bugf := svm.sis.driver.Bugf
+ if bugf == nil {
+ bugf = logger.Warnf
}
- } else {
- delete(svm.running, uuid)
+ bugf("[test] StubDriver bug or caller bug: pid %d exiting, running[%s].pid==%d", pid, uuid, svm.running[uuid].pid)
+ return
}
if !completed {
logger.WithField("State", ctr.State).Print("[test] crashing crunch-run stub")
svm.CrashRunningContainer(ctr)
}
}
+ sproc := svm.running[uuid]
+ sproc.exited = true
+ svm.running[uuid] = sproc
+ svm.Unlock()
+ // Sleep a random fraction of ArvMountMaxExitLag. The scaling must be done
+ // in float64: time.Duration(math_rand.Float64()) truncates a value in
+ // [0,1) to 0, which would make the lag always zero.
+ time.Sleep(time.Duration(float64(svm.ArvMountMaxExitLag) * math_rand.Float64()))
+ svm.Lock()
+ if math_rand.Float64() >= svm.ArvMountDeadlockRate {
+ delete(svm.running, uuid)
+ }
}()
crashluck := math_rand.Float64()
time.Sleep(time.Duration(math_rand.Float64()*20) * time.Millisecond)
svm.Lock()
- killed := svm.running[uuid] != pid
+ killed := svm.killing[uuid]
svm.Unlock()
if killed || wantCrashEarly {
return
if command == "crunch-run --list" {
svm.Lock()
defer svm.Unlock()
- for uuid := range svm.running {
- fmt.Fprintf(stdout, "%s\n", uuid)
+ for uuid, sproc := range svm.running {
+ if sproc.exited {
+ fmt.Fprintf(stdout, "%s stale\n", uuid)
+ } else {
+ fmt.Fprintf(stdout, "%s\n", uuid)
+ }
}
if !svm.ReportBroken.IsZero() && svm.ReportBroken.Before(time.Now()) {
fmt.Fprintln(stdout, "broken")
}
+ fmt.Fprintln(stdout, svm.deadlocked)
return 0
}
if strings.HasPrefix(command, "crunch-run --kill ") {
svm.Lock()
- pid, running := svm.running[uuid]
- if running && !svm.killing[uuid] {
+ sproc, running := svm.running[uuid]
+ if running && !sproc.exited {
svm.killing[uuid] = true
- go func() {
- time.Sleep(time.Duration(math_rand.Float64()*30) * time.Millisecond)
- svm.Lock()
- defer svm.Unlock()
- if svm.running[uuid] == pid {
- // Kill only if the running entry
- // hasn't since been killed and
- // replaced with a different one.
- delete(svm.running, uuid)
- }
- delete(svm.killing, uuid)
- }()
svm.Unlock()
time.Sleep(time.Duration(math_rand.Float64()*2) * time.Millisecond)
svm.Lock()
- _, running = svm.running[uuid]
+ sproc, running = svm.running[uuid]
}
svm.Unlock()
- if running {
+ if running && !sproc.exited {
fmt.Fprintf(stderr, "%s: container is running\n", uuid)
return 1
- } else {
- fmt.Fprintf(stderr, "%s: container is not running\n", uuid)
- return 0
}
+ fmt.Fprintf(stderr, "%s: container is not running\n", uuid)
+ return 0
}
if command == "true" {
return 0