X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/58e6402a72e9ac1a210b2d318591f973a37e1e57..adcef9f939b663d88a12d5d3597c3b0184d2579f:/lib/cloud/loopback/loopback.go

diff --git a/lib/cloud/loopback/loopback.go b/lib/cloud/loopback/loopback.go
index 6ad4f876d9..fb7a35beae 100644
--- a/lib/cloud/loopback/loopback.go
+++ b/lib/cloud/loopback/loopback.go
@@ -11,6 +11,7 @@ import (
 	"encoding/json"
 	"errors"
 	"io"
+	"os"
 	"os/exec"
 	"os/user"
 	"strings"
@@ -58,6 +59,16 @@ func (is *instanceSet) Create(it arvados.InstanceType, _ cloud.ImageID, tags clo
 	if len(is.instances) > 0 {
 		return nil, errQuota
 	}
+	// A crunch-run process running in a previous instance may
+	// have marked the node as broken. In the loopback scenario a
+	// destroy+create cycle doesn't fix whatever was broken -- but
+	// nothing else will either, so the best we can do is remove
+	// the "broken" flag and try again.
+	if err := os.Remove("/var/lock/crunch-run-broken"); err == nil {
+		is.logger.Info("removed /var/lock/crunch-run-broken")
+	} else if !errors.Is(err, os.ErrNotExist) {
+		return nil, err
+	}
 	u, err := user.Current()
 	if err != nil {
 		return nil, err