X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/14f6625379992bc3ee054ad419095b476a1c4284..bfaee40696c3c15556ef089e69da47bb832b08db:/lib/diagnostics/cmd.go diff --git a/lib/diagnostics/cmd.go b/lib/diagnostics/cmd.go index 888503b551..8d89b84d37 100644 --- a/lib/diagnostics/cmd.go +++ b/lib/diagnostics/cmd.go @@ -62,6 +62,7 @@ func (Command) RunCommand(prog string, args []string, stdin io.Reader, stdout, s } // docker save hello-world > hello-world.tar +// //go:embed hello-world.tar var HelloWorldDockerImage []byte @@ -130,6 +131,8 @@ func (diag *diagnoser) dotest(id int, title string, fn func() error) { func (diag *diagnoser) runtests() { client := arvados.NewClientFromEnv() + // Disable auto-retry, use context instead + client.Timeout = 0 if client.APIHost == "" || client.AuthToken == "" { diag.errorf("ARVADOS_API_HOST and ARVADOS_API_TOKEN environment variables are not set -- aborting without running any tests") @@ -156,8 +159,7 @@ func (diag *diagnoser) runtests() { return err } if cluster.SystemRootToken != os.Getenv("ARVADOS_API_TOKEN") { - diag.infof("skipping because provided token is not SystemRootToken") - return nil + return fmt.Errorf("diagnostics usage error: %s is readable but SystemRootToken does not match $ARVADOS_API_TOKEN (to fix, either run 'arvados-client sudo diagnostics' to load everything from config file, or set ARVADOS_CONFIG=- to load nothing from config file)", ldr.Path) } agg := &health.Aggregator{Cluster: cluster} resp := agg.ClusterHealth() @@ -318,9 +320,9 @@ func (diag *diagnoser) runtests() { isInternal := found["proxy"] == 0 && len(keeplist.Items) > 0 isExternal := found["proxy"] > 0 && found["proxy"] == len(keeplist.Items) if isExternal { - diag.verbosef("controller returned only proxy services, this host is treated as \"external\"") + diag.infof("controller returned only proxy services, this host is treated as \"external\"") } else if isInternal { - diag.verbosef("controller returned only non-proxy services, this host is treated as \"internal\"") + diag.infof("controller returned only non-proxy services, this host is treated as \"internal\"") } if (diag.checkInternal && !isInternal) || (diag.checkExternal && !isExternal) { return fmt.Errorf("expecting internal=%v external=%v, but found internal=%v external=%v", diag.checkInternal, diag.checkExternal, isInternal, isExternal) @@ -703,12 +705,11 @@ func (diag *diagnoser) runtests() { timeout := 10 * time.Minute diag.infof("container request submitted, waiting up to %v for container to run", arvados.Duration(timeout)) - ctx, cancel = context.WithDeadline(context.Background(), time.Now().Add(timeout)) - defer cancel() + deadline := time.Now().Add(timeout) var c arvados.Container - for ; cr.State != arvados.ContainerRequestStateFinal; time.Sleep(2 * time.Second) { - ctx, cancel := context.WithDeadline(ctx, time.Now().Add(diag.timeout)) + for ; cr.State != arvados.ContainerRequestStateFinal && time.Now().Before(deadline); time.Sleep(2 * time.Second) { + ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout)) defer cancel() crStateWas := cr.State @@ -728,11 +729,26 @@ func (diag *diagnoser) runtests() { if c.State != cStateWas { diag.debugf("container state = %s", c.State) } + + cancel() } + if cr.State != arvados.ContainerRequestStateFinal { + err := client.RequestAndDecodeContext(context.Background(), &cr, "PATCH", "arvados/v1/container_requests/"+cr.UUID, nil, map[string]interface{}{ + "container_request": map[string]interface{}{ + "priority": 0, + }}) + if err != nil { + diag.infof("error canceling container request %s: %s", cr.UUID, err) + } else { + diag.debugf("canceled container request %s", cr.UUID) + } + return fmt.Errorf("timed out waiting for container to finish; container request %s state was %q, container %s state was %q", cr.UUID, cr.State, c.UUID, c.State) + } if c.State != arvados.ContainerStateComplete { return fmt.Errorf("container request %s is final but container %s did not complete: container state = %q", cr.UUID, cr.ContainerUUID, c.State) - } else if c.ExitCode != 0 { + } + if c.ExitCode != 0 { return fmt.Errorf("container exited %d", c.ExitCode) } return nil