X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/61d3aed1cd0286c7928167a84267c225938a37ec..09cbdc3074b3f1e69c9c537875146f6da0a6ed8f:/lib/diagnostics/cmd.go diff --git a/lib/diagnostics/cmd.go b/lib/diagnostics/cmd.go index 3455d3307e..ed963e1ef7 100644 --- a/lib/diagnostics/cmd.go +++ b/lib/diagnostics/cmd.go @@ -5,8 +5,10 @@ package diagnostics import ( + "archive/tar" "bytes" "context" + _ "embed" "flag" "fmt" "io" @@ -14,12 +16,15 @@ import ( "net" "net/http" "net/url" + "os" "strings" "time" "git.arvados.org/arvados.git/lib/cmd" + "git.arvados.org/arvados.git/lib/config" "git.arvados.org/arvados.git/sdk/go/arvados" "git.arvados.org/arvados.git/sdk/go/ctxlog" + "git.arvados.org/arvados.git/sdk/go/health" "github.com/sirupsen/logrus" ) @@ -30,9 +35,10 @@ func (Command) RunCommand(prog string, args []string, stdin io.Reader, stdout, s f := flag.NewFlagSet(prog, flag.ContinueOnError) f.StringVar(&diag.projectName, "project-name", "scratch area for diagnostics", "name of project to find/create in home project and use for temporary/test objects") f.StringVar(&diag.logLevel, "log-level", "info", "logging level (debug, info, warning, error)") - f.StringVar(&diag.dockerImage, "docker-image", "alpine:latest", "image to use when running a test container") + f.StringVar(&diag.dockerImage, "docker-image", "", "image to use when running a test container (default: use embedded hello-world image)") f.BoolVar(&diag.checkInternal, "internal-client", false, "check that this host is considered an \"internal\" client") f.BoolVar(&diag.checkExternal, "external-client", false, "check that this host is considered an \"external\" client") + f.BoolVar(&diag.verbose, "v", false, "verbose: include more information in report") f.IntVar(&diag.priority, "priority", 500, "priority for test container (1..1000, or 0 to skip)") f.DurationVar(&diag.timeout, "timeout", 10*time.Second, "timeout for http requests") if ok, code := cmd.ParseFlags(f, prog, args, "", stderr); !ok { @@ -55,6 +61,11 @@ func (Command) RunCommand(prog string, args []string, stdin io.Reader, stdout, s } } +// docker save hello-world > hello-world.tar +// +//go:embed hello-world.tar +var HelloWorldDockerImage []byte + type diagnoser struct { stdout io.Writer stderr io.Writer @@ -64,6 +75,7 @@ type diagnoser struct { dockerImage string checkInternal bool checkExternal bool + verbose bool timeout time.Duration logger *logrus.Logger errors []string @@ -78,6 +90,12 @@ func (diag *diagnoser) infof(f string, args ...interface{}) { diag.logger.Infof(" ... "+f, args...) } +func (diag *diagnoser) verbosef(f string, args ...interface{}) { + if diag.verbose { + diag.logger.Infof(" ... "+f, args...) + } +} + func (diag *diagnoser) warnf(f string, args ...interface{}) { diag.logger.Warnf(" ... "+f, args...) } @@ -119,6 +137,62 @@ func (diag *diagnoser) runtests() { return } + hostname, err := os.Hostname() + if err != nil { + diag.warnf("error getting hostname: %s") + } else { + diag.verbosef("hostname = %s", hostname) + } + + diag.dotest(5, "running health check (same as `arvados-server check`)", func() error { + ldr := config.NewLoader(&bytes.Buffer{}, ctxlog.New(&bytes.Buffer{}, "text", "info")) + ldr.SetupFlags(flag.NewFlagSet("diagnostics", flag.ContinueOnError)) + cfg, err := ldr.Load() + if err != nil { + diag.infof("skipping because config could not be loaded: %s", err) + return nil + } + cluster, err := cfg.GetCluster("") + if err != nil { + return err + } + if cluster.SystemRootToken != os.Getenv("ARVADOS_API_TOKEN") { + return fmt.Errorf("diagnostics usage error: %s is readable but SystemRootToken does not match $ARVADOS_API_TOKEN (to fix, either run 'arvados-client sudo diagnostics' to load everything from config file, or set ARVADOS_CONFIG=- to load nothing from config file)", ldr.Path) + } + agg := &health.Aggregator{Cluster: cluster} + resp := agg.ClusterHealth() + for _, e := range resp.Errors { + diag.errorf("health check: %s", e) + } + if len(resp.Errors) > 0 { + diag.infof("consider running `arvados-server check -yaml` for a comprehensive report") + } + diag.verbosef("reported clock skew = %v", resp.ClockSkew) + reported := map[string]bool{} + for _, result := range resp.Checks { + version := strings.SplitN(result.Metrics.Version, " (go", 2)[0] + if version != "" && !reported[version] { + diag.verbosef("arvados version = %s", version) + reported[version] = true + } + } + reported = map[string]bool{} + for _, result := range resp.Checks { + if result.Server != "" && !reported[result.Server] { + diag.verbosef("http frontend version = %s", result.Server) + reported[result.Server] = true + } + } + reported = map[string]bool{} + for _, result := range resp.Checks { + if sha := result.ConfigSourceSHA256; sha != "" && !reported[sha] { + diag.verbosef("config file sha256 = %s", sha) + reported[sha] = true + } + } + return nil + }) + var dd arvados.DiscoveryDocument ddpath := "discovery/v1/apis/arvados/v1/rest" diag.dotest(10, fmt.Sprintf("getting discovery document from https://%s/%s", client.APIHost, ddpath), func() error { @@ -128,7 +202,7 @@ func (diag *diagnoser) runtests() { if err != nil { return err } - diag.debugf("BlobSignatureTTL = %d", dd.BlobSignatureTTL) + diag.verbosef("BlobSignatureTTL = %d", dd.BlobSignatureTTL) return nil }) @@ -142,7 +216,7 @@ func (diag *diagnoser) runtests() { if err != nil { return err } - diag.debugf("Collections.BlobSigning = %v", cluster.Collections.BlobSigning) + diag.verbosef("Collections.BlobSigning = %v", cluster.Collections.BlobSigning) cfgOK = true return nil }) @@ -155,7 +229,7 @@ func (diag *diagnoser) runtests() { if err != nil { return err } - diag.debugf("user uuid = %s", user.UUID) + diag.verbosef("user uuid = %s", user.UUID) return nil }) @@ -244,9 +318,9 @@ func (diag *diagnoser) runtests() { isInternal := found["proxy"] == 0 && len(keeplist.Items) > 0 isExternal := found["proxy"] > 0 && found["proxy"] == len(keeplist.Items) if isExternal { - diag.debugf("controller returned only proxy services, this host is treated as \"external\"") + diag.infof("controller returned only proxy services, this host is treated as \"external\"") } else if isInternal { - diag.debugf("controller returned only non-proxy services, this host is treated as \"internal\"") + diag.infof("controller returned only non-proxy services, this host is treated as \"internal\"") } if (diag.checkInternal && !isInternal) || (diag.checkExternal && !isExternal) { return fmt.Errorf("expecting internal=%v external=%v, but found internal=%v external=%v", diag.checkInternal, diag.checkExternal, isInternal, isExternal) @@ -323,7 +397,7 @@ func (diag *diagnoser) runtests() { } if len(grplist.Items) > 0 { project = grplist.Items[0] - diag.debugf("using existing project, uuid = %s", project.UUID) + diag.verbosef("using existing project, uuid = %s", project.UUID) return nil } diag.debugf("list groups: ok, no results") @@ -334,7 +408,7 @@ func (diag *diagnoser) runtests() { if err != nil { return fmt.Errorf("create project: %s", err) } - diag.debugf("created project, uuid = %s", project.UUID) + diag.verbosef("created project, uuid = %s", project.UUID) return nil }) @@ -354,7 +428,7 @@ func (diag *diagnoser) runtests() { if err != nil { return err } - diag.debugf("ok, uuid = %s", collection.UUID) + diag.verbosef("ok, uuid = %s", collection.UUID) return nil }) @@ -368,13 +442,38 @@ func (diag *diagnoser) runtests() { }() } + // Read hello-world.tar to find image ID, so we can upload it + // as "sha256:{...}.tar" + var imageSHA2 string + { + tr := tar.NewReader(bytes.NewReader(HelloWorldDockerImage)) + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + diag.errorf("internal error/bug: cannot read embedded docker image tar file: %s", err) + return + } + if s := strings.TrimSuffix(hdr.Name, ".json"); len(s) == 64 && s != hdr.Name { + imageSHA2 = s + } + } + if imageSHA2 == "" { + diag.errorf("internal error/bug: cannot find {sha256}.json file in embedded docker image tar file") + return + } + } + tarfilename := "sha256:" + imageSHA2 + ".tar" + diag.dotest(100, "uploading file via webdav", func() error { ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout)) defer cancel() if collection.UUID == "" { return fmt.Errorf("skipping, no test collection") } - req, err := http.NewRequestWithContext(ctx, "PUT", cluster.Services.WebDAVDownload.ExternalURL.String()+"c="+collection.UUID+"/testfile", bytes.NewBufferString("testfiledata")) + req, err := http.NewRequestWithContext(ctx, "PUT", cluster.Services.WebDAVDownload.ExternalURL.String()+"c="+collection.UUID+"/"+tarfilename, bytes.NewReader(HelloWorldDockerImage)) if err != nil { return fmt.Errorf("BUG? http.NewRequest: %s", err) } @@ -415,11 +514,11 @@ func (diag *diagnoser) runtests() { fileurl string }{ {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + "foo"}, - {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + "testfile"}, + {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + tarfilename}, {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/foo"}, - {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/testfile"}, - {true, true, http.StatusOK, strings.Replace(davurl.String(), "*", strings.Replace(collection.PortableDataHash, "+", "-", -1), 1) + "testfile"}, - {true, false, http.StatusOK, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=" + collection.UUID + "/_/testfile"}, + {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/" + tarfilename}, + {true, true, http.StatusOK, strings.Replace(davurl.String(), "*", strings.Replace(collection.PortableDataHash, "+", "-", -1), 1) + tarfilename}, + {true, false, http.StatusOK, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=" + collection.UUID + "/_/" + tarfilename}, } { diag.dotest(120+i, fmt.Sprintf("downloading from webdav (%s)", trial.fileurl), func() error { if trial.needWildcard && !davWildcard { @@ -448,8 +547,13 @@ func (diag *diagnoser) runtests() { if resp.StatusCode != trial.status { return fmt.Errorf("unexpected response status: %s", resp.Status) } - if trial.status == http.StatusOK && string(body) != "testfiledata" { - return fmt.Errorf("unexpected response content: %q", body) + if trial.status == http.StatusOK && !bytes.Equal(body, HelloWorldDockerImage) { + excerpt := body + if len(excerpt) > 128 { + excerpt = append([]byte(nil), body[:128]...) + excerpt = append(excerpt, []byte("[...]")...) + } + return fmt.Errorf("unexpected response content: len %d, %q", len(body), excerpt) } return nil }) @@ -555,16 +659,25 @@ func (diag *diagnoser) runtests() { return fmt.Errorf("skipping, no project to work in") } + timestamp := time.Now().Format(time.RFC3339) + ctrCommand := []string{"echo", timestamp} + if diag.dockerImage == "" { + if collection.UUID == "" { + return fmt.Errorf("skipping, no test collection to use as docker image") + } + diag.dockerImage = collection.PortableDataHash + ctrCommand = []string{"/hello"} + } + var cr arvados.ContainerRequest ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout)) defer cancel() - timestamp := time.Now().Format(time.RFC3339) err := client.RequestAndDecodeContext(ctx, &cr, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{"container_request": map[string]interface{}{ "owner_uuid": project.UUID, "name": fmt.Sprintf("diagnostics container request %s", timestamp), "container_image": diag.dockerImage, - "command": []string{"echo", timestamp}, + "command": ctrCommand, "use_existing": false, "output_path": "/mnt/output", "output_name": fmt.Sprintf("diagnostics output %s", timestamp), @@ -585,17 +698,16 @@ func (diag *diagnoser) runtests() { if err != nil { return err } - diag.debugf("container request uuid = %s", cr.UUID) - diag.debugf("container uuid = %s", cr.ContainerUUID) + diag.verbosef("container request uuid = %s", cr.UUID) + diag.verbosef("container uuid = %s", cr.ContainerUUID) timeout := 10 * time.Minute diag.infof("container request submitted, waiting up to %v for container to run", arvados.Duration(timeout)) - ctx, cancel = context.WithDeadline(context.Background(), time.Now().Add(timeout)) - defer cancel() + deadline := time.Now().Add(timeout) var c arvados.Container - for ; cr.State != arvados.ContainerRequestStateFinal; time.Sleep(2 * time.Second) { - ctx, cancel := context.WithDeadline(ctx, time.Now().Add(diag.timeout)) + for ; cr.State != arvados.ContainerRequestStateFinal && time.Now().Before(deadline); time.Sleep(2 * time.Second) { + ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout)) defer cancel() crStateWas := cr.State @@ -615,11 +727,26 @@ func (diag *diagnoser) runtests() { if c.State != cStateWas { diag.debugf("container state = %s", c.State) } + + cancel() } + if cr.State != arvados.ContainerRequestStateFinal { + err := client.RequestAndDecodeContext(context.Background(), &cr, "PATCH", "arvados/v1/container_requests/"+cr.UUID, nil, map[string]interface{}{ + "container_request": map[string]interface{}{ + "priority": 0, + }}) + if err != nil { + diag.infof("error canceling container request %s: %s", cr.UUID, err) + } else { + diag.debugf("canceled container request %s", cr.UUID) + } + return fmt.Errorf("timed out waiting for container to finish; container request %s state was %q, container %s state was %q", cr.UUID, cr.State, c.UUID, c.State) + } if c.State != arvados.ContainerStateComplete { return fmt.Errorf("container request %s is final but container %s did not complete: container state = %q", cr.UUID, cr.ContainerUUID, c.State) - } else if c.ExitCode != 0 { + } + if c.ExitCode != 0 { return fmt.Errorf("container exited %d", c.ExitCode) } return nil