X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/a9b9c6ff05e0268570b829bd62a6f683cf9f1d19..d5df19fdff62724f9faeb3bee17201363071bf9e:/lib/diagnostics/cmd.go diff --git a/lib/diagnostics/cmd.go b/lib/diagnostics/cmd.go index b0241b3ae4..9c229c9b4e 100644 --- a/lib/diagnostics/cmd.go +++ b/lib/diagnostics/cmd.go @@ -5,8 +5,10 @@ package diagnostics import ( + "archive/tar" "bytes" "context" + _ "embed" "flag" "fmt" "io" @@ -14,31 +16,33 @@ import ( "net" "net/http" "net/url" + "os" "strings" "time" + "git.arvados.org/arvados.git/lib/cmd" + "git.arvados.org/arvados.git/lib/config" "git.arvados.org/arvados.git/sdk/go/arvados" "git.arvados.org/arvados.git/sdk/go/ctxlog" + "git.arvados.org/arvados.git/sdk/go/health" "github.com/sirupsen/logrus" ) type Command struct{} -func (cmd Command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { +func (Command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int { var diag diagnoser f := flag.NewFlagSet(prog, flag.ContinueOnError) f.StringVar(&diag.projectName, "project-name", "scratch area for diagnostics", "name of project to find/create in home project and use for temporary/test objects") f.StringVar(&diag.logLevel, "log-level", "info", "logging level (debug, info, warning, error)") + f.StringVar(&diag.dockerImage, "docker-image", "", "image to use when running a test container (default: use embedded hello-world image)") f.BoolVar(&diag.checkInternal, "internal-client", false, "check that this host is considered an \"internal\" client") f.BoolVar(&diag.checkExternal, "external-client", false, "check that this host is considered an \"external\" client") + f.BoolVar(&diag.verbose, "v", false, "verbose: include more information in report") f.IntVar(&diag.priority, "priority", 500, "priority for test container (1..1000, or 0 to skip)") f.DurationVar(&diag.timeout, "timeout", 10*time.Second, "timeout for http requests") - err := f.Parse(args) - if 
err == flag.ErrHelp { - return 0 - } else if err != nil { - fmt.Fprintln(stderr, err) - return 2 + if ok, code := cmd.ParseFlags(f, prog, args, "", stderr); !ok { + return code } diag.logger = ctxlog.New(stdout, "text", diag.logLevel) diag.logger.SetFormatter(&logrus.TextFormatter{DisableTimestamp: true, DisableLevelTruncation: true, PadLevelText: true}) @@ -57,14 +61,21 @@ func (cmd Command) RunCommand(prog string, args []string, stdin io.Reader, stdou } } +// docker save hello-world > hello-world.tar +// +//go:embed hello-world.tar +var HelloWorldDockerImage []byte + type diagnoser struct { stdout io.Writer stderr io.Writer logLevel string priority int projectName string + dockerImage string checkInternal bool checkExternal bool + verbose bool timeout time.Duration logger *logrus.Logger errors []string @@ -79,6 +90,12 @@ func (diag *diagnoser) infof(f string, args ...interface{}) { diag.logger.Infof(" ... "+f, args...) } +func (diag *diagnoser) verbosef(f string, args ...interface{}) { + if diag.verbose { + diag.logger.Infof(" ... "+f, args...) + } +} + func (diag *diagnoser) warnf(f string, args ...interface{}) { diag.logger.Warnf(" ... "+f, args...) 
} @@ -120,6 +137,62 @@ func (diag *diagnoser) runtests() { return } + hostname, err := os.Hostname() + if err != nil { + diag.warnf("error getting hostname: %s", err) + } else { + diag.verbosef("hostname = %s", hostname) + } + + diag.dotest(5, "running health check (same as `arvados-server check`)", func() error { + ldr := config.NewLoader(&bytes.Buffer{}, ctxlog.New(&bytes.Buffer{}, "text", "info")) + ldr.SetupFlags(flag.NewFlagSet("diagnostics", flag.ContinueOnError)) + cfg, err := ldr.Load() + if err != nil { + diag.infof("skipping because config could not be loaded: %s", err) + return nil + } + cluster, err := cfg.GetCluster("") + if err != nil { + return err + } + if cluster.SystemRootToken != os.Getenv("ARVADOS_API_TOKEN") { + return fmt.Errorf("diagnostics usage error: %s is readable but SystemRootToken does not match $ARVADOS_API_TOKEN (to fix, either run 'arvados-client sudo diagnostics' to load everything from config file, or set ARVADOS_CONFIG=- to load nothing from config file)", ldr.Path) + } + agg := &health.Aggregator{Cluster: cluster} + resp := agg.ClusterHealth() + for _, e := range resp.Errors { + diag.errorf("health check: %s", e) + } + if len(resp.Errors) > 0 { + diag.infof("consider running `arvados-server check -yaml` for a comprehensive report") + } + diag.verbosef("reported clock skew = %v", resp.ClockSkew) + reported := map[string]bool{} + for _, result := range resp.Checks { + version := strings.SplitN(result.Metrics.Version, " (go", 2)[0] + if version != "" && !reported[version] { + diag.verbosef("arvados version = %s", version) + reported[version] = true + } + } + reported = map[string]bool{} + for _, result := range resp.Checks { + if result.Server != "" && !reported[result.Server] { + diag.verbosef("http frontend version = %s", result.Server) + reported[result.Server] = true + } + } + reported = map[string]bool{} + for _, result := range resp.Checks { + if sha := result.ConfigSourceSHA256; sha != "" && !reported[sha] { + 
diag.verbosef("config file sha256 = %s", sha) + reported[sha] = true + } + } + return nil + }) + var dd arvados.DiscoveryDocument ddpath := "discovery/v1/apis/arvados/v1/rest" diag.dotest(10, fmt.Sprintf("getting discovery document from https://%s/%s", client.APIHost, ddpath), func() error { @@ -129,12 +202,13 @@ func (diag *diagnoser) runtests() { if err != nil { return err } - diag.debugf("BlobSignatureTTL = %d", dd.BlobSignatureTTL) + diag.verbosef("BlobSignatureTTL = %d", dd.BlobSignatureTTL) return nil }) var cluster arvados.Cluster cfgpath := "arvados/v1/config" + cfgOK := false diag.dotest(20, fmt.Sprintf("getting exported config from https://%s/%s", client.APIHost, cfgpath), func() error { ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout)) defer cancel() @@ -142,7 +216,8 @@ func (diag *diagnoser) runtests() { if err != nil { return err } - diag.debugf("Collections.BlobSigning = %v", cluster.Collections.BlobSigning) + diag.verbosef("Collections.BlobSigning = %v", cluster.Collections.BlobSigning) + cfgOK = true return nil }) @@ -154,10 +229,15 @@ func (diag *diagnoser) runtests() { if err != nil { return err } - diag.debugf("user uuid = %s", user.UUID) + diag.verbosef("user uuid = %s", user.UUID) return nil }) + if !cfgOK { + diag.errorf("cannot proceed without cluster config -- aborting without running any further tests") + return + } + // uncomment to create some spurious errors // cluster.Services.WebDAVDownload.ExternalURL.Host = "0.0.0.0:9" @@ -238,9 +318,9 @@ func (diag *diagnoser) runtests() { isInternal := found["proxy"] == 0 && len(keeplist.Items) > 0 isExternal := found["proxy"] > 0 && found["proxy"] == len(keeplist.Items) if isExternal { - diag.debugf("controller returned only proxy services, this host is treated as \"external\"") + diag.verbosef("controller returned only proxy services, this host is treated as \"external\"") } else if isInternal { - diag.debugf("controller returned only non-proxy services, this 
host is treated as \"internal\"") + diag.verbosef("controller returned only non-proxy services, this host is treated as \"internal\"") } if (diag.checkInternal && !isInternal) || (diag.checkExternal && !isExternal) { return fmt.Errorf("expecting internal=%v external=%v, but found internal=%v external=%v", diag.checkInternal, diag.checkExternal, isInternal, isExternal) @@ -317,7 +397,7 @@ func (diag *diagnoser) runtests() { } if len(grplist.Items) > 0 { project = grplist.Items[0] - diag.debugf("using existing project, uuid = %s", project.UUID) + diag.verbosef("using existing project, uuid = %s", project.UUID) return nil } diag.debugf("list groups: ok, no results") @@ -328,7 +408,7 @@ func (diag *diagnoser) runtests() { if err != nil { return fmt.Errorf("create project: %s", err) } - diag.debugf("created project, uuid = %s", project.UUID) + diag.verbosef("created project, uuid = %s", project.UUID) return nil }) @@ -348,7 +428,7 @@ func (diag *diagnoser) runtests() { if err != nil { return err } - diag.debugf("ok, uuid = %s", collection.UUID) + diag.verbosef("ok, uuid = %s", collection.UUID) return nil }) @@ -362,13 +442,38 @@ func (diag *diagnoser) runtests() { }() } + // Read hello-world.tar to find image ID, so we can upload it + // as "sha256:{...}.tar" + var imageSHA2 string + { + tr := tar.NewReader(bytes.NewReader(HelloWorldDockerImage)) + for { + hdr, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + diag.errorf("internal error/bug: cannot read embedded docker image tar file: %s", err) + return + } + if s := strings.TrimSuffix(hdr.Name, ".json"); len(s) == 64 && s != hdr.Name { + imageSHA2 = s + } + } + if imageSHA2 == "" { + diag.errorf("internal error/bug: cannot find {sha256}.json file in embedded docker image tar file") + return + } + } + tarfilename := "sha256:" + imageSHA2 + ".tar" + diag.dotest(100, "uploading file via webdav", func() error { ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout)) 
defer cancel() if collection.UUID == "" { return fmt.Errorf("skipping, no test collection") } - req, err := http.NewRequestWithContext(ctx, "PUT", cluster.Services.WebDAVDownload.ExternalURL.String()+"c="+collection.UUID+"/testfile", bytes.NewBufferString("testfiledata")) + req, err := http.NewRequestWithContext(ctx, "PUT", cluster.Services.WebDAVDownload.ExternalURL.String()+"c="+collection.UUID+"/"+tarfilename, bytes.NewReader(HelloWorldDockerImage)) if err != nil { return fmt.Errorf("BUG? http.NewRequest: %s", err) } @@ -391,29 +496,35 @@ func (diag *diagnoser) runtests() { }) davurl := cluster.Services.WebDAV.ExternalURL + davWildcard := strings.HasPrefix(davurl.Host, "*--") || strings.HasPrefix(davurl.Host, "*.") diag.dotest(110, fmt.Sprintf("checking WebDAV ExternalURL wildcard (%s)", davurl), func() error { if davurl.Host == "" { return fmt.Errorf("host missing - content previews will not work") } - if !strings.HasPrefix(davurl.Host, "*--") && !strings.HasPrefix(davurl.Host, "*.") && !cluster.Collections.TrustAllContent { + if !davWildcard && !cluster.Collections.TrustAllContent { diag.warnf("WebDAV ExternalURL has no leading wildcard and TrustAllContent==false - content previews will not work") } return nil }) for i, trial := range []struct { - needcoll bool - status int - fileurl string + needcoll bool + needWildcard bool + status int + fileurl string }{ - {false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + "foo"}, - {false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + "testfile"}, - {false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/foo"}, - {false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/testfile"}, - {true, http.StatusOK, strings.Replace(davurl.String(), "*", 
strings.Replace(collection.PortableDataHash, "+", "-", -1), 1) + "testfile"}, - {true, http.StatusOK, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=" + collection.UUID + "/_/testfile"}, + {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + "foo"}, + {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + tarfilename}, + {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/foo"}, + {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/" + tarfilename}, + {true, true, http.StatusOK, strings.Replace(davurl.String(), "*", strings.Replace(collection.PortableDataHash, "+", "-", -1), 1) + tarfilename}, + {true, false, http.StatusOK, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=" + collection.UUID + "/_/" + tarfilename}, } { diag.dotest(120+i, fmt.Sprintf("downloading from webdav (%s)", trial.fileurl), func() error { + if trial.needWildcard && !davWildcard { + diag.warnf("skipping collection-id-in-vhost test because WebDAV ExternalURL has no leading wildcard") + return nil + } ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout)) defer cancel() if trial.needcoll && collection.UUID == "" { @@ -436,8 +547,13 @@ func (diag *diagnoser) runtests() { if resp.StatusCode != trial.status { return fmt.Errorf("unexpected response status: %s", resp.Status) } - if trial.status == http.StatusOK && string(body) != "testfiledata" { - return fmt.Errorf("unexpected response content: %q", body) + if trial.status == http.StatusOK && !bytes.Equal(body, HelloWorldDockerImage) { + excerpt := body + if len(excerpt) > 128 { + excerpt = append([]byte(nil), body[:128]...) + excerpt = append(excerpt, []byte("[...]")...) 
+ } + return fmt.Errorf("unexpected response content: len %d, %q", len(body), excerpt) } return nil }) @@ -453,9 +569,10 @@ func (diag *diagnoser) runtests() { return err } if len(vmlist.Items) < 1 { - return fmt.Errorf("no VMs found") + diag.warnf("no VMs found") + } else { + vm = vmlist.Items[0] } - vm = vmlist.Items[0] return nil }) @@ -463,7 +580,8 @@ func (diag *diagnoser) runtests() { ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout)) defer cancel() if vm.UUID == "" { - return fmt.Errorf("skipping, no vm available") + diag.warnf("skipping, no vm available") + return nil } webshelltermurl := cluster.Services.Workbench1.ExternalURL.String() + "virtual_machines/" + vm.UUID + "/webshell/testusername" diag.debugf("url %s", webshelltermurl) @@ -491,7 +609,8 @@ func (diag *diagnoser) runtests() { ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout)) defer cancel() if vm.UUID == "" { - return fmt.Errorf("skipping, no vm available") + diag.warnf("skipping, no vm available") + return nil } u := cluster.Services.WebShell.ExternalURL webshellurl := u.String() + vm.Hostname + "?" 
@@ -540,16 +659,25 @@ func (diag *diagnoser) runtests() { return fmt.Errorf("skipping, no project to work in") } + timestamp := time.Now().Format(time.RFC3339) + ctrCommand := []string{"echo", timestamp} + if diag.dockerImage == "" { + if collection.UUID == "" { + return fmt.Errorf("skipping, no test collection to use as docker image") + } + diag.dockerImage = collection.PortableDataHash + ctrCommand = []string{"/hello"} + } + var cr arvados.ContainerRequest ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout)) defer cancel() - timestamp := time.Now().Format(time.RFC3339) err := client.RequestAndDecodeContext(ctx, &cr, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{"container_request": map[string]interface{}{ "owner_uuid": project.UUID, "name": fmt.Sprintf("diagnostics container request %s", timestamp), - "container_image": "arvados/jobs", - "command": []string{"echo", timestamp}, + "container_image": diag.dockerImage, + "command": ctrCommand, "use_existing": false, "output_path": "/mnt/output", "output_name": fmt.Sprintf("diagnostics output %s", timestamp), @@ -570,8 +698,8 @@ func (diag *diagnoser) runtests() { if err != nil { return err } - diag.debugf("container request uuid = %s", cr.UUID) - diag.debugf("container uuid = %s", cr.ContainerUUID) + diag.verbosef("container request uuid = %s", cr.UUID) + diag.verbosef("container uuid = %s", cr.ContainerUUID) timeout := 10 * time.Minute diag.infof("container request submitted, waiting up to %v for container to run", arvados.Duration(timeout))