Merge branch '21508-io-panel-performance' into main. Closes #21508
[arvados.git] / lib / diagnostics / cmd.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package diagnostics
6
7 import (
8         "archive/tar"
9         "bytes"
10         "context"
11         "crypto/sha256"
12         _ "embed"
13         "flag"
14         "fmt"
15         "io"
16         "io/ioutil"
17         "net"
18         "net/http"
19         "net/url"
20         "os"
21         "os/exec"
22         "strings"
23         "time"
24
25         "git.arvados.org/arvados.git/lib/cmd"
26         "git.arvados.org/arvados.git/lib/config"
27         "git.arvados.org/arvados.git/sdk/go/arvados"
28         "git.arvados.org/arvados.git/sdk/go/ctxlog"
29         "git.arvados.org/arvados.git/sdk/go/health"
30         "github.com/sirupsen/logrus"
31 )
32
33 type Command struct{}
34
35 func (Command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
36         var diag diagnoser
37         f := flag.NewFlagSet(prog, flag.ContinueOnError)
38         f.StringVar(&diag.projectName, "project-name", "scratch area for diagnostics", "`name` of project to find/create in home project and use for temporary/test objects")
39         f.StringVar(&diag.logLevel, "log-level", "info", "logging `level` (debug, info, warning, error)")
40         f.StringVar(&diag.dockerImage, "docker-image", "", "`image` (tag or portable data hash) to use when running a test container, or \"hello-world\" to use embedded hello-world image (default: build a custom image containing this executable, and run diagnostics inside the container too)")
41         f.StringVar(&diag.dockerImageFrom, "docker-image-from", "debian:stable-slim", "`base` image to use when building a custom image (see https://doc.arvados.org/main/admin/diagnostics.html#container-options)")
42         f.BoolVar(&diag.checkInternal, "internal-client", false, "check that this host is considered an \"internal\" client")
43         f.BoolVar(&diag.checkExternal, "external-client", false, "check that this host is considered an \"external\" client")
44         f.BoolVar(&diag.verbose, "v", false, "verbose: include more information in report")
45         f.IntVar(&diag.priority, "priority", 500, "priority for test container (1..1000, or 0 to skip)")
46         f.DurationVar(&diag.timeout, "timeout", 10*time.Second, "timeout for http requests")
47         if ok, code := cmd.ParseFlags(f, prog, args, "", stderr); !ok {
48                 return code
49         }
50         diag.stdout = stdout
51         diag.stderr = stderr
52         diag.logger = ctxlog.New(stdout, "text", diag.logLevel)
53         diag.logger.SetFormatter(&logrus.TextFormatter{DisableTimestamp: true, DisableLevelTruncation: true, PadLevelText: true})
54         diag.runtests()
55         if len(diag.errors) == 0 {
56                 diag.logger.Info("--- no errors ---")
57                 return 0
58         } else {
59                 if diag.logger.Level > logrus.ErrorLevel {
60                         fmt.Fprint(stdout, "\n--- cut here --- error summary ---\n\n")
61                         for _, e := range diag.errors {
62                                 diag.logger.Error(e)
63                         }
64                 }
65                 return 1
66         }
67 }
68
69 // docker save hello-world > hello-world.tar
70 //
71 //go:embed hello-world.tar
72 var HelloWorldDockerImage []byte
73
74 type diagnoser struct {
75         stdout          io.Writer
76         stderr          io.Writer
77         logLevel        string
78         priority        int
79         projectName     string
80         dockerImage     string
81         dockerImageFrom string
82         checkInternal   bool
83         checkExternal   bool
84         verbose         bool
85         timeout         time.Duration
86         logger          *logrus.Logger
87         errors          []string
88         done            map[int]bool
89 }
90
91 func (diag *diagnoser) debugf(f string, args ...interface{}) {
92         diag.logger.Debugf("  ... "+f, args...)
93 }
94
95 func (diag *diagnoser) infof(f string, args ...interface{}) {
96         diag.logger.Infof("  ... "+f, args...)
97 }
98
99 func (diag *diagnoser) verbosef(f string, args ...interface{}) {
100         if diag.verbose {
101                 diag.logger.Infof("  ... "+f, args...)
102         }
103 }
104
105 func (diag *diagnoser) warnf(f string, args ...interface{}) {
106         diag.logger.Warnf("  ... "+f, args...)
107 }
108
109 func (diag *diagnoser) errorf(f string, args ...interface{}) {
110         diag.logger.Errorf(f, args...)
111         diag.errors = append(diag.errors, fmt.Sprintf(f, args...))
112 }
113
114 // Run the given func, logging appropriate messages before and after,
115 // adding timing info, etc.
116 //
117 // The id argument should be unique among tests, and shouldn't change
118 // when other tests are added/removed.
119 func (diag *diagnoser) dotest(id int, title string, fn func() error) {
120         if diag.done == nil {
121                 diag.done = map[int]bool{}
122         } else if diag.done[id] {
123                 diag.errorf("(bug) reused test id %d", id)
124         }
125         diag.done[id] = true
126
127         diag.logger.Infof("%4d: %s", id, title)
128         t0 := time.Now()
129         err := fn()
130         elapsed := fmt.Sprintf("%d ms", time.Now().Sub(t0)/time.Millisecond)
131         if err != nil {
132                 diag.errorf("%4d: %s (%s): %s", id, title, elapsed, err)
133         } else {
134                 diag.logger.Debugf("%4d: %s (%s): ok", id, title, elapsed)
135         }
136 }
137
138 func (diag *diagnoser) runtests() {
139         client := arvados.NewClientFromEnv()
140         // Disable auto-retry, use context instead
141         client.Timeout = 0
142
143         if client.APIHost == "" || client.AuthToken == "" {
144                 diag.errorf("ARVADOS_API_HOST and ARVADOS_API_TOKEN environment variables are not set -- aborting without running any tests")
145                 return
146         }
147
148         hostname, err := os.Hostname()
149         if err != nil {
150                 diag.warnf("error getting hostname: %s")
151         } else {
152                 diag.verbosef("hostname = %s", hostname)
153         }
154
155         diag.dotest(5, "running health check (same as `arvados-server check`)", func() error {
156                 ldr := config.NewLoader(&bytes.Buffer{}, ctxlog.New(&bytes.Buffer{}, "text", "info"))
157                 ldr.SetupFlags(flag.NewFlagSet("diagnostics", flag.ContinueOnError))
158                 cfg, err := ldr.Load()
159                 if err != nil {
160                         diag.infof("skipping because config could not be loaded: %s", err)
161                         return nil
162                 }
163                 cluster, err := cfg.GetCluster("")
164                 if err != nil {
165                         return err
166                 }
167                 if cluster.SystemRootToken != os.Getenv("ARVADOS_API_TOKEN") {
168                         return fmt.Errorf("diagnostics usage error: %s is readable but SystemRootToken does not match $ARVADOS_API_TOKEN (to fix, either run 'arvados-client sudo diagnostics' to load everything from config file, or set ARVADOS_CONFIG=- to load nothing from config file)", ldr.Path)
169                 }
170                 agg := &health.Aggregator{Cluster: cluster}
171                 resp := agg.ClusterHealth()
172                 for _, e := range resp.Errors {
173                         diag.errorf("health check: %s", e)
174                 }
175                 if len(resp.Errors) > 0 {
176                         diag.infof("consider running `arvados-server check -yaml` for a comprehensive report")
177                 }
178                 diag.verbosef("reported clock skew = %v", resp.ClockSkew)
179                 reported := map[string]bool{}
180                 for _, result := range resp.Checks {
181                         version := strings.SplitN(result.Metrics.Version, " (go", 2)[0]
182                         if version != "" && !reported[version] {
183                                 diag.verbosef("arvados version = %s", version)
184                                 reported[version] = true
185                         }
186                 }
187                 reported = map[string]bool{}
188                 for _, result := range resp.Checks {
189                         if result.Server != "" && !reported[result.Server] {
190                                 diag.verbosef("http frontend version = %s", result.Server)
191                                 reported[result.Server] = true
192                         }
193                 }
194                 reported = map[string]bool{}
195                 for _, result := range resp.Checks {
196                         if sha := result.ConfigSourceSHA256; sha != "" && !reported[sha] {
197                                 diag.verbosef("config file sha256 = %s", sha)
198                                 reported[sha] = true
199                         }
200                 }
201                 return nil
202         })
203
204         var dd arvados.DiscoveryDocument
205         ddpath := "discovery/v1/apis/arvados/v1/rest"
206         diag.dotest(10, fmt.Sprintf("getting discovery document from https://%s/%s", client.APIHost, ddpath), func() error {
207                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
208                 defer cancel()
209                 err := client.RequestAndDecodeContext(ctx, &dd, "GET", ddpath, nil, nil)
210                 if err != nil {
211                         return err
212                 }
213                 diag.verbosef("BlobSignatureTTL = %d", dd.BlobSignatureTTL)
214                 return nil
215         })
216
217         var cluster arvados.Cluster
218         cfgpath := "arvados/v1/config"
219         cfgOK := false
220         diag.dotest(20, fmt.Sprintf("getting exported config from https://%s/%s", client.APIHost, cfgpath), func() error {
221                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
222                 defer cancel()
223                 err := client.RequestAndDecodeContext(ctx, &cluster, "GET", cfgpath, nil, nil)
224                 if err != nil {
225                         return err
226                 }
227                 diag.verbosef("Collections.BlobSigning = %v", cluster.Collections.BlobSigning)
228                 cfgOK = true
229                 return nil
230         })
231
232         var user arvados.User
233         diag.dotest(30, "getting current user record", func() error {
234                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
235                 defer cancel()
236                 err := client.RequestAndDecodeContext(ctx, &user, "GET", "arvados/v1/users/current", nil, nil)
237                 if err != nil {
238                         return err
239                 }
240                 diag.verbosef("user uuid = %s", user.UUID)
241                 return nil
242         })
243
244         if !cfgOK {
245                 diag.errorf("cannot proceed without cluster config -- aborting without running any further tests")
246                 return
247         }
248
249         // uncomment to create some spurious errors
250         // cluster.Services.WebDAVDownload.ExternalURL.Host = "0.0.0.0:9"
251
252         // TODO: detect routing errors here, like finding wb2 at the
253         // wb1 address.
254         for i, svc := range []*arvados.Service{
255                 &cluster.Services.Keepproxy,
256                 &cluster.Services.WebDAV,
257                 &cluster.Services.WebDAVDownload,
258                 &cluster.Services.Websocket,
259                 &cluster.Services.Workbench1,
260                 &cluster.Services.Workbench2,
261         } {
262                 diag.dotest(40+i, fmt.Sprintf("connecting to service endpoint %s", svc.ExternalURL), func() error {
263                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
264                         defer cancel()
265                         u := svc.ExternalURL
266                         if strings.HasPrefix(u.Scheme, "ws") {
267                                 // We can do a real websocket test elsewhere,
268                                 // but for now we'll just check the https
269                                 // connection.
270                                 u.Scheme = "http" + u.Scheme[2:]
271                         }
272                         if svc == &cluster.Services.WebDAV && strings.HasPrefix(u.Host, "*") {
273                                 u.Host = "d41d8cd98f00b204e9800998ecf8427e-0" + u.Host[1:]
274                         }
275                         req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
276                         if err != nil {
277                                 return err
278                         }
279                         resp, err := http.DefaultClient.Do(req)
280                         if err != nil {
281                                 return err
282                         }
283                         resp.Body.Close()
284                         return nil
285                 })
286         }
287
288         for i, url := range []string{
289                 cluster.Services.Controller.ExternalURL.String(),
290                 cluster.Services.Keepproxy.ExternalURL.String() + "d41d8cd98f00b204e9800998ecf8427e+0",
291                 cluster.Services.WebDAVDownload.ExternalURL.String(),
292         } {
293                 diag.dotest(50+i, fmt.Sprintf("checking CORS headers at %s", url), func() error {
294                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
295                         defer cancel()
296                         req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
297                         if err != nil {
298                                 return err
299                         }
300                         req.Header.Set("Origin", "https://example.com")
301                         resp, err := http.DefaultClient.Do(req)
302                         if err != nil {
303                                 return err
304                         }
305                         if hdr := resp.Header.Get("Access-Control-Allow-Origin"); hdr != "*" {
306                                 return fmt.Errorf("expected \"Access-Control-Allow-Origin: *\", got %q", hdr)
307                         }
308                         return nil
309                 })
310         }
311
312         var keeplist arvados.KeepServiceList
313         diag.dotest(60, "checking internal/external client detection", func() error {
314                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
315                 defer cancel()
316                 err := client.RequestAndDecodeContext(ctx, &keeplist, "GET", "arvados/v1/keep_services/accessible", nil, arvados.ListOptions{Limit: 999999})
317                 if err != nil {
318                         return fmt.Errorf("error getting keep services list: %s", err)
319                 } else if len(keeplist.Items) == 0 {
320                         return fmt.Errorf("controller did not return any keep services")
321                 }
322                 found := map[string]int{}
323                 for _, ks := range keeplist.Items {
324                         found[ks.ServiceType]++
325                 }
326                 isInternal := found["proxy"] == 0 && len(keeplist.Items) > 0
327                 isExternal := found["proxy"] > 0 && found["proxy"] == len(keeplist.Items)
328                 if isExternal {
329                         diag.infof("controller returned only proxy services, this host is treated as \"external\"")
330                 } else if isInternal {
331                         diag.infof("controller returned only non-proxy services, this host is treated as \"internal\"")
332                 }
333                 if (diag.checkInternal && !isInternal) || (diag.checkExternal && !isExternal) {
334                         return fmt.Errorf("expecting internal=%v external=%v, but found internal=%v external=%v", diag.checkInternal, diag.checkExternal, isInternal, isExternal)
335                 }
336                 return nil
337         })
338
339         for i, ks := range keeplist.Items {
340                 u := url.URL{
341                         Scheme: "http",
342                         Host:   net.JoinHostPort(ks.ServiceHost, fmt.Sprintf("%d", ks.ServicePort)),
343                         Path:   "/",
344                 }
345                 if ks.ServiceSSLFlag {
346                         u.Scheme = "https"
347                 }
348                 diag.dotest(61+i, fmt.Sprintf("reading+writing via keep service at %s", u.String()), func() error {
349                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
350                         defer cancel()
351                         req, err := http.NewRequestWithContext(ctx, "PUT", u.String()+"d41d8cd98f00b204e9800998ecf8427e", nil)
352                         if err != nil {
353                                 return err
354                         }
355                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
356                         resp, err := http.DefaultClient.Do(req)
357                         if err != nil {
358                                 return err
359                         }
360                         defer resp.Body.Close()
361                         body, err := ioutil.ReadAll(resp.Body)
362                         if err != nil {
363                                 return fmt.Errorf("reading response body: %s", err)
364                         }
365                         loc := strings.TrimSpace(string(body))
366                         if !strings.HasPrefix(loc, "d41d8") {
367                                 return fmt.Errorf("unexpected response from write: %q", body)
368                         }
369
370                         req, err = http.NewRequestWithContext(ctx, "GET", u.String()+loc, nil)
371                         if err != nil {
372                                 return err
373                         }
374                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
375                         resp, err = http.DefaultClient.Do(req)
376                         if err != nil {
377                                 return err
378                         }
379                         defer resp.Body.Close()
380                         body, err = ioutil.ReadAll(resp.Body)
381                         if err != nil {
382                                 return fmt.Errorf("reading response body: %s", err)
383                         }
384                         if len(body) != 0 {
385                                 return fmt.Errorf("unexpected response from read: %q", body)
386                         }
387
388                         return nil
389                 })
390         }
391
392         var project arvados.Group
393         diag.dotest(80, fmt.Sprintf("finding/creating %q project", diag.projectName), func() error {
394                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
395                 defer cancel()
396                 var grplist arvados.GroupList
397                 err := client.RequestAndDecodeContext(ctx, &grplist, "GET", "arvados/v1/groups", nil, arvados.ListOptions{
398                         Filters: []arvados.Filter{
399                                 {"name", "=", diag.projectName},
400                                 {"group_class", "=", "project"},
401                                 {"owner_uuid", "=", user.UUID}},
402                         Limit: 999999})
403                 if err != nil {
404                         return fmt.Errorf("list groups: %s", err)
405                 }
406                 if len(grplist.Items) > 0 {
407                         project = grplist.Items[0]
408                         diag.verbosef("using existing project, uuid = %s", project.UUID)
409                         return nil
410                 }
411                 diag.debugf("list groups: ok, no results")
412                 err = client.RequestAndDecodeContext(ctx, &project, "POST", "arvados/v1/groups", nil, map[string]interface{}{"group": map[string]interface{}{
413                         "name":        diag.projectName,
414                         "group_class": "project",
415                 }})
416                 if err != nil {
417                         return fmt.Errorf("create project: %s", err)
418                 }
419                 diag.verbosef("created project, uuid = %s", project.UUID)
420                 return nil
421         })
422
423         var collection arvados.Collection
424         diag.dotest(90, "creating temporary collection", func() error {
425                 if project.UUID == "" {
426                         return fmt.Errorf("skipping, no project to work in")
427                 }
428                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
429                 defer cancel()
430                 err := client.RequestAndDecodeContext(ctx, &collection, "POST", "arvados/v1/collections", nil, map[string]interface{}{
431                         "ensure_unique_name": true,
432                         "collection": map[string]interface{}{
433                                 "owner_uuid": project.UUID,
434                                 "name":       "test collection",
435                                 "trash_at":   time.Now().Add(time.Hour)}})
436                 if err != nil {
437                         return err
438                 }
439                 diag.verbosef("ok, uuid = %s", collection.UUID)
440                 return nil
441         })
442
443         if collection.UUID != "" {
444                 defer func() {
445                         diag.dotest(9990, "deleting temporary collection", func() error {
446                                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
447                                 defer cancel()
448                                 return client.RequestAndDecodeContext(ctx, nil, "DELETE", "arvados/v1/collections/"+collection.UUID, nil, nil)
449                         })
450                 }()
451         }
452
453         tempdir, err := ioutil.TempDir("", "arvados-diagnostics")
454         if err != nil {
455                 diag.errorf("error creating temp dir: %s", err)
456                 return
457         }
458         defer os.RemoveAll(tempdir)
459
460         var dockerImageData []byte
461         if diag.dockerImage != "" || diag.priority < 1 {
462                 // We won't be using the self-built docker image, so
463                 // don't build it.  But we will write the embedded
464                 // "hello-world" image to our test collection to test
465                 // upload/download, whether or not we're using it as a
466                 // docker image.
467                 dockerImageData = HelloWorldDockerImage
468         } else if selfbin, err := os.Readlink("/proc/self/exe"); err != nil {
469                 diag.errorf("readlink /proc/self/exe: %s", err)
470                 return
471         } else if selfbindata, err := os.ReadFile(selfbin); err != nil {
472                 diag.errorf("error reading %s: %s", selfbin, err)
473                 return
474         } else {
475                 selfbinSha := fmt.Sprintf("%x", sha256.Sum256(selfbindata))
476                 tag := "arvados-client-diagnostics:" + selfbinSha[:9]
477                 err := os.WriteFile(tempdir+"/arvados-client", selfbindata, 0777)
478                 if err != nil {
479                         diag.errorf("error writing %s: %s", tempdir+"/arvados-client", err)
480                         return
481                 }
482
483                 dockerfile := "FROM " + diag.dockerImageFrom + "\n"
484                 dockerfile += "RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends libfuse2 ca-certificates && apt-get clean\n"
485                 dockerfile += "COPY /arvados-client /arvados-client\n"
486                 cmd := exec.Command("docker", "build", "--tag", tag, "-f", "-", tempdir)
487                 cmd.Stdin = strings.NewReader(dockerfile)
488                 cmd.Stdout = diag.stderr
489                 cmd.Stderr = diag.stderr
490                 err = cmd.Run()
491                 if err != nil {
492                         diag.errorf("error building docker image: %s", err)
493                         return
494                 }
495                 checkversion, err := exec.Command("docker", "run", tag, "/arvados-client", "version").CombinedOutput()
496                 if err != nil {
497                         diag.errorf("docker image does not seem to work: %s", err)
498                         return
499                 }
500                 diag.infof("arvados-client version: %s", checkversion)
501
502                 buf, err := exec.Command("docker", "save", tag).Output()
503                 if err != nil {
504                         diag.errorf("docker save %s: %s", tag, err)
505                         return
506                 }
507                 diag.infof("docker image size is %d", len(buf))
508                 dockerImageData = buf
509         }
510
511         // Read image tarball to find image ID, so we can upload it as
512         // "sha256:{...}.tar"
513         var imageSHA2 string
514         {
515                 tr := tar.NewReader(bytes.NewReader(dockerImageData))
516                 for {
517                         hdr, err := tr.Next()
518                         if err == io.EOF {
519                                 break
520                         }
521                         if err != nil {
522                                 diag.errorf("internal error/bug: cannot read docker image tar file: %s", err)
523                                 return
524                         }
525                         if s := strings.TrimSuffix(hdr.Name, ".json"); len(s) == 64 && s != hdr.Name {
526                                 imageSHA2 = s
527                         }
528                 }
529                 if imageSHA2 == "" {
530                         diag.errorf("internal error/bug: cannot find {sha256}.json file in docker image tar file")
531                         return
532                 }
533         }
534         tarfilename := "sha256:" + imageSHA2 + ".tar"
535
536         diag.dotest(100, "uploading file via webdav", func() error {
537                 timeout := diag.timeout
538                 if len(dockerImageData) > 10<<20 && timeout < time.Minute {
539                         // Extend the normal http timeout if we're
540                         // uploading a substantial docker image.
541                         timeout = time.Minute
542                 }
543                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(timeout))
544                 defer cancel()
545                 if collection.UUID == "" {
546                         return fmt.Errorf("skipping, no test collection")
547                 }
548                 t0 := time.Now()
549                 req, err := http.NewRequestWithContext(ctx, "PUT", cluster.Services.WebDAVDownload.ExternalURL.String()+"c="+collection.UUID+"/"+tarfilename, bytes.NewReader(dockerImageData))
550                 if err != nil {
551                         return fmt.Errorf("BUG? http.NewRequest: %s", err)
552                 }
553                 req.Header.Set("Authorization", "Bearer "+client.AuthToken)
554                 resp, err := http.DefaultClient.Do(req)
555                 if err != nil {
556                         return fmt.Errorf("error performing http request: %s", err)
557                 }
558                 resp.Body.Close()
559                 if resp.StatusCode != http.StatusCreated {
560                         return fmt.Errorf("status %s", resp.Status)
561                 }
562                 diag.verbosef("upload ok, status %s, %f MB/s", resp.Status, float64(len(dockerImageData))/time.Since(t0).Seconds()/1000000)
563                 err = client.RequestAndDecodeContext(ctx, &collection, "GET", "arvados/v1/collections/"+collection.UUID, nil, nil)
564                 if err != nil {
565                         return fmt.Errorf("get updated collection: %s", err)
566                 }
567                 diag.verbosef("upload pdh %s", collection.PortableDataHash)
568                 return nil
569         })
570
571         davurl := cluster.Services.WebDAV.ExternalURL
572         davWildcard := strings.HasPrefix(davurl.Host, "*--") || strings.HasPrefix(davurl.Host, "*.")
573         diag.dotest(110, fmt.Sprintf("checking WebDAV ExternalURL wildcard (%s)", davurl), func() error {
574                 if davurl.Host == "" {
575                         return fmt.Errorf("host missing - content previews will not work")
576                 }
577                 if !davWildcard && !cluster.Collections.TrustAllContent {
578                         diag.warnf("WebDAV ExternalURL has no leading wildcard and TrustAllContent==false - content previews will not work")
579                 }
580                 return nil
581         })
582
583         for i, trial := range []struct {
584                 needcoll     bool
585                 needWildcard bool
586                 status       int
587                 fileurl      string
588         }{
589                 {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + "foo"},
590                 {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + tarfilename},
591                 {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/foo"},
592                 {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/" + tarfilename},
593                 {true, true, http.StatusOK, strings.Replace(davurl.String(), "*", strings.Replace(collection.PortableDataHash, "+", "-", -1), 1) + tarfilename},
594                 {true, false, http.StatusOK, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=" + collection.UUID + "/_/" + tarfilename},
595         } {
596                 diag.dotest(120+i, fmt.Sprintf("downloading from webdav (%s)", trial.fileurl), func() error {
597                         if trial.needWildcard && !davWildcard {
598                                 diag.warnf("skipping collection-id-in-vhost test because WebDAV ExternalURL has no leading wildcard")
599                                 return nil
600                         }
601                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
602                         defer cancel()
603                         if trial.needcoll && collection.UUID == "" {
604                                 return fmt.Errorf("skipping, no test collection")
605                         }
606                         req, err := http.NewRequestWithContext(ctx, "GET", trial.fileurl, nil)
607                         if err != nil {
608                                 return err
609                         }
610                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
611                         resp, err := http.DefaultClient.Do(req)
612                         if err != nil {
613                                 return err
614                         }
615                         defer resp.Body.Close()
616                         body, err := ioutil.ReadAll(resp.Body)
617                         if err != nil {
618                                 return fmt.Errorf("reading response: %s", err)
619                         }
620                         if resp.StatusCode != trial.status {
621                                 return fmt.Errorf("unexpected response status: %s", resp.Status)
622                         }
623                         if trial.status == http.StatusOK && !bytes.Equal(body, dockerImageData) {
624                                 excerpt := body
625                                 if len(excerpt) > 128 {
626                                         excerpt = append([]byte(nil), body[:128]...)
627                                         excerpt = append(excerpt, []byte("[...]")...)
628                                 }
629                                 return fmt.Errorf("unexpected response content: len %d, %q", len(body), excerpt)
630                         }
631                         return nil
632                 })
633         }
634
635         var vm arvados.VirtualMachine
636         diag.dotest(130, "getting list of virtual machines", func() error {
637                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
638                 defer cancel()
639                 var vmlist arvados.VirtualMachineList
640                 err := client.RequestAndDecodeContext(ctx, &vmlist, "GET", "arvados/v1/virtual_machines", nil, arvados.ListOptions{Limit: 999999})
641                 if err != nil {
642                         return err
643                 }
644                 if len(vmlist.Items) < 1 {
645                         diag.warnf("no VMs found")
646                 } else {
647                         vm = vmlist.Items[0]
648                 }
649                 return nil
650         })
651
652         diag.dotest(150, "connecting to webshell service", func() error {
653                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
654                 defer cancel()
655                 if vm.UUID == "" {
656                         diag.warnf("skipping, no vm available")
657                         return nil
658                 }
659                 u := cluster.Services.WebShell.ExternalURL
660                 webshellurl := u.String() + vm.Hostname + "?"
661                 if strings.HasPrefix(u.Host, "*") {
662                         u.Host = vm.Hostname + u.Host[1:]
663                         webshellurl = u.String() + "?"
664                 }
665                 diag.debugf("url %s", webshellurl)
666                 req, err := http.NewRequestWithContext(ctx, "POST", webshellurl, bytes.NewBufferString(url.Values{
667                         "width":   {"80"},
668                         "height":  {"25"},
669                         "session": {"xyzzy"},
670                         "rooturl": {webshellurl},
671                 }.Encode()))
672                 if err != nil {
673                         return err
674                 }
675                 req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
676                 resp, err := http.DefaultClient.Do(req)
677                 if err != nil {
678                         return err
679                 }
680                 defer resp.Body.Close()
681                 diag.debugf("response status %s", resp.Status)
682                 body, err := ioutil.ReadAll(resp.Body)
683                 if err != nil {
684                         return fmt.Errorf("reading response: %s", err)
685                 }
686                 diag.debugf("response body %q", body)
687                 // We don't speak the protocol, so we get a 400 error
688                 // from the webshell server even if everything is
689                 // OK. Anything else (404, 502, ???) indicates a
690                 // problem.
691                 if resp.StatusCode != http.StatusBadRequest {
692                         return fmt.Errorf("unexpected response status: %s, %q", resp.Status, body)
693                 }
694                 return nil
695         })
696
697         diag.dotest(160, "running a container", func() error {
698                 if diag.priority < 1 {
699                         diag.infof("skipping (use priority > 0 if you want to run a container)")
700                         return nil
701                 }
702                 if project.UUID == "" {
703                         return fmt.Errorf("skipping, no project to work in")
704                 }
705
706                 timestamp := time.Now().Format(time.RFC3339)
707
708                 var ctrCommand []string
709                 switch diag.dockerImage {
710                 case "":
711                         if collection.UUID == "" {
712                                 return fmt.Errorf("skipping, no test collection to use as docker image")
713                         }
714                         diag.dockerImage = collection.PortableDataHash
715                         ctrCommand = []string{"/arvados-client", "diagnostics",
716                                 "-priority=0", // don't run a container
717                                 "-log-level=" + diag.logLevel,
718                                 "-internal-client=true"}
719                 case "hello-world":
720                         if collection.UUID == "" {
721                                 return fmt.Errorf("skipping, no test collection to use as docker image")
722                         }
723                         diag.dockerImage = collection.PortableDataHash
724                         ctrCommand = []string{"/hello"}
725                 default:
726                         ctrCommand = []string{"echo", timestamp}
727                 }
728
729                 var cr arvados.ContainerRequest
730                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
731                 defer cancel()
732
733                 err := client.RequestAndDecodeContext(ctx, &cr, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{"container_request": map[string]interface{}{
734                         "owner_uuid":      project.UUID,
735                         "name":            fmt.Sprintf("diagnostics container request %s", timestamp),
736                         "container_image": diag.dockerImage,
737                         "command":         ctrCommand,
738                         "use_existing":    false,
739                         "output_path":     "/mnt/output",
740                         "output_name":     fmt.Sprintf("diagnostics output %s", timestamp),
741                         "priority":        diag.priority,
742                         "state":           arvados.ContainerRequestStateCommitted,
743                         "mounts": map[string]map[string]interface{}{
744                                 "/mnt/output": {
745                                         "kind":     "collection",
746                                         "writable": true,
747                                 },
748                         },
749                         "runtime_constraints": arvados.RuntimeConstraints{
750                                 API:          true,
751                                 VCPUs:        1,
752                                 RAM:          128 << 20,
753                                 KeepCacheRAM: 64 << 20,
754                         },
755                 }})
756                 if err != nil {
757                         return err
758                 }
759                 diag.infof("container request uuid = %s", cr.UUID)
760                 diag.verbosef("container uuid = %s", cr.ContainerUUID)
761
762                 timeout := 10 * time.Minute
763                 diag.infof("container request submitted, waiting up to %v for container to run", arvados.Duration(timeout))
764                 deadline := time.Now().Add(timeout)
765
766                 var c arvados.Container
767                 for ; cr.State != arvados.ContainerRequestStateFinal && time.Now().Before(deadline); time.Sleep(2 * time.Second) {
768                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
769                         defer cancel()
770
771                         crStateWas := cr.State
772                         err := client.RequestAndDecodeContext(ctx, &cr, "GET", "arvados/v1/container_requests/"+cr.UUID, nil, nil)
773                         if err != nil {
774                                 return err
775                         }
776                         if cr.State != crStateWas {
777                                 diag.debugf("container request state = %s", cr.State)
778                         }
779
780                         cStateWas := c.State
781                         err = client.RequestAndDecodeContext(ctx, &c, "GET", "arvados/v1/containers/"+cr.ContainerUUID, nil, nil)
782                         if err != nil {
783                                 return err
784                         }
785                         if c.State != cStateWas {
786                                 diag.debugf("container state = %s", c.State)
787                         }
788
789                         cancel()
790                 }
791
792                 if cr.State != arvados.ContainerRequestStateFinal {
793                         err := client.RequestAndDecodeContext(context.Background(), &cr, "PATCH", "arvados/v1/container_requests/"+cr.UUID, nil, map[string]interface{}{
794                                 "container_request": map[string]interface{}{
795                                         "priority": 0,
796                                 }})
797                         if err != nil {
798                                 diag.infof("error canceling container request %s: %s", cr.UUID, err)
799                         } else {
800                                 diag.debugf("canceled container request %s", cr.UUID)
801                         }
802                         return fmt.Errorf("timed out waiting for container to finish; container request %s state was %q, container %s state was %q", cr.UUID, cr.State, c.UUID, c.State)
803                 }
804                 if c.State != arvados.ContainerStateComplete {
805                         return fmt.Errorf("container request %s is final but container %s did not complete: container state = %q", cr.UUID, cr.ContainerUUID, c.State)
806                 }
807                 if c.ExitCode != 0 {
808                         return fmt.Errorf("container exited %d", c.ExitCode)
809                 }
810                 return nil
811         })
812 }