19377: Add diagnostics -v flag: arvados/nginx/config versions, etc.
[arvados.git] / lib / diagnostics / cmd.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package diagnostics
6
7 import (
8         "archive/tar"
9         "bytes"
10         "context"
11         _ "embed"
12         "flag"
13         "fmt"
14         "io"
15         "io/ioutil"
16         "net"
17         "net/http"
18         "net/url"
19         "os"
20         "strings"
21         "time"
22
23         "git.arvados.org/arvados.git/lib/cmd"
24         "git.arvados.org/arvados.git/lib/config"
25         "git.arvados.org/arvados.git/sdk/go/arvados"
26         "git.arvados.org/arvados.git/sdk/go/ctxlog"
27         "git.arvados.org/arvados.git/sdk/go/health"
28         "github.com/sirupsen/logrus"
29 )
30
31 type Command struct{}
32
33 func (Command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
34         var diag diagnoser
35         f := flag.NewFlagSet(prog, flag.ContinueOnError)
36         f.StringVar(&diag.projectName, "project-name", "scratch area for diagnostics", "name of project to find/create in home project and use for temporary/test objects")
37         f.StringVar(&diag.logLevel, "log-level", "info", "logging level (debug, info, warning, error)")
38         f.StringVar(&diag.dockerImage, "docker-image", "", "image to use when running a test container (default: use embedded hello-world image)")
39         f.BoolVar(&diag.checkInternal, "internal-client", false, "check that this host is considered an \"internal\" client")
40         f.BoolVar(&diag.checkExternal, "external-client", false, "check that this host is considered an \"external\" client")
41         f.BoolVar(&diag.verbose, "v", false, "verbose: include more information in report")
42         f.IntVar(&diag.priority, "priority", 500, "priority for test container (1..1000, or 0 to skip)")
43         f.DurationVar(&diag.timeout, "timeout", 10*time.Second, "timeout for http requests")
44         if ok, code := cmd.ParseFlags(f, prog, args, "", stderr); !ok {
45                 return code
46         }
47         diag.logger = ctxlog.New(stdout, "text", diag.logLevel)
48         diag.logger.SetFormatter(&logrus.TextFormatter{DisableTimestamp: true, DisableLevelTruncation: true, PadLevelText: true})
49         diag.runtests()
50         if len(diag.errors) == 0 {
51                 diag.logger.Info("--- no errors ---")
52                 return 0
53         } else {
54                 if diag.logger.Level > logrus.ErrorLevel {
55                         fmt.Fprint(stdout, "\n--- cut here --- error summary ---\n\n")
56                         for _, e := range diag.errors {
57                                 diag.logger.Error(e)
58                         }
59                 }
60                 return 1
61         }
62 }
63
64 // docker save hello-world > hello-world.tar
65 //go:embed hello-world.tar
66 var HelloWorldDockerImage []byte
67
68 type diagnoser struct {
69         stdout        io.Writer
70         stderr        io.Writer
71         logLevel      string
72         priority      int
73         projectName   string
74         dockerImage   string
75         checkInternal bool
76         checkExternal bool
77         verbose       bool
78         timeout       time.Duration
79         logger        *logrus.Logger
80         errors        []string
81         done          map[int]bool
82 }
83
84 func (diag *diagnoser) debugf(f string, args ...interface{}) {
85         diag.logger.Debugf("  ... "+f, args...)
86 }
87
88 func (diag *diagnoser) infof(f string, args ...interface{}) {
89         diag.logger.Infof("  ... "+f, args...)
90 }
91
92 func (diag *diagnoser) verbosef(f string, args ...interface{}) {
93         if diag.verbose {
94                 diag.logger.Infof("  ... "+f, args...)
95         }
96 }
97
98 func (diag *diagnoser) warnf(f string, args ...interface{}) {
99         diag.logger.Warnf("  ... "+f, args...)
100 }
101
102 func (diag *diagnoser) errorf(f string, args ...interface{}) {
103         diag.logger.Errorf(f, args...)
104         diag.errors = append(diag.errors, fmt.Sprintf(f, args...))
105 }
106
107 // Run the given func, logging appropriate messages before and after,
108 // adding timing info, etc.
109 //
110 // The id argument should be unique among tests, and shouldn't change
111 // when other tests are added/removed.
112 func (diag *diagnoser) dotest(id int, title string, fn func() error) {
113         if diag.done == nil {
114                 diag.done = map[int]bool{}
115         } else if diag.done[id] {
116                 diag.errorf("(bug) reused test id %d", id)
117         }
118         diag.done[id] = true
119
120         diag.logger.Infof("%4d: %s", id, title)
121         t0 := time.Now()
122         err := fn()
123         elapsed := fmt.Sprintf("%d ms", time.Now().Sub(t0)/time.Millisecond)
124         if err != nil {
125                 diag.errorf("%4d: %s (%s): %s", id, title, elapsed, err)
126         } else {
127                 diag.logger.Debugf("%4d: %s (%s): ok", id, title, elapsed)
128         }
129 }
130
131 func (diag *diagnoser) runtests() {
132         client := arvados.NewClientFromEnv()
133
134         if client.APIHost == "" || client.AuthToken == "" {
135                 diag.errorf("ARVADOS_API_HOST and ARVADOS_API_TOKEN environment variables are not set -- aborting without running any tests")
136                 return
137         }
138
139         hostname, err := os.Hostname()
140         if err != nil {
141                 diag.warnf("error getting hostname: %s")
142         } else {
143                 diag.verbosef("hostname = %s", hostname)
144         }
145
146         diag.dotest(5, "running health check (same as `arvados-server check`)", func() error {
147                 ldr := config.NewLoader(&bytes.Buffer{}, ctxlog.New(&bytes.Buffer{}, "text", "info"))
148                 ldr.SetupFlags(flag.NewFlagSet("diagnostics", flag.ContinueOnError))
149                 cfg, err := ldr.Load()
150                 if err != nil {
151                         diag.infof("skipping because config could not be loaded: %s", err)
152                         return nil
153                 }
154                 cluster, err := cfg.GetCluster("")
155                 if err != nil {
156                         return err
157                 }
158                 if cluster.SystemRootToken != os.Getenv("ARVADOS_API_TOKEN") {
159                         diag.infof("skipping because provided token is not SystemRootToken")
160                 }
161                 agg := &health.Aggregator{Cluster: cluster}
162                 resp := agg.ClusterHealth()
163                 for _, e := range resp.Errors {
164                         diag.errorf("health check: %s", e)
165                 }
166                 diag.verbosef("reported clock skew = %v", resp.ClockSkew)
167                 reported := map[string]bool{}
168                 for _, result := range resp.Checks {
169                         version := strings.SplitN(result.Metrics.Version, " (go", 2)[0]
170                         if version != "" && !reported[version] {
171                                 diag.verbosef("arvados version = %s", version)
172                                 reported[version] = true
173                         }
174                 }
175                 reported = map[string]bool{}
176                 for _, result := range resp.Checks {
177                         if result.Server != "" && !reported[result.Server] {
178                                 diag.verbosef("http frontend version = %s", result.Server)
179                                 reported[result.Server] = true
180                         }
181                 }
182                 reported = map[string]bool{}
183                 for _, result := range resp.Checks {
184                         if sha := result.ConfigSourceSHA256; sha != "" && !reported[sha] {
185                                 diag.verbosef("config file sha256 = %s", sha)
186                                 reported[sha] = true
187                         }
188                 }
189                 return nil
190         })
191
192         var dd arvados.DiscoveryDocument
193         ddpath := "discovery/v1/apis/arvados/v1/rest"
194         diag.dotest(10, fmt.Sprintf("getting discovery document from https://%s/%s", client.APIHost, ddpath), func() error {
195                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
196                 defer cancel()
197                 err := client.RequestAndDecodeContext(ctx, &dd, "GET", ddpath, nil, nil)
198                 if err != nil {
199                         return err
200                 }
201                 diag.verbosef("BlobSignatureTTL = %d", dd.BlobSignatureTTL)
202                 return nil
203         })
204
205         var cluster arvados.Cluster
206         cfgpath := "arvados/v1/config"
207         cfgOK := false
208         diag.dotest(20, fmt.Sprintf("getting exported config from https://%s/%s", client.APIHost, cfgpath), func() error {
209                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
210                 defer cancel()
211                 err := client.RequestAndDecodeContext(ctx, &cluster, "GET", cfgpath, nil, nil)
212                 if err != nil {
213                         return err
214                 }
215                 diag.verbosef("Collections.BlobSigning = %v", cluster.Collections.BlobSigning)
216                 cfgOK = true
217                 return nil
218         })
219
220         var user arvados.User
221         diag.dotest(30, "getting current user record", func() error {
222                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
223                 defer cancel()
224                 err := client.RequestAndDecodeContext(ctx, &user, "GET", "arvados/v1/users/current", nil, nil)
225                 if err != nil {
226                         return err
227                 }
228                 diag.verbosef("user uuid = %s", user.UUID)
229                 return nil
230         })
231
232         if !cfgOK {
233                 diag.errorf("cannot proceed without cluster config -- aborting without running any further tests")
234                 return
235         }
236
237         // uncomment to create some spurious errors
238         // cluster.Services.WebDAVDownload.ExternalURL.Host = "0.0.0.0:9"
239
240         // TODO: detect routing errors here, like finding wb2 at the
241         // wb1 address.
242         for i, svc := range []*arvados.Service{
243                 &cluster.Services.Keepproxy,
244                 &cluster.Services.WebDAV,
245                 &cluster.Services.WebDAVDownload,
246                 &cluster.Services.Websocket,
247                 &cluster.Services.Workbench1,
248                 &cluster.Services.Workbench2,
249         } {
250                 diag.dotest(40+i, fmt.Sprintf("connecting to service endpoint %s", svc.ExternalURL), func() error {
251                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
252                         defer cancel()
253                         u := svc.ExternalURL
254                         if strings.HasPrefix(u.Scheme, "ws") {
255                                 // We can do a real websocket test elsewhere,
256                                 // but for now we'll just check the https
257                                 // connection.
258                                 u.Scheme = "http" + u.Scheme[2:]
259                         }
260                         if svc == &cluster.Services.WebDAV && strings.HasPrefix(u.Host, "*") {
261                                 u.Host = "d41d8cd98f00b204e9800998ecf8427e-0" + u.Host[1:]
262                         }
263                         req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
264                         if err != nil {
265                                 return err
266                         }
267                         resp, err := http.DefaultClient.Do(req)
268                         if err != nil {
269                                 return err
270                         }
271                         resp.Body.Close()
272                         return nil
273                 })
274         }
275
276         for i, url := range []string{
277                 cluster.Services.Controller.ExternalURL.String(),
278                 cluster.Services.Keepproxy.ExternalURL.String() + "d41d8cd98f00b204e9800998ecf8427e+0",
279                 cluster.Services.WebDAVDownload.ExternalURL.String(),
280         } {
281                 diag.dotest(50+i, fmt.Sprintf("checking CORS headers at %s", url), func() error {
282                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
283                         defer cancel()
284                         req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
285                         if err != nil {
286                                 return err
287                         }
288                         req.Header.Set("Origin", "https://example.com")
289                         resp, err := http.DefaultClient.Do(req)
290                         if err != nil {
291                                 return err
292                         }
293                         if hdr := resp.Header.Get("Access-Control-Allow-Origin"); hdr != "*" {
294                                 return fmt.Errorf("expected \"Access-Control-Allow-Origin: *\", got %q", hdr)
295                         }
296                         return nil
297                 })
298         }
299
300         var keeplist arvados.KeepServiceList
301         diag.dotest(60, "checking internal/external client detection", func() error {
302                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
303                 defer cancel()
304                 err := client.RequestAndDecodeContext(ctx, &keeplist, "GET", "arvados/v1/keep_services/accessible", nil, arvados.ListOptions{Limit: 999999})
305                 if err != nil {
306                         return fmt.Errorf("error getting keep services list: %s", err)
307                 } else if len(keeplist.Items) == 0 {
308                         return fmt.Errorf("controller did not return any keep services")
309                 }
310                 found := map[string]int{}
311                 for _, ks := range keeplist.Items {
312                         found[ks.ServiceType]++
313                 }
314                 isInternal := found["proxy"] == 0 && len(keeplist.Items) > 0
315                 isExternal := found["proxy"] > 0 && found["proxy"] == len(keeplist.Items)
316                 if isExternal {
317                         diag.verbosef("controller returned only proxy services, this host is treated as \"external\"")
318                 } else if isInternal {
319                         diag.verbosef("controller returned only non-proxy services, this host is treated as \"internal\"")
320                 }
321                 if (diag.checkInternal && !isInternal) || (diag.checkExternal && !isExternal) {
322                         return fmt.Errorf("expecting internal=%v external=%v, but found internal=%v external=%v", diag.checkInternal, diag.checkExternal, isInternal, isExternal)
323                 }
324                 return nil
325         })
326
327         for i, ks := range keeplist.Items {
328                 u := url.URL{
329                         Scheme: "http",
330                         Host:   net.JoinHostPort(ks.ServiceHost, fmt.Sprintf("%d", ks.ServicePort)),
331                         Path:   "/",
332                 }
333                 if ks.ServiceSSLFlag {
334                         u.Scheme = "https"
335                 }
336                 diag.dotest(61+i, fmt.Sprintf("reading+writing via keep service at %s", u.String()), func() error {
337                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
338                         defer cancel()
339                         req, err := http.NewRequestWithContext(ctx, "PUT", u.String()+"d41d8cd98f00b204e9800998ecf8427e", nil)
340                         if err != nil {
341                                 return err
342                         }
343                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
344                         resp, err := http.DefaultClient.Do(req)
345                         if err != nil {
346                                 return err
347                         }
348                         defer resp.Body.Close()
349                         body, err := ioutil.ReadAll(resp.Body)
350                         if err != nil {
351                                 return fmt.Errorf("reading response body: %s", err)
352                         }
353                         loc := strings.TrimSpace(string(body))
354                         if !strings.HasPrefix(loc, "d41d8") {
355                                 return fmt.Errorf("unexpected response from write: %q", body)
356                         }
357
358                         req, err = http.NewRequestWithContext(ctx, "GET", u.String()+loc, nil)
359                         if err != nil {
360                                 return err
361                         }
362                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
363                         resp, err = http.DefaultClient.Do(req)
364                         if err != nil {
365                                 return err
366                         }
367                         defer resp.Body.Close()
368                         body, err = ioutil.ReadAll(resp.Body)
369                         if err != nil {
370                                 return fmt.Errorf("reading response body: %s", err)
371                         }
372                         if len(body) != 0 {
373                                 return fmt.Errorf("unexpected response from read: %q", body)
374                         }
375
376                         return nil
377                 })
378         }
379
380         var project arvados.Group
381         diag.dotest(80, fmt.Sprintf("finding/creating %q project", diag.projectName), func() error {
382                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
383                 defer cancel()
384                 var grplist arvados.GroupList
385                 err := client.RequestAndDecodeContext(ctx, &grplist, "GET", "arvados/v1/groups", nil, arvados.ListOptions{
386                         Filters: []arvados.Filter{
387                                 {"name", "=", diag.projectName},
388                                 {"group_class", "=", "project"},
389                                 {"owner_uuid", "=", user.UUID}},
390                         Limit: 999999})
391                 if err != nil {
392                         return fmt.Errorf("list groups: %s", err)
393                 }
394                 if len(grplist.Items) > 0 {
395                         project = grplist.Items[0]
396                         diag.verbosef("using existing project, uuid = %s", project.UUID)
397                         return nil
398                 }
399                 diag.debugf("list groups: ok, no results")
400                 err = client.RequestAndDecodeContext(ctx, &project, "POST", "arvados/v1/groups", nil, map[string]interface{}{"group": map[string]interface{}{
401                         "name":        diag.projectName,
402                         "group_class": "project",
403                 }})
404                 if err != nil {
405                         return fmt.Errorf("create project: %s", err)
406                 }
407                 diag.verbosef("created project, uuid = %s", project.UUID)
408                 return nil
409         })
410
411         var collection arvados.Collection
412         diag.dotest(90, "creating temporary collection", func() error {
413                 if project.UUID == "" {
414                         return fmt.Errorf("skipping, no project to work in")
415                 }
416                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
417                 defer cancel()
418                 err := client.RequestAndDecodeContext(ctx, &collection, "POST", "arvados/v1/collections", nil, map[string]interface{}{
419                         "ensure_unique_name": true,
420                         "collection": map[string]interface{}{
421                                 "owner_uuid": project.UUID,
422                                 "name":       "test collection",
423                                 "trash_at":   time.Now().Add(time.Hour)}})
424                 if err != nil {
425                         return err
426                 }
427                 diag.verbosef("ok, uuid = %s", collection.UUID)
428                 return nil
429         })
430
431         if collection.UUID != "" {
432                 defer func() {
433                         diag.dotest(9990, "deleting temporary collection", func() error {
434                                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
435                                 defer cancel()
436                                 return client.RequestAndDecodeContext(ctx, nil, "DELETE", "arvados/v1/collections/"+collection.UUID, nil, nil)
437                         })
438                 }()
439         }
440
441         // Read hello-world.tar to find image ID, so we can upload it
442         // as "sha256:{...}.tar"
443         var imageSHA2 string
444         {
445                 tr := tar.NewReader(bytes.NewReader(HelloWorldDockerImage))
446                 for {
447                         hdr, err := tr.Next()
448                         if err == io.EOF {
449                                 break
450                         }
451                         if err != nil {
452                                 diag.errorf("internal error/bug: cannot read embedded docker image tar file: %s", err)
453                                 return
454                         }
455                         if s := strings.TrimSuffix(hdr.Name, ".json"); len(s) == 64 && s != hdr.Name {
456                                 imageSHA2 = s
457                         }
458                 }
459                 if imageSHA2 == "" {
460                         diag.errorf("internal error/bug: cannot find {sha256}.json file in embedded docker image tar file")
461                         return
462                 }
463         }
464         tarfilename := "sha256:" + imageSHA2 + ".tar"
465
466         diag.dotest(100, "uploading file via webdav", func() error {
467                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
468                 defer cancel()
469                 if collection.UUID == "" {
470                         return fmt.Errorf("skipping, no test collection")
471                 }
472                 req, err := http.NewRequestWithContext(ctx, "PUT", cluster.Services.WebDAVDownload.ExternalURL.String()+"c="+collection.UUID+"/"+tarfilename, bytes.NewReader(HelloWorldDockerImage))
473                 if err != nil {
474                         return fmt.Errorf("BUG? http.NewRequest: %s", err)
475                 }
476                 req.Header.Set("Authorization", "Bearer "+client.AuthToken)
477                 resp, err := http.DefaultClient.Do(req)
478                 if err != nil {
479                         return fmt.Errorf("error performing http request: %s", err)
480                 }
481                 resp.Body.Close()
482                 if resp.StatusCode != http.StatusCreated {
483                         return fmt.Errorf("status %s", resp.Status)
484                 }
485                 diag.debugf("ok, status %s", resp.Status)
486                 err = client.RequestAndDecodeContext(ctx, &collection, "GET", "arvados/v1/collections/"+collection.UUID, nil, nil)
487                 if err != nil {
488                         return fmt.Errorf("get updated collection: %s", err)
489                 }
490                 diag.debugf("ok, pdh %s", collection.PortableDataHash)
491                 return nil
492         })
493
494         davurl := cluster.Services.WebDAV.ExternalURL
495         davWildcard := strings.HasPrefix(davurl.Host, "*--") || strings.HasPrefix(davurl.Host, "*.")
496         diag.dotest(110, fmt.Sprintf("checking WebDAV ExternalURL wildcard (%s)", davurl), func() error {
497                 if davurl.Host == "" {
498                         return fmt.Errorf("host missing - content previews will not work")
499                 }
500                 if !davWildcard && !cluster.Collections.TrustAllContent {
501                         diag.warnf("WebDAV ExternalURL has no leading wildcard and TrustAllContent==false - content previews will not work")
502                 }
503                 return nil
504         })
505
506         for i, trial := range []struct {
507                 needcoll     bool
508                 needWildcard bool
509                 status       int
510                 fileurl      string
511         }{
512                 {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + "foo"},
513                 {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + tarfilename},
514                 {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/foo"},
515                 {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/" + tarfilename},
516                 {true, true, http.StatusOK, strings.Replace(davurl.String(), "*", strings.Replace(collection.PortableDataHash, "+", "-", -1), 1) + tarfilename},
517                 {true, false, http.StatusOK, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=" + collection.UUID + "/_/" + tarfilename},
518         } {
519                 diag.dotest(120+i, fmt.Sprintf("downloading from webdav (%s)", trial.fileurl), func() error {
520                         if trial.needWildcard && !davWildcard {
521                                 diag.warnf("skipping collection-id-in-vhost test because WebDAV ExternalURL has no leading wildcard")
522                                 return nil
523                         }
524                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
525                         defer cancel()
526                         if trial.needcoll && collection.UUID == "" {
527                                 return fmt.Errorf("skipping, no test collection")
528                         }
529                         req, err := http.NewRequestWithContext(ctx, "GET", trial.fileurl, nil)
530                         if err != nil {
531                                 return err
532                         }
533                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
534                         resp, err := http.DefaultClient.Do(req)
535                         if err != nil {
536                                 return err
537                         }
538                         defer resp.Body.Close()
539                         body, err := ioutil.ReadAll(resp.Body)
540                         if err != nil {
541                                 return fmt.Errorf("reading response: %s", err)
542                         }
543                         if resp.StatusCode != trial.status {
544                                 return fmt.Errorf("unexpected response status: %s", resp.Status)
545                         }
546                         if trial.status == http.StatusOK && !bytes.Equal(body, HelloWorldDockerImage) {
547                                 excerpt := body
548                                 if len(excerpt) > 128 {
549                                         excerpt = append([]byte(nil), body[:128]...)
550                                         excerpt = append(excerpt, []byte("[...]")...)
551                                 }
552                                 return fmt.Errorf("unexpected response content: len %d, %q", len(body), excerpt)
553                         }
554                         return nil
555                 })
556         }
557
558         var vm arvados.VirtualMachine
559         diag.dotest(130, "getting list of virtual machines", func() error {
560                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
561                 defer cancel()
562                 var vmlist arvados.VirtualMachineList
563                 err := client.RequestAndDecodeContext(ctx, &vmlist, "GET", "arvados/v1/virtual_machines", nil, arvados.ListOptions{Limit: 999999})
564                 if err != nil {
565                         return err
566                 }
567                 if len(vmlist.Items) < 1 {
568                         diag.warnf("no VMs found")
569                 } else {
570                         vm = vmlist.Items[0]
571                 }
572                 return nil
573         })
574
575         diag.dotest(140, "getting workbench1 webshell page", func() error {
576                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
577                 defer cancel()
578                 if vm.UUID == "" {
579                         diag.warnf("skipping, no vm available")
580                         return nil
581                 }
582                 webshelltermurl := cluster.Services.Workbench1.ExternalURL.String() + "virtual_machines/" + vm.UUID + "/webshell/testusername"
583                 diag.debugf("url %s", webshelltermurl)
584                 req, err := http.NewRequestWithContext(ctx, "GET", webshelltermurl, nil)
585                 if err != nil {
586                         return err
587                 }
588                 req.Header.Set("Authorization", "Bearer "+client.AuthToken)
589                 resp, err := http.DefaultClient.Do(req)
590                 if err != nil {
591                         return err
592                 }
593                 defer resp.Body.Close()
594                 body, err := ioutil.ReadAll(resp.Body)
595                 if err != nil {
596                         return fmt.Errorf("reading response: %s", err)
597                 }
598                 if resp.StatusCode != http.StatusOK {
599                         return fmt.Errorf("unexpected response status: %s %q", resp.Status, body)
600                 }
601                 return nil
602         })
603
604         diag.dotest(150, "connecting to webshell service", func() error {
605                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
606                 defer cancel()
607                 if vm.UUID == "" {
608                         diag.warnf("skipping, no vm available")
609                         return nil
610                 }
611                 u := cluster.Services.WebShell.ExternalURL
612                 webshellurl := u.String() + vm.Hostname + "?"
613                 if strings.HasPrefix(u.Host, "*") {
614                         u.Host = vm.Hostname + u.Host[1:]
615                         webshellurl = u.String() + "?"
616                 }
617                 diag.debugf("url %s", webshellurl)
618                 req, err := http.NewRequestWithContext(ctx, "POST", webshellurl, bytes.NewBufferString(url.Values{
619                         "width":   {"80"},
620                         "height":  {"25"},
621                         "session": {"xyzzy"},
622                         "rooturl": {webshellurl},
623                 }.Encode()))
624                 if err != nil {
625                         return err
626                 }
627                 req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
628                 resp, err := http.DefaultClient.Do(req)
629                 if err != nil {
630                         return err
631                 }
632                 defer resp.Body.Close()
633                 diag.debugf("response status %s", resp.Status)
634                 body, err := ioutil.ReadAll(resp.Body)
635                 if err != nil {
636                         return fmt.Errorf("reading response: %s", err)
637                 }
638                 diag.debugf("response body %q", body)
639                 // We don't speak the protocol, so we get a 400 error
640                 // from the webshell server even if everything is
641                 // OK. Anything else (404, 502, ???) indicates a
642                 // problem.
643                 if resp.StatusCode != http.StatusBadRequest {
644                         return fmt.Errorf("unexpected response status: %s, %q", resp.Status, body)
645                 }
646                 return nil
647         })
648
649         diag.dotest(160, "running a container", func() error {
650                 if diag.priority < 1 {
651                         diag.infof("skipping (use priority > 0 if you want to run a container)")
652                         return nil
653                 }
654                 if project.UUID == "" {
655                         return fmt.Errorf("skipping, no project to work in")
656                 }
657
658                 timestamp := time.Now().Format(time.RFC3339)
659                 ctrCommand := []string{"echo", timestamp}
660                 if diag.dockerImage == "" {
661                         if collection.UUID == "" {
662                                 return fmt.Errorf("skipping, no test collection to use as docker image")
663                         }
664                         diag.dockerImage = collection.PortableDataHash
665                         ctrCommand = []string{"/hello"}
666                 }
667
668                 var cr arvados.ContainerRequest
669                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
670                 defer cancel()
671
672                 err := client.RequestAndDecodeContext(ctx, &cr, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{"container_request": map[string]interface{}{
673                         "owner_uuid":      project.UUID,
674                         "name":            fmt.Sprintf("diagnostics container request %s", timestamp),
675                         "container_image": diag.dockerImage,
676                         "command":         ctrCommand,
677                         "use_existing":    false,
678                         "output_path":     "/mnt/output",
679                         "output_name":     fmt.Sprintf("diagnostics output %s", timestamp),
680                         "priority":        diag.priority,
681                         "state":           arvados.ContainerRequestStateCommitted,
682                         "mounts": map[string]map[string]interface{}{
683                                 "/mnt/output": {
684                                         "kind":     "collection",
685                                         "writable": true,
686                                 },
687                         },
688                         "runtime_constraints": arvados.RuntimeConstraints{
689                                 VCPUs:        1,
690                                 RAM:          1 << 26,
691                                 KeepCacheRAM: 1 << 26,
692                         },
693                 }})
694                 if err != nil {
695                         return err
696                 }
697                 diag.verbosef("container request uuid = %s", cr.UUID)
698                 diag.verbosef("container uuid = %s", cr.ContainerUUID)
699
700                 timeout := 10 * time.Minute
701                 diag.infof("container request submitted, waiting up to %v for container to run", arvados.Duration(timeout))
702                 ctx, cancel = context.WithDeadline(context.Background(), time.Now().Add(timeout))
703                 defer cancel()
704
705                 var c arvados.Container
706                 for ; cr.State != arvados.ContainerRequestStateFinal; time.Sleep(2 * time.Second) {
707                         ctx, cancel := context.WithDeadline(ctx, time.Now().Add(diag.timeout))
708                         defer cancel()
709
710                         crStateWas := cr.State
711                         err := client.RequestAndDecodeContext(ctx, &cr, "GET", "arvados/v1/container_requests/"+cr.UUID, nil, nil)
712                         if err != nil {
713                                 return err
714                         }
715                         if cr.State != crStateWas {
716                                 diag.debugf("container request state = %s", cr.State)
717                         }
718
719                         cStateWas := c.State
720                         err = client.RequestAndDecodeContext(ctx, &c, "GET", "arvados/v1/containers/"+cr.ContainerUUID, nil, nil)
721                         if err != nil {
722                                 return err
723                         }
724                         if c.State != cStateWas {
725                                 diag.debugf("container state = %s", c.State)
726                         }
727                 }
728
729                 if c.State != arvados.ContainerStateComplete {
730                         return fmt.Errorf("container request %s is final but container %s did not complete: container state = %q", cr.UUID, cr.ContainerUUID, c.State)
731                 } else if c.ExitCode != 0 {
732                         return fmt.Errorf("container exited %d", c.ExitCode)
733                 }
734                 return nil
735         })
736 }