20083: Track both loadedPDH and savedPDH for checking local changes.
[arvados.git] / lib / diagnostics / cmd.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package diagnostics
6
7 import (
8         "archive/tar"
9         "bytes"
10         "context"
11         _ "embed"
12         "flag"
13         "fmt"
14         "io"
15         "io/ioutil"
16         "net"
17         "net/http"
18         "net/url"
19         "os"
20         "strings"
21         "time"
22
23         "git.arvados.org/arvados.git/lib/cmd"
24         "git.arvados.org/arvados.git/lib/config"
25         "git.arvados.org/arvados.git/sdk/go/arvados"
26         "git.arvados.org/arvados.git/sdk/go/ctxlog"
27         "git.arvados.org/arvados.git/sdk/go/health"
28         "github.com/sirupsen/logrus"
29 )
30
31 type Command struct{}
32
33 func (Command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
34         var diag diagnoser
35         f := flag.NewFlagSet(prog, flag.ContinueOnError)
36         f.StringVar(&diag.projectName, "project-name", "scratch area for diagnostics", "name of project to find/create in home project and use for temporary/test objects")
37         f.StringVar(&diag.logLevel, "log-level", "info", "logging level (debug, info, warning, error)")
38         f.StringVar(&diag.dockerImage, "docker-image", "", "image to use when running a test container (default: use embedded hello-world image)")
39         f.BoolVar(&diag.checkInternal, "internal-client", false, "check that this host is considered an \"internal\" client")
40         f.BoolVar(&diag.checkExternal, "external-client", false, "check that this host is considered an \"external\" client")
41         f.BoolVar(&diag.verbose, "v", false, "verbose: include more information in report")
42         f.IntVar(&diag.priority, "priority", 500, "priority for test container (1..1000, or 0 to skip)")
43         f.DurationVar(&diag.timeout, "timeout", 10*time.Second, "timeout for http requests")
44         if ok, code := cmd.ParseFlags(f, prog, args, "", stderr); !ok {
45                 return code
46         }
47         diag.logger = ctxlog.New(stdout, "text", diag.logLevel)
48         diag.logger.SetFormatter(&logrus.TextFormatter{DisableTimestamp: true, DisableLevelTruncation: true, PadLevelText: true})
49         diag.runtests()
50         if len(diag.errors) == 0 {
51                 diag.logger.Info("--- no errors ---")
52                 return 0
53         } else {
54                 if diag.logger.Level > logrus.ErrorLevel {
55                         fmt.Fprint(stdout, "\n--- cut here --- error summary ---\n\n")
56                         for _, e := range diag.errors {
57                                 diag.logger.Error(e)
58                         }
59                 }
60                 return 1
61         }
62 }
63
64 // docker save hello-world > hello-world.tar
65 //
66 //go:embed hello-world.tar
67 var HelloWorldDockerImage []byte
68
69 type diagnoser struct {
70         stdout        io.Writer
71         stderr        io.Writer
72         logLevel      string
73         priority      int
74         projectName   string
75         dockerImage   string
76         checkInternal bool
77         checkExternal bool
78         verbose       bool
79         timeout       time.Duration
80         logger        *logrus.Logger
81         errors        []string
82         done          map[int]bool
83 }
84
85 func (diag *diagnoser) debugf(f string, args ...interface{}) {
86         diag.logger.Debugf("  ... "+f, args...)
87 }
88
89 func (diag *diagnoser) infof(f string, args ...interface{}) {
90         diag.logger.Infof("  ... "+f, args...)
91 }
92
93 func (diag *diagnoser) verbosef(f string, args ...interface{}) {
94         if diag.verbose {
95                 diag.logger.Infof("  ... "+f, args...)
96         }
97 }
98
99 func (diag *diagnoser) warnf(f string, args ...interface{}) {
100         diag.logger.Warnf("  ... "+f, args...)
101 }
102
103 func (diag *diagnoser) errorf(f string, args ...interface{}) {
104         diag.logger.Errorf(f, args...)
105         diag.errors = append(diag.errors, fmt.Sprintf(f, args...))
106 }
107
108 // Run the given func, logging appropriate messages before and after,
109 // adding timing info, etc.
110 //
111 // The id argument should be unique among tests, and shouldn't change
112 // when other tests are added/removed.
113 func (diag *diagnoser) dotest(id int, title string, fn func() error) {
114         if diag.done == nil {
115                 diag.done = map[int]bool{}
116         } else if diag.done[id] {
117                 diag.errorf("(bug) reused test id %d", id)
118         }
119         diag.done[id] = true
120
121         diag.logger.Infof("%4d: %s", id, title)
122         t0 := time.Now()
123         err := fn()
124         elapsed := fmt.Sprintf("%d ms", time.Now().Sub(t0)/time.Millisecond)
125         if err != nil {
126                 diag.errorf("%4d: %s (%s): %s", id, title, elapsed, err)
127         } else {
128                 diag.logger.Debugf("%4d: %s (%s): ok", id, title, elapsed)
129         }
130 }
131
132 func (diag *diagnoser) runtests() {
133         client := arvados.NewClientFromEnv()
134
135         if client.APIHost == "" || client.AuthToken == "" {
136                 diag.errorf("ARVADOS_API_HOST and ARVADOS_API_TOKEN environment variables are not set -- aborting without running any tests")
137                 return
138         }
139
140         hostname, err := os.Hostname()
141         if err != nil {
142                 diag.warnf("error getting hostname: %s")
143         } else {
144                 diag.verbosef("hostname = %s", hostname)
145         }
146
147         diag.dotest(5, "running health check (same as `arvados-server check`)", func() error {
148                 ldr := config.NewLoader(&bytes.Buffer{}, ctxlog.New(&bytes.Buffer{}, "text", "info"))
149                 ldr.SetupFlags(flag.NewFlagSet("diagnostics", flag.ContinueOnError))
150                 cfg, err := ldr.Load()
151                 if err != nil {
152                         diag.infof("skipping because config could not be loaded: %s", err)
153                         return nil
154                 }
155                 cluster, err := cfg.GetCluster("")
156                 if err != nil {
157                         return err
158                 }
159                 if cluster.SystemRootToken != os.Getenv("ARVADOS_API_TOKEN") {
160                         return fmt.Errorf("diagnostics usage error: %s is readable but SystemRootToken does not match $ARVADOS_API_TOKEN (to fix, either run 'arvados-client sudo diagnostics' to load everything from config file, or set ARVADOS_CONFIG=- to load nothing from config file)", ldr.Path)
161                 }
162                 agg := &health.Aggregator{Cluster: cluster}
163                 resp := agg.ClusterHealth()
164                 for _, e := range resp.Errors {
165                         diag.errorf("health check: %s", e)
166                 }
167                 if len(resp.Errors) > 0 {
168                         diag.infof("consider running `arvados-server check -yaml` for a comprehensive report")
169                 }
170                 diag.verbosef("reported clock skew = %v", resp.ClockSkew)
171                 reported := map[string]bool{}
172                 for _, result := range resp.Checks {
173                         version := strings.SplitN(result.Metrics.Version, " (go", 2)[0]
174                         if version != "" && !reported[version] {
175                                 diag.verbosef("arvados version = %s", version)
176                                 reported[version] = true
177                         }
178                 }
179                 reported = map[string]bool{}
180                 for _, result := range resp.Checks {
181                         if result.Server != "" && !reported[result.Server] {
182                                 diag.verbosef("http frontend version = %s", result.Server)
183                                 reported[result.Server] = true
184                         }
185                 }
186                 reported = map[string]bool{}
187                 for _, result := range resp.Checks {
188                         if sha := result.ConfigSourceSHA256; sha != "" && !reported[sha] {
189                                 diag.verbosef("config file sha256 = %s", sha)
190                                 reported[sha] = true
191                         }
192                 }
193                 return nil
194         })
195
196         var dd arvados.DiscoveryDocument
197         ddpath := "discovery/v1/apis/arvados/v1/rest"
198         diag.dotest(10, fmt.Sprintf("getting discovery document from https://%s/%s", client.APIHost, ddpath), func() error {
199                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
200                 defer cancel()
201                 err := client.RequestAndDecodeContext(ctx, &dd, "GET", ddpath, nil, nil)
202                 if err != nil {
203                         return err
204                 }
205                 diag.verbosef("BlobSignatureTTL = %d", dd.BlobSignatureTTL)
206                 return nil
207         })
208
209         var cluster arvados.Cluster
210         cfgpath := "arvados/v1/config"
211         cfgOK := false
212         diag.dotest(20, fmt.Sprintf("getting exported config from https://%s/%s", client.APIHost, cfgpath), func() error {
213                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
214                 defer cancel()
215                 err := client.RequestAndDecodeContext(ctx, &cluster, "GET", cfgpath, nil, nil)
216                 if err != nil {
217                         return err
218                 }
219                 diag.verbosef("Collections.BlobSigning = %v", cluster.Collections.BlobSigning)
220                 cfgOK = true
221                 return nil
222         })
223
224         var user arvados.User
225         diag.dotest(30, "getting current user record", func() error {
226                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
227                 defer cancel()
228                 err := client.RequestAndDecodeContext(ctx, &user, "GET", "arvados/v1/users/current", nil, nil)
229                 if err != nil {
230                         return err
231                 }
232                 diag.verbosef("user uuid = %s", user.UUID)
233                 return nil
234         })
235
236         if !cfgOK {
237                 diag.errorf("cannot proceed without cluster config -- aborting without running any further tests")
238                 return
239         }
240
241         // uncomment to create some spurious errors
242         // cluster.Services.WebDAVDownload.ExternalURL.Host = "0.0.0.0:9"
243
244         // TODO: detect routing errors here, like finding wb2 at the
245         // wb1 address.
246         for i, svc := range []*arvados.Service{
247                 &cluster.Services.Keepproxy,
248                 &cluster.Services.WebDAV,
249                 &cluster.Services.WebDAVDownload,
250                 &cluster.Services.Websocket,
251                 &cluster.Services.Workbench1,
252                 &cluster.Services.Workbench2,
253         } {
254                 diag.dotest(40+i, fmt.Sprintf("connecting to service endpoint %s", svc.ExternalURL), func() error {
255                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
256                         defer cancel()
257                         u := svc.ExternalURL
258                         if strings.HasPrefix(u.Scheme, "ws") {
259                                 // We can do a real websocket test elsewhere,
260                                 // but for now we'll just check the https
261                                 // connection.
262                                 u.Scheme = "http" + u.Scheme[2:]
263                         }
264                         if svc == &cluster.Services.WebDAV && strings.HasPrefix(u.Host, "*") {
265                                 u.Host = "d41d8cd98f00b204e9800998ecf8427e-0" + u.Host[1:]
266                         }
267                         req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
268                         if err != nil {
269                                 return err
270                         }
271                         resp, err := http.DefaultClient.Do(req)
272                         if err != nil {
273                                 return err
274                         }
275                         resp.Body.Close()
276                         return nil
277                 })
278         }
279
280         for i, url := range []string{
281                 cluster.Services.Controller.ExternalURL.String(),
282                 cluster.Services.Keepproxy.ExternalURL.String() + "d41d8cd98f00b204e9800998ecf8427e+0",
283                 cluster.Services.WebDAVDownload.ExternalURL.String(),
284         } {
285                 diag.dotest(50+i, fmt.Sprintf("checking CORS headers at %s", url), func() error {
286                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
287                         defer cancel()
288                         req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
289                         if err != nil {
290                                 return err
291                         }
292                         req.Header.Set("Origin", "https://example.com")
293                         resp, err := http.DefaultClient.Do(req)
294                         if err != nil {
295                                 return err
296                         }
297                         if hdr := resp.Header.Get("Access-Control-Allow-Origin"); hdr != "*" {
298                                 return fmt.Errorf("expected \"Access-Control-Allow-Origin: *\", got %q", hdr)
299                         }
300                         return nil
301                 })
302         }
303
304         var keeplist arvados.KeepServiceList
305         diag.dotest(60, "checking internal/external client detection", func() error {
306                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
307                 defer cancel()
308                 err := client.RequestAndDecodeContext(ctx, &keeplist, "GET", "arvados/v1/keep_services/accessible", nil, arvados.ListOptions{Limit: 999999})
309                 if err != nil {
310                         return fmt.Errorf("error getting keep services list: %s", err)
311                 } else if len(keeplist.Items) == 0 {
312                         return fmt.Errorf("controller did not return any keep services")
313                 }
314                 found := map[string]int{}
315                 for _, ks := range keeplist.Items {
316                         found[ks.ServiceType]++
317                 }
318                 isInternal := found["proxy"] == 0 && len(keeplist.Items) > 0
319                 isExternal := found["proxy"] > 0 && found["proxy"] == len(keeplist.Items)
320                 if isExternal {
321                         diag.infof("controller returned only proxy services, this host is treated as \"external\"")
322                 } else if isInternal {
323                         diag.infof("controller returned only non-proxy services, this host is treated as \"internal\"")
324                 }
325                 if (diag.checkInternal && !isInternal) || (diag.checkExternal && !isExternal) {
326                         return fmt.Errorf("expecting internal=%v external=%v, but found internal=%v external=%v", diag.checkInternal, diag.checkExternal, isInternal, isExternal)
327                 }
328                 return nil
329         })
330
331         for i, ks := range keeplist.Items {
332                 u := url.URL{
333                         Scheme: "http",
334                         Host:   net.JoinHostPort(ks.ServiceHost, fmt.Sprintf("%d", ks.ServicePort)),
335                         Path:   "/",
336                 }
337                 if ks.ServiceSSLFlag {
338                         u.Scheme = "https"
339                 }
340                 diag.dotest(61+i, fmt.Sprintf("reading+writing via keep service at %s", u.String()), func() error {
341                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
342                         defer cancel()
343                         req, err := http.NewRequestWithContext(ctx, "PUT", u.String()+"d41d8cd98f00b204e9800998ecf8427e", nil)
344                         if err != nil {
345                                 return err
346                         }
347                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
348                         resp, err := http.DefaultClient.Do(req)
349                         if err != nil {
350                                 return err
351                         }
352                         defer resp.Body.Close()
353                         body, err := ioutil.ReadAll(resp.Body)
354                         if err != nil {
355                                 return fmt.Errorf("reading response body: %s", err)
356                         }
357                         loc := strings.TrimSpace(string(body))
358                         if !strings.HasPrefix(loc, "d41d8") {
359                                 return fmt.Errorf("unexpected response from write: %q", body)
360                         }
361
362                         req, err = http.NewRequestWithContext(ctx, "GET", u.String()+loc, nil)
363                         if err != nil {
364                                 return err
365                         }
366                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
367                         resp, err = http.DefaultClient.Do(req)
368                         if err != nil {
369                                 return err
370                         }
371                         defer resp.Body.Close()
372                         body, err = ioutil.ReadAll(resp.Body)
373                         if err != nil {
374                                 return fmt.Errorf("reading response body: %s", err)
375                         }
376                         if len(body) != 0 {
377                                 return fmt.Errorf("unexpected response from read: %q", body)
378                         }
379
380                         return nil
381                 })
382         }
383
384         var project arvados.Group
385         diag.dotest(80, fmt.Sprintf("finding/creating %q project", diag.projectName), func() error {
386                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
387                 defer cancel()
388                 var grplist arvados.GroupList
389                 err := client.RequestAndDecodeContext(ctx, &grplist, "GET", "arvados/v1/groups", nil, arvados.ListOptions{
390                         Filters: []arvados.Filter{
391                                 {"name", "=", diag.projectName},
392                                 {"group_class", "=", "project"},
393                                 {"owner_uuid", "=", user.UUID}},
394                         Limit: 999999})
395                 if err != nil {
396                         return fmt.Errorf("list groups: %s", err)
397                 }
398                 if len(grplist.Items) > 0 {
399                         project = grplist.Items[0]
400                         diag.verbosef("using existing project, uuid = %s", project.UUID)
401                         return nil
402                 }
403                 diag.debugf("list groups: ok, no results")
404                 err = client.RequestAndDecodeContext(ctx, &project, "POST", "arvados/v1/groups", nil, map[string]interface{}{"group": map[string]interface{}{
405                         "name":        diag.projectName,
406                         "group_class": "project",
407                 }})
408                 if err != nil {
409                         return fmt.Errorf("create project: %s", err)
410                 }
411                 diag.verbosef("created project, uuid = %s", project.UUID)
412                 return nil
413         })
414
415         var collection arvados.Collection
416         diag.dotest(90, "creating temporary collection", func() error {
417                 if project.UUID == "" {
418                         return fmt.Errorf("skipping, no project to work in")
419                 }
420                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
421                 defer cancel()
422                 err := client.RequestAndDecodeContext(ctx, &collection, "POST", "arvados/v1/collections", nil, map[string]interface{}{
423                         "ensure_unique_name": true,
424                         "collection": map[string]interface{}{
425                                 "owner_uuid": project.UUID,
426                                 "name":       "test collection",
427                                 "trash_at":   time.Now().Add(time.Hour)}})
428                 if err != nil {
429                         return err
430                 }
431                 diag.verbosef("ok, uuid = %s", collection.UUID)
432                 return nil
433         })
434
435         if collection.UUID != "" {
436                 defer func() {
437                         diag.dotest(9990, "deleting temporary collection", func() error {
438                                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
439                                 defer cancel()
440                                 return client.RequestAndDecodeContext(ctx, nil, "DELETE", "arvados/v1/collections/"+collection.UUID, nil, nil)
441                         })
442                 }()
443         }
444
445         // Read hello-world.tar to find image ID, so we can upload it
446         // as "sha256:{...}.tar"
447         var imageSHA2 string
448         {
449                 tr := tar.NewReader(bytes.NewReader(HelloWorldDockerImage))
450                 for {
451                         hdr, err := tr.Next()
452                         if err == io.EOF {
453                                 break
454                         }
455                         if err != nil {
456                                 diag.errorf("internal error/bug: cannot read embedded docker image tar file: %s", err)
457                                 return
458                         }
459                         if s := strings.TrimSuffix(hdr.Name, ".json"); len(s) == 64 && s != hdr.Name {
460                                 imageSHA2 = s
461                         }
462                 }
463                 if imageSHA2 == "" {
464                         diag.errorf("internal error/bug: cannot find {sha256}.json file in embedded docker image tar file")
465                         return
466                 }
467         }
468         tarfilename := "sha256:" + imageSHA2 + ".tar"
469
470         diag.dotest(100, "uploading file via webdav", func() error {
471                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
472                 defer cancel()
473                 if collection.UUID == "" {
474                         return fmt.Errorf("skipping, no test collection")
475                 }
476                 req, err := http.NewRequestWithContext(ctx, "PUT", cluster.Services.WebDAVDownload.ExternalURL.String()+"c="+collection.UUID+"/"+tarfilename, bytes.NewReader(HelloWorldDockerImage))
477                 if err != nil {
478                         return fmt.Errorf("BUG? http.NewRequest: %s", err)
479                 }
480                 req.Header.Set("Authorization", "Bearer "+client.AuthToken)
481                 resp, err := http.DefaultClient.Do(req)
482                 if err != nil {
483                         return fmt.Errorf("error performing http request: %s", err)
484                 }
485                 resp.Body.Close()
486                 if resp.StatusCode != http.StatusCreated {
487                         return fmt.Errorf("status %s", resp.Status)
488                 }
489                 diag.debugf("ok, status %s", resp.Status)
490                 err = client.RequestAndDecodeContext(ctx, &collection, "GET", "arvados/v1/collections/"+collection.UUID, nil, nil)
491                 if err != nil {
492                         return fmt.Errorf("get updated collection: %s", err)
493                 }
494                 diag.debugf("ok, pdh %s", collection.PortableDataHash)
495                 return nil
496         })
497
498         davurl := cluster.Services.WebDAV.ExternalURL
499         davWildcard := strings.HasPrefix(davurl.Host, "*--") || strings.HasPrefix(davurl.Host, "*.")
500         diag.dotest(110, fmt.Sprintf("checking WebDAV ExternalURL wildcard (%s)", davurl), func() error {
501                 if davurl.Host == "" {
502                         return fmt.Errorf("host missing - content previews will not work")
503                 }
504                 if !davWildcard && !cluster.Collections.TrustAllContent {
505                         diag.warnf("WebDAV ExternalURL has no leading wildcard and TrustAllContent==false - content previews will not work")
506                 }
507                 return nil
508         })
509
510         for i, trial := range []struct {
511                 needcoll     bool
512                 needWildcard bool
513                 status       int
514                 fileurl      string
515         }{
516                 {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + "foo"},
517                 {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + tarfilename},
518                 {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/foo"},
519                 {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/" + tarfilename},
520                 {true, true, http.StatusOK, strings.Replace(davurl.String(), "*", strings.Replace(collection.PortableDataHash, "+", "-", -1), 1) + tarfilename},
521                 {true, false, http.StatusOK, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=" + collection.UUID + "/_/" + tarfilename},
522         } {
523                 diag.dotest(120+i, fmt.Sprintf("downloading from webdav (%s)", trial.fileurl), func() error {
524                         if trial.needWildcard && !davWildcard {
525                                 diag.warnf("skipping collection-id-in-vhost test because WebDAV ExternalURL has no leading wildcard")
526                                 return nil
527                         }
528                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
529                         defer cancel()
530                         if trial.needcoll && collection.UUID == "" {
531                                 return fmt.Errorf("skipping, no test collection")
532                         }
533                         req, err := http.NewRequestWithContext(ctx, "GET", trial.fileurl, nil)
534                         if err != nil {
535                                 return err
536                         }
537                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
538                         resp, err := http.DefaultClient.Do(req)
539                         if err != nil {
540                                 return err
541                         }
542                         defer resp.Body.Close()
543                         body, err := ioutil.ReadAll(resp.Body)
544                         if err != nil {
545                                 return fmt.Errorf("reading response: %s", err)
546                         }
547                         if resp.StatusCode != trial.status {
548                                 return fmt.Errorf("unexpected response status: %s", resp.Status)
549                         }
550                         if trial.status == http.StatusOK && !bytes.Equal(body, HelloWorldDockerImage) {
551                                 excerpt := body
552                                 if len(excerpt) > 128 {
553                                         excerpt = append([]byte(nil), body[:128]...)
554                                         excerpt = append(excerpt, []byte("[...]")...)
555                                 }
556                                 return fmt.Errorf("unexpected response content: len %d, %q", len(body), excerpt)
557                         }
558                         return nil
559                 })
560         }
561
562         var vm arvados.VirtualMachine
563         diag.dotest(130, "getting list of virtual machines", func() error {
564                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
565                 defer cancel()
566                 var vmlist arvados.VirtualMachineList
567                 err := client.RequestAndDecodeContext(ctx, &vmlist, "GET", "arvados/v1/virtual_machines", nil, arvados.ListOptions{Limit: 999999})
568                 if err != nil {
569                         return err
570                 }
571                 if len(vmlist.Items) < 1 {
572                         diag.warnf("no VMs found")
573                 } else {
574                         vm = vmlist.Items[0]
575                 }
576                 return nil
577         })
578
579         diag.dotest(140, "getting workbench1 webshell page", func() error {
580                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
581                 defer cancel()
582                 if vm.UUID == "" {
583                         diag.warnf("skipping, no vm available")
584                         return nil
585                 }
586                 webshelltermurl := cluster.Services.Workbench1.ExternalURL.String() + "virtual_machines/" + vm.UUID + "/webshell/testusername"
587                 diag.debugf("url %s", webshelltermurl)
588                 req, err := http.NewRequestWithContext(ctx, "GET", webshelltermurl, nil)
589                 if err != nil {
590                         return err
591                 }
592                 req.Header.Set("Authorization", "Bearer "+client.AuthToken)
593                 resp, err := http.DefaultClient.Do(req)
594                 if err != nil {
595                         return err
596                 }
597                 defer resp.Body.Close()
598                 body, err := ioutil.ReadAll(resp.Body)
599                 if err != nil {
600                         return fmt.Errorf("reading response: %s", err)
601                 }
602                 if resp.StatusCode != http.StatusOK {
603                         return fmt.Errorf("unexpected response status: %s %q", resp.Status, body)
604                 }
605                 return nil
606         })
607
608         diag.dotest(150, "connecting to webshell service", func() error {
609                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
610                 defer cancel()
611                 if vm.UUID == "" {
612                         diag.warnf("skipping, no vm available")
613                         return nil
614                 }
615                 u := cluster.Services.WebShell.ExternalURL
616                 webshellurl := u.String() + vm.Hostname + "?"
617                 if strings.HasPrefix(u.Host, "*") {
618                         u.Host = vm.Hostname + u.Host[1:]
619                         webshellurl = u.String() + "?"
620                 }
621                 diag.debugf("url %s", webshellurl)
622                 req, err := http.NewRequestWithContext(ctx, "POST", webshellurl, bytes.NewBufferString(url.Values{
623                         "width":   {"80"},
624                         "height":  {"25"},
625                         "session": {"xyzzy"},
626                         "rooturl": {webshellurl},
627                 }.Encode()))
628                 if err != nil {
629                         return err
630                 }
631                 req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
632                 resp, err := http.DefaultClient.Do(req)
633                 if err != nil {
634                         return err
635                 }
636                 defer resp.Body.Close()
637                 diag.debugf("response status %s", resp.Status)
638                 body, err := ioutil.ReadAll(resp.Body)
639                 if err != nil {
640                         return fmt.Errorf("reading response: %s", err)
641                 }
642                 diag.debugf("response body %q", body)
643                 // We don't speak the protocol, so we get a 400 error
644                 // from the webshell server even if everything is
645                 // OK. Anything else (404, 502, ???) indicates a
646                 // problem.
647                 if resp.StatusCode != http.StatusBadRequest {
648                         return fmt.Errorf("unexpected response status: %s, %q", resp.Status, body)
649                 }
650                 return nil
651         })
652
653         diag.dotest(160, "running a container", func() error {
654                 if diag.priority < 1 {
655                         diag.infof("skipping (use priority > 0 if you want to run a container)")
656                         return nil
657                 }
658                 if project.UUID == "" {
659                         return fmt.Errorf("skipping, no project to work in")
660                 }
661
662                 timestamp := time.Now().Format(time.RFC3339)
663                 ctrCommand := []string{"echo", timestamp}
664                 if diag.dockerImage == "" {
665                         if collection.UUID == "" {
666                                 return fmt.Errorf("skipping, no test collection to use as docker image")
667                         }
668                         diag.dockerImage = collection.PortableDataHash
669                         ctrCommand = []string{"/hello"}
670                 }
671
672                 var cr arvados.ContainerRequest
673                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
674                 defer cancel()
675
676                 err := client.RequestAndDecodeContext(ctx, &cr, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{"container_request": map[string]interface{}{
677                         "owner_uuid":      project.UUID,
678                         "name":            fmt.Sprintf("diagnostics container request %s", timestamp),
679                         "container_image": diag.dockerImage,
680                         "command":         ctrCommand,
681                         "use_existing":    false,
682                         "output_path":     "/mnt/output",
683                         "output_name":     fmt.Sprintf("diagnostics output %s", timestamp),
684                         "priority":        diag.priority,
685                         "state":           arvados.ContainerRequestStateCommitted,
686                         "mounts": map[string]map[string]interface{}{
687                                 "/mnt/output": {
688                                         "kind":     "collection",
689                                         "writable": true,
690                                 },
691                         },
692                         "runtime_constraints": arvados.RuntimeConstraints{
693                                 VCPUs:        1,
694                                 RAM:          1 << 26,
695                                 KeepCacheRAM: 1 << 26,
696                         },
697                 }})
698                 if err != nil {
699                         return err
700                 }
701                 diag.verbosef("container request uuid = %s", cr.UUID)
702                 diag.verbosef("container uuid = %s", cr.ContainerUUID)
703
704                 timeout := 10 * time.Minute
705                 diag.infof("container request submitted, waiting up to %v for container to run", arvados.Duration(timeout))
706                 deadline := time.Now().Add(timeout)
707
708                 var c arvados.Container
709                 for ; cr.State != arvados.ContainerRequestStateFinal && time.Now().Before(deadline); time.Sleep(2 * time.Second) {
710                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
711                         defer cancel()
712
713                         crStateWas := cr.State
714                         err := client.RequestAndDecodeContext(ctx, &cr, "GET", "arvados/v1/container_requests/"+cr.UUID, nil, nil)
715                         if err != nil {
716                                 return err
717                         }
718                         if cr.State != crStateWas {
719                                 diag.debugf("container request state = %s", cr.State)
720                         }
721
722                         cStateWas := c.State
723                         err = client.RequestAndDecodeContext(ctx, &c, "GET", "arvados/v1/containers/"+cr.ContainerUUID, nil, nil)
724                         if err != nil {
725                                 return err
726                         }
727                         if c.State != cStateWas {
728                                 diag.debugf("container state = %s", c.State)
729                         }
730
731                         cancel()
732                 }
733
734                 if cr.State != arvados.ContainerRequestStateFinal {
735                         err := client.RequestAndDecodeContext(context.Background(), &cr, "PATCH", "arvados/v1/container_requests/"+cr.UUID, nil, map[string]interface{}{
736                                 "container_request": map[string]interface{}{
737                                         "priority": 0,
738                                 }})
739                         if err != nil {
740                                 diag.infof("error canceling container request %s: %s", cr.UUID, err)
741                         } else {
742                                 diag.debugf("canceled container request %s", cr.UUID)
743                         }
744                         return fmt.Errorf("timed out waiting for container to finish; container request %s state was %q, container %s state was %q", cr.UUID, cr.State, c.UUID, c.State)
745                 }
746                 if c.State != arvados.ContainerStateComplete {
747                         return fmt.Errorf("container request %s is final but container %s did not complete: container state = %q", cr.UUID, cr.ContainerUUID, c.State)
748                 }
749                 if c.ExitCode != 0 {
750                         return fmt.Errorf("container exited %d", c.ExitCode)
751                 }
752                 return nil
753         })
754 }