20612: Run diagnostics from inside the test container too.
[arvados.git] / lib / diagnostics / cmd.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package diagnostics
6
7 import (
8         "archive/tar"
9         "bytes"
10         "context"
11         "crypto/sha256"
12         _ "embed"
13         "flag"
14         "fmt"
15         "io"
16         "io/ioutil"
17         "net"
18         "net/http"
19         "net/url"
20         "os"
21         "os/exec"
22         "strings"
23         "time"
24
25         "git.arvados.org/arvados.git/lib/cmd"
26         "git.arvados.org/arvados.git/lib/config"
27         "git.arvados.org/arvados.git/sdk/go/arvados"
28         "git.arvados.org/arvados.git/sdk/go/ctxlog"
29         "git.arvados.org/arvados.git/sdk/go/health"
30         "github.com/sirupsen/logrus"
31 )
32
33 type Command struct{}
34
35 func (Command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
36         var diag diagnoser
37         f := flag.NewFlagSet(prog, flag.ContinueOnError)
38         f.StringVar(&diag.projectName, "project-name", "scratch area for diagnostics", "`name` of project to find/create in home project and use for temporary/test objects")
39         f.StringVar(&diag.logLevel, "log-level", "info", "logging `level` (debug, info, warning, error)")
40         f.StringVar(&diag.dockerImage, "docker-image", "", "`image` (tag or portable data hash) to use when running a test container, or \"hello-world\" to use embedded hello-world image (default: build a custom image containing this executable, and run diagnostics inside the container too)")
41         f.StringVar(&diag.dockerImageFrom, "docker-image-from", "debian:stable-slim", "`base` image to use when building a custom image (use a debian-based image similar this host's OS for best results)")
42         f.BoolVar(&diag.checkInternal, "internal-client", false, "check that this host is considered an \"internal\" client")
43         f.BoolVar(&diag.checkExternal, "external-client", false, "check that this host is considered an \"external\" client")
44         f.BoolVar(&diag.verbose, "v", false, "verbose: include more information in report")
45         f.IntVar(&diag.priority, "priority", 500, "priority for test container (1..1000, or 0 to skip)")
46         f.DurationVar(&diag.timeout, "timeout", 10*time.Second, "timeout for http requests")
47         if ok, code := cmd.ParseFlags(f, prog, args, "", stderr); !ok {
48                 return code
49         }
50         diag.stdout = stdout
51         diag.stderr = stderr
52         diag.logger = ctxlog.New(stdout, "text", diag.logLevel)
53         diag.logger.SetFormatter(&logrus.TextFormatter{DisableTimestamp: true, DisableLevelTruncation: true, PadLevelText: true})
54         diag.runtests()
55         if len(diag.errors) == 0 {
56                 diag.logger.Info("--- no errors ---")
57                 return 0
58         } else {
59                 if diag.logger.Level > logrus.ErrorLevel {
60                         fmt.Fprint(stdout, "\n--- cut here --- error summary ---\n\n")
61                         for _, e := range diag.errors {
62                                 diag.logger.Error(e)
63                         }
64                 }
65                 return 1
66         }
67 }
68
69 // docker save hello-world > hello-world.tar
70 //
71 //go:embed hello-world.tar
72 var HelloWorldDockerImage []byte
73
74 type diagnoser struct {
75         stdout          io.Writer
76         stderr          io.Writer
77         logLevel        string
78         priority        int
79         projectName     string
80         dockerImage     string
81         dockerImageFrom string
82         checkInternal   bool
83         checkExternal   bool
84         verbose         bool
85         timeout         time.Duration
86         logger          *logrus.Logger
87         errors          []string
88         done            map[int]bool
89 }
90
91 func (diag *diagnoser) debugf(f string, args ...interface{}) {
92         diag.logger.Debugf("  ... "+f, args...)
93 }
94
95 func (diag *diagnoser) infof(f string, args ...interface{}) {
96         diag.logger.Infof("  ... "+f, args...)
97 }
98
99 func (diag *diagnoser) verbosef(f string, args ...interface{}) {
100         if diag.verbose {
101                 diag.logger.Infof("  ... "+f, args...)
102         }
103 }
104
105 func (diag *diagnoser) warnf(f string, args ...interface{}) {
106         diag.logger.Warnf("  ... "+f, args...)
107 }
108
109 func (diag *diagnoser) errorf(f string, args ...interface{}) {
110         diag.logger.Errorf(f, args...)
111         diag.errors = append(diag.errors, fmt.Sprintf(f, args...))
112 }
113
114 // Run the given func, logging appropriate messages before and after,
115 // adding timing info, etc.
116 //
117 // The id argument should be unique among tests, and shouldn't change
118 // when other tests are added/removed.
119 func (diag *diagnoser) dotest(id int, title string, fn func() error) {
120         if diag.done == nil {
121                 diag.done = map[int]bool{}
122         } else if diag.done[id] {
123                 diag.errorf("(bug) reused test id %d", id)
124         }
125         diag.done[id] = true
126
127         diag.logger.Infof("%4d: %s", id, title)
128         t0 := time.Now()
129         err := fn()
130         elapsed := fmt.Sprintf("%d ms", time.Now().Sub(t0)/time.Millisecond)
131         if err != nil {
132                 diag.errorf("%4d: %s (%s): %s", id, title, elapsed, err)
133         } else {
134                 diag.logger.Debugf("%4d: %s (%s): ok", id, title, elapsed)
135         }
136 }
137
138 func (diag *diagnoser) runtests() {
139         client := arvados.NewClientFromEnv()
140         // Disable auto-retry, use context instead
141         client.Timeout = 0
142
143         if client.APIHost == "" || client.AuthToken == "" {
144                 diag.errorf("ARVADOS_API_HOST and ARVADOS_API_TOKEN environment variables are not set -- aborting without running any tests")
145                 return
146         }
147
148         hostname, err := os.Hostname()
149         if err != nil {
150                 diag.warnf("error getting hostname: %s")
151         } else {
152                 diag.verbosef("hostname = %s", hostname)
153         }
154
155         diag.dotest(5, "running health check (same as `arvados-server check`)", func() error {
156                 ldr := config.NewLoader(&bytes.Buffer{}, ctxlog.New(&bytes.Buffer{}, "text", "info"))
157                 ldr.SetupFlags(flag.NewFlagSet("diagnostics", flag.ContinueOnError))
158                 cfg, err := ldr.Load()
159                 if err != nil {
160                         diag.infof("skipping because config could not be loaded: %s", err)
161                         return nil
162                 }
163                 cluster, err := cfg.GetCluster("")
164                 if err != nil {
165                         return err
166                 }
167                 if cluster.SystemRootToken != os.Getenv("ARVADOS_API_TOKEN") {
168                         return fmt.Errorf("diagnostics usage error: %s is readable but SystemRootToken does not match $ARVADOS_API_TOKEN (to fix, either run 'arvados-client sudo diagnostics' to load everything from config file, or set ARVADOS_CONFIG=- to load nothing from config file)", ldr.Path)
169                 }
170                 agg := &health.Aggregator{Cluster: cluster}
171                 resp := agg.ClusterHealth()
172                 for _, e := range resp.Errors {
173                         diag.errorf("health check: %s", e)
174                 }
175                 if len(resp.Errors) > 0 {
176                         diag.infof("consider running `arvados-server check -yaml` for a comprehensive report")
177                 }
178                 diag.verbosef("reported clock skew = %v", resp.ClockSkew)
179                 reported := map[string]bool{}
180                 for _, result := range resp.Checks {
181                         version := strings.SplitN(result.Metrics.Version, " (go", 2)[0]
182                         if version != "" && !reported[version] {
183                                 diag.verbosef("arvados version = %s", version)
184                                 reported[version] = true
185                         }
186                 }
187                 reported = map[string]bool{}
188                 for _, result := range resp.Checks {
189                         if result.Server != "" && !reported[result.Server] {
190                                 diag.verbosef("http frontend version = %s", result.Server)
191                                 reported[result.Server] = true
192                         }
193                 }
194                 reported = map[string]bool{}
195                 for _, result := range resp.Checks {
196                         if sha := result.ConfigSourceSHA256; sha != "" && !reported[sha] {
197                                 diag.verbosef("config file sha256 = %s", sha)
198                                 reported[sha] = true
199                         }
200                 }
201                 return nil
202         })
203
204         var dd arvados.DiscoveryDocument
205         ddpath := "discovery/v1/apis/arvados/v1/rest"
206         diag.dotest(10, fmt.Sprintf("getting discovery document from https://%s/%s", client.APIHost, ddpath), func() error {
207                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
208                 defer cancel()
209                 err := client.RequestAndDecodeContext(ctx, &dd, "GET", ddpath, nil, nil)
210                 if err != nil {
211                         return err
212                 }
213                 diag.verbosef("BlobSignatureTTL = %d", dd.BlobSignatureTTL)
214                 return nil
215         })
216
217         var cluster arvados.Cluster
218         cfgpath := "arvados/v1/config"
219         cfgOK := false
220         diag.dotest(20, fmt.Sprintf("getting exported config from https://%s/%s", client.APIHost, cfgpath), func() error {
221                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
222                 defer cancel()
223                 err := client.RequestAndDecodeContext(ctx, &cluster, "GET", cfgpath, nil, nil)
224                 if err != nil {
225                         return err
226                 }
227                 diag.verbosef("Collections.BlobSigning = %v", cluster.Collections.BlobSigning)
228                 cfgOK = true
229                 return nil
230         })
231
232         var user arvados.User
233         diag.dotest(30, "getting current user record", func() error {
234                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
235                 defer cancel()
236                 err := client.RequestAndDecodeContext(ctx, &user, "GET", "arvados/v1/users/current", nil, nil)
237                 if err != nil {
238                         return err
239                 }
240                 diag.verbosef("user uuid = %s", user.UUID)
241                 return nil
242         })
243
244         if !cfgOK {
245                 diag.errorf("cannot proceed without cluster config -- aborting without running any further tests")
246                 return
247         }
248
249         // uncomment to create some spurious errors
250         // cluster.Services.WebDAVDownload.ExternalURL.Host = "0.0.0.0:9"
251
252         // TODO: detect routing errors here, like finding wb2 at the
253         // wb1 address.
254         for i, svc := range []*arvados.Service{
255                 &cluster.Services.Keepproxy,
256                 &cluster.Services.WebDAV,
257                 &cluster.Services.WebDAVDownload,
258                 &cluster.Services.Websocket,
259                 &cluster.Services.Workbench1,
260                 &cluster.Services.Workbench2,
261         } {
262                 diag.dotest(40+i, fmt.Sprintf("connecting to service endpoint %s", svc.ExternalURL), func() error {
263                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
264                         defer cancel()
265                         u := svc.ExternalURL
266                         if strings.HasPrefix(u.Scheme, "ws") {
267                                 // We can do a real websocket test elsewhere,
268                                 // but for now we'll just check the https
269                                 // connection.
270                                 u.Scheme = "http" + u.Scheme[2:]
271                         }
272                         if svc == &cluster.Services.WebDAV && strings.HasPrefix(u.Host, "*") {
273                                 u.Host = "d41d8cd98f00b204e9800998ecf8427e-0" + u.Host[1:]
274                         }
275                         req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
276                         if err != nil {
277                                 return err
278                         }
279                         resp, err := http.DefaultClient.Do(req)
280                         if err != nil {
281                                 return err
282                         }
283                         resp.Body.Close()
284                         return nil
285                 })
286         }
287
288         for i, url := range []string{
289                 cluster.Services.Controller.ExternalURL.String(),
290                 cluster.Services.Keepproxy.ExternalURL.String() + "d41d8cd98f00b204e9800998ecf8427e+0",
291                 cluster.Services.WebDAVDownload.ExternalURL.String(),
292         } {
293                 diag.dotest(50+i, fmt.Sprintf("checking CORS headers at %s", url), func() error {
294                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
295                         defer cancel()
296                         req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
297                         if err != nil {
298                                 return err
299                         }
300                         req.Header.Set("Origin", "https://example.com")
301                         resp, err := http.DefaultClient.Do(req)
302                         if err != nil {
303                                 return err
304                         }
305                         if hdr := resp.Header.Get("Access-Control-Allow-Origin"); hdr != "*" {
306                                 return fmt.Errorf("expected \"Access-Control-Allow-Origin: *\", got %q", hdr)
307                         }
308                         return nil
309                 })
310         }
311
312         var keeplist arvados.KeepServiceList
313         diag.dotest(60, "checking internal/external client detection", func() error {
314                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
315                 defer cancel()
316                 err := client.RequestAndDecodeContext(ctx, &keeplist, "GET", "arvados/v1/keep_services/accessible", nil, arvados.ListOptions{Limit: 999999})
317                 if err != nil {
318                         return fmt.Errorf("error getting keep services list: %s", err)
319                 } else if len(keeplist.Items) == 0 {
320                         return fmt.Errorf("controller did not return any keep services")
321                 }
322                 found := map[string]int{}
323                 for _, ks := range keeplist.Items {
324                         found[ks.ServiceType]++
325                 }
326                 isInternal := found["proxy"] == 0 && len(keeplist.Items) > 0
327                 isExternal := found["proxy"] > 0 && found["proxy"] == len(keeplist.Items)
328                 if isExternal {
329                         diag.infof("controller returned only proxy services, this host is treated as \"external\"")
330                 } else if isInternal {
331                         diag.infof("controller returned only non-proxy services, this host is treated as \"internal\"")
332                 }
333                 if (diag.checkInternal && !isInternal) || (diag.checkExternal && !isExternal) {
334                         return fmt.Errorf("expecting internal=%v external=%v, but found internal=%v external=%v", diag.checkInternal, diag.checkExternal, isInternal, isExternal)
335                 }
336                 return nil
337         })
338
339         for i, ks := range keeplist.Items {
340                 u := url.URL{
341                         Scheme: "http",
342                         Host:   net.JoinHostPort(ks.ServiceHost, fmt.Sprintf("%d", ks.ServicePort)),
343                         Path:   "/",
344                 }
345                 if ks.ServiceSSLFlag {
346                         u.Scheme = "https"
347                 }
348                 diag.dotest(61+i, fmt.Sprintf("reading+writing via keep service at %s", u.String()), func() error {
349                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
350                         defer cancel()
351                         req, err := http.NewRequestWithContext(ctx, "PUT", u.String()+"d41d8cd98f00b204e9800998ecf8427e", nil)
352                         if err != nil {
353                                 return err
354                         }
355                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
356                         resp, err := http.DefaultClient.Do(req)
357                         if err != nil {
358                                 return err
359                         }
360                         defer resp.Body.Close()
361                         body, err := ioutil.ReadAll(resp.Body)
362                         if err != nil {
363                                 return fmt.Errorf("reading response body: %s", err)
364                         }
365                         loc := strings.TrimSpace(string(body))
366                         if !strings.HasPrefix(loc, "d41d8") {
367                                 return fmt.Errorf("unexpected response from write: %q", body)
368                         }
369
370                         req, err = http.NewRequestWithContext(ctx, "GET", u.String()+loc, nil)
371                         if err != nil {
372                                 return err
373                         }
374                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
375                         resp, err = http.DefaultClient.Do(req)
376                         if err != nil {
377                                 return err
378                         }
379                         defer resp.Body.Close()
380                         body, err = ioutil.ReadAll(resp.Body)
381                         if err != nil {
382                                 return fmt.Errorf("reading response body: %s", err)
383                         }
384                         if len(body) != 0 {
385                                 return fmt.Errorf("unexpected response from read: %q", body)
386                         }
387
388                         return nil
389                 })
390         }
391
392         var project arvados.Group
393         diag.dotest(80, fmt.Sprintf("finding/creating %q project", diag.projectName), func() error {
394                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
395                 defer cancel()
396                 var grplist arvados.GroupList
397                 err := client.RequestAndDecodeContext(ctx, &grplist, "GET", "arvados/v1/groups", nil, arvados.ListOptions{
398                         Filters: []arvados.Filter{
399                                 {"name", "=", diag.projectName},
400                                 {"group_class", "=", "project"},
401                                 {"owner_uuid", "=", user.UUID}},
402                         Limit: 999999})
403                 if err != nil {
404                         return fmt.Errorf("list groups: %s", err)
405                 }
406                 if len(grplist.Items) > 0 {
407                         project = grplist.Items[0]
408                         diag.verbosef("using existing project, uuid = %s", project.UUID)
409                         return nil
410                 }
411                 diag.debugf("list groups: ok, no results")
412                 err = client.RequestAndDecodeContext(ctx, &project, "POST", "arvados/v1/groups", nil, map[string]interface{}{"group": map[string]interface{}{
413                         "name":        diag.projectName,
414                         "group_class": "project",
415                 }})
416                 if err != nil {
417                         return fmt.Errorf("create project: %s", err)
418                 }
419                 diag.verbosef("created project, uuid = %s", project.UUID)
420                 return nil
421         })
422
423         var collection arvados.Collection
424         diag.dotest(90, "creating temporary collection", func() error {
425                 if project.UUID == "" {
426                         return fmt.Errorf("skipping, no project to work in")
427                 }
428                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
429                 defer cancel()
430                 err := client.RequestAndDecodeContext(ctx, &collection, "POST", "arvados/v1/collections", nil, map[string]interface{}{
431                         "ensure_unique_name": true,
432                         "collection": map[string]interface{}{
433                                 "owner_uuid": project.UUID,
434                                 "name":       "test collection",
435                                 "trash_at":   time.Now().Add(time.Hour)}})
436                 if err != nil {
437                         return err
438                 }
439                 diag.verbosef("ok, uuid = %s", collection.UUID)
440                 return nil
441         })
442
443         if collection.UUID != "" {
444                 defer func() {
445                         diag.dotest(9990, "deleting temporary collection", func() error {
446                                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
447                                 defer cancel()
448                                 return client.RequestAndDecodeContext(ctx, nil, "DELETE", "arvados/v1/collections/"+collection.UUID, nil, nil)
449                         })
450                 }()
451         }
452
453         tempdir, err := ioutil.TempDir("", "arvados-diagnostics")
454         if err != nil {
455                 diag.errorf("error creating temp dir: %s", err)
456                 return
457         }
458         defer os.RemoveAll(tempdir)
459
460         var dockerImageData []byte
461         if diag.dockerImage != "" || diag.priority < 1 {
462                 // We won't be using the self-built docker image, so
463                 // don't build it.  But we will write the embedded
464                 // "hello-world" image to our test collection to test
465                 // upload/download, whether or not we're using it as a
466                 // docker image.
467                 dockerImageData = HelloWorldDockerImage
468         } else if selfbin, err := os.Readlink("/proc/self/exe"); err != nil {
469                 diag.errorf("readlink /proc/self/exe: %s", err)
470                 return
471         } else if selfbindata, err := os.ReadFile(selfbin); err != nil {
472                 diag.errorf("error reading %s: %s", selfbin, err)
473                 return
474         } else {
475                 selfbinSha := fmt.Sprintf("%x", sha256.Sum256(selfbindata))
476                 tag := "arvados-client-diagnostics:" + selfbinSha[:9]
477                 err := os.WriteFile(tempdir+"/arvados-client", selfbindata, 0777)
478                 if err != nil {
479                         diag.errorf("error writing %s: %s", tempdir+"/arvados-client", err)
480                         return
481                 }
482
483                 dockerfile := "FROM " + diag.dockerImageFrom + "\n"
484                 dockerfile += "RUN apt-get update && apt-get install --yes --no-install-recommends libfuse2 ca-certificates && apt-get clean\n"
485                 dockerfile += "COPY /arvados-client /arvados-client\n"
486                 cmd := exec.Command("docker", "build", "--tag", tag, "-f", "-", tempdir)
487                 cmd.Stdin = strings.NewReader(dockerfile)
488                 cmd.Stdout = diag.stderr
489                 cmd.Stderr = diag.stderr
490                 err = cmd.Run()
491                 if err != nil {
492                         diag.errorf("error building docker image: %s", err)
493                         return
494                 }
495                 checkversion, err := exec.Command("docker", "run", tag, "/arvados-client", "version").CombinedOutput()
496                 if err != nil {
497                         diag.errorf("docker image does not seem to work: %s", err)
498                         return
499                 }
500                 diag.infof("arvados-client version: %s", checkversion)
501
502                 buf, err := exec.Command("docker", "save", tag).Output()
503                 if err != nil {
504                         diag.errorf("docker save %s: %s", tag, err)
505                         return
506                 }
507                 diag.infof("docker image size is %d", len(buf))
508                 dockerImageData = buf
509         }
510
511         // Read image tarball to find image ID, so we can upload it as
512         // "sha256:{...}.tar"
513         var imageSHA2 string
514         {
515                 tr := tar.NewReader(bytes.NewReader(dockerImageData))
516                 for {
517                         hdr, err := tr.Next()
518                         if err == io.EOF {
519                                 break
520                         }
521                         if err != nil {
522                                 diag.errorf("internal error/bug: cannot read docker image tar file: %s", err)
523                                 return
524                         }
525                         if s := strings.TrimSuffix(hdr.Name, ".json"); len(s) == 64 && s != hdr.Name {
526                                 imageSHA2 = s
527                         }
528                 }
529                 if imageSHA2 == "" {
530                         diag.errorf("internal error/bug: cannot find {sha256}.json file in docker image tar file")
531                         return
532                 }
533         }
534         tarfilename := "sha256:" + imageSHA2 + ".tar"
535
536         diag.dotest(100, "uploading file via webdav", func() error {
537                 timeout := diag.timeout
538                 if len(dockerImageData) > 10<<20 && timeout < time.Minute {
539                         // Extend the normal http timeout if we're
540                         // uploading a substantial docker image.
541                         timeout = time.Minute
542                 }
543                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(timeout))
544                 defer cancel()
545                 if collection.UUID == "" {
546                         return fmt.Errorf("skipping, no test collection")
547                 }
548                 t0 := time.Now()
549                 req, err := http.NewRequestWithContext(ctx, "PUT", cluster.Services.WebDAVDownload.ExternalURL.String()+"c="+collection.UUID+"/"+tarfilename, bytes.NewReader(dockerImageData))
550                 if err != nil {
551                         return fmt.Errorf("BUG? http.NewRequest: %s", err)
552                 }
553                 req.Header.Set("Authorization", "Bearer "+client.AuthToken)
554                 resp, err := http.DefaultClient.Do(req)
555                 if err != nil {
556                         return fmt.Errorf("error performing http request: %s", err)
557                 }
558                 resp.Body.Close()
559                 if resp.StatusCode != http.StatusCreated {
560                         return fmt.Errorf("status %s", resp.Status)
561                 }
562                 diag.verbosef("upload ok, status %s, %f MB/s", resp.Status, float64(len(dockerImageData))/time.Since(t0).Seconds()/1000000)
563                 err = client.RequestAndDecodeContext(ctx, &collection, "GET", "arvados/v1/collections/"+collection.UUID, nil, nil)
564                 if err != nil {
565                         return fmt.Errorf("get updated collection: %s", err)
566                 }
567                 diag.verbosef("upload pdh %s", collection.PortableDataHash)
568                 return nil
569         })
570
571         davurl := cluster.Services.WebDAV.ExternalURL
572         davWildcard := strings.HasPrefix(davurl.Host, "*--") || strings.HasPrefix(davurl.Host, "*.")
573         diag.dotest(110, fmt.Sprintf("checking WebDAV ExternalURL wildcard (%s)", davurl), func() error {
574                 if davurl.Host == "" {
575                         return fmt.Errorf("host missing - content previews will not work")
576                 }
577                 if !davWildcard && !cluster.Collections.TrustAllContent {
578                         diag.warnf("WebDAV ExternalURL has no leading wildcard and TrustAllContent==false - content previews will not work")
579                 }
580                 return nil
581         })
582
583         for i, trial := range []struct {
584                 needcoll     bool
585                 needWildcard bool
586                 status       int
587                 fileurl      string
588         }{
589                 {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + "foo"},
590                 {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + tarfilename},
591                 {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/foo"},
592                 {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/" + tarfilename},
593                 {true, true, http.StatusOK, strings.Replace(davurl.String(), "*", strings.Replace(collection.PortableDataHash, "+", "-", -1), 1) + tarfilename},
594                 {true, false, http.StatusOK, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=" + collection.UUID + "/_/" + tarfilename},
595         } {
596                 diag.dotest(120+i, fmt.Sprintf("downloading from webdav (%s)", trial.fileurl), func() error {
597                         if trial.needWildcard && !davWildcard {
598                                 diag.warnf("skipping collection-id-in-vhost test because WebDAV ExternalURL has no leading wildcard")
599                                 return nil
600                         }
601                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
602                         defer cancel()
603                         if trial.needcoll && collection.UUID == "" {
604                                 return fmt.Errorf("skipping, no test collection")
605                         }
606                         req, err := http.NewRequestWithContext(ctx, "GET", trial.fileurl, nil)
607                         if err != nil {
608                                 return err
609                         }
610                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
611                         resp, err := http.DefaultClient.Do(req)
612                         if err != nil {
613                                 return err
614                         }
615                         defer resp.Body.Close()
616                         body, err := ioutil.ReadAll(resp.Body)
617                         if err != nil {
618                                 return fmt.Errorf("reading response: %s", err)
619                         }
620                         if resp.StatusCode != trial.status {
621                                 return fmt.Errorf("unexpected response status: %s", resp.Status)
622                         }
623                         if trial.status == http.StatusOK && !bytes.Equal(body, dockerImageData) {
624                                 excerpt := body
625                                 if len(excerpt) > 128 {
626                                         excerpt = append([]byte(nil), body[:128]...)
627                                         excerpt = append(excerpt, []byte("[...]")...)
628                                 }
629                                 return fmt.Errorf("unexpected response content: len %d, %q", len(body), excerpt)
630                         }
631                         return nil
632                 })
633         }
634
635         var vm arvados.VirtualMachine
636         diag.dotest(130, "getting list of virtual machines", func() error {
637                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
638                 defer cancel()
639                 var vmlist arvados.VirtualMachineList
640                 err := client.RequestAndDecodeContext(ctx, &vmlist, "GET", "arvados/v1/virtual_machines", nil, arvados.ListOptions{Limit: 999999})
641                 if err != nil {
642                         return err
643                 }
644                 if len(vmlist.Items) < 1 {
645                         diag.warnf("no VMs found")
646                 } else {
647                         vm = vmlist.Items[0]
648                 }
649                 return nil
650         })
651
652         diag.dotest(140, "getting workbench1 webshell page", func() error {
653                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
654                 defer cancel()
655                 if vm.UUID == "" {
656                         diag.warnf("skipping, no vm available")
657                         return nil
658                 }
659                 webshelltermurl := cluster.Services.Workbench1.ExternalURL.String() + "virtual_machines/" + vm.UUID + "/webshell/testusername"
660                 diag.debugf("url %s", webshelltermurl)
661                 req, err := http.NewRequestWithContext(ctx, "GET", webshelltermurl, nil)
662                 if err != nil {
663                         return err
664                 }
665                 req.Header.Set("Authorization", "Bearer "+client.AuthToken)
666                 resp, err := http.DefaultClient.Do(req)
667                 if err != nil {
668                         return err
669                 }
670                 defer resp.Body.Close()
671                 body, err := ioutil.ReadAll(resp.Body)
672                 if err != nil {
673                         return fmt.Errorf("reading response: %s", err)
674                 }
675                 if resp.StatusCode != http.StatusOK {
676                         return fmt.Errorf("unexpected response status: %s %q", resp.Status, body)
677                 }
678                 return nil
679         })
680
681         diag.dotest(150, "connecting to webshell service", func() error {
682                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
683                 defer cancel()
684                 if vm.UUID == "" {
685                         diag.warnf("skipping, no vm available")
686                         return nil
687                 }
688                 u := cluster.Services.WebShell.ExternalURL
689                 webshellurl := u.String() + vm.Hostname + "?"
690                 if strings.HasPrefix(u.Host, "*") {
691                         u.Host = vm.Hostname + u.Host[1:]
692                         webshellurl = u.String() + "?"
693                 }
694                 diag.debugf("url %s", webshellurl)
695                 req, err := http.NewRequestWithContext(ctx, "POST", webshellurl, bytes.NewBufferString(url.Values{
696                         "width":   {"80"},
697                         "height":  {"25"},
698                         "session": {"xyzzy"},
699                         "rooturl": {webshellurl},
700                 }.Encode()))
701                 if err != nil {
702                         return err
703                 }
704                 req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
705                 resp, err := http.DefaultClient.Do(req)
706                 if err != nil {
707                         return err
708                 }
709                 defer resp.Body.Close()
710                 diag.debugf("response status %s", resp.Status)
711                 body, err := ioutil.ReadAll(resp.Body)
712                 if err != nil {
713                         return fmt.Errorf("reading response: %s", err)
714                 }
715                 diag.debugf("response body %q", body)
716                 // We don't speak the protocol, so we get a 400 error
717                 // from the webshell server even if everything is
718                 // OK. Anything else (404, 502, ???) indicates a
719                 // problem.
720                 if resp.StatusCode != http.StatusBadRequest {
721                         return fmt.Errorf("unexpected response status: %s, %q", resp.Status, body)
722                 }
723                 return nil
724         })
725
726         diag.dotest(160, "running a container", func() error {
727                 if diag.priority < 1 {
728                         diag.infof("skipping (use priority > 0 if you want to run a container)")
729                         return nil
730                 }
731                 if project.UUID == "" {
732                         return fmt.Errorf("skipping, no project to work in")
733                 }
734
735                 timestamp := time.Now().Format(time.RFC3339)
736
737                 var ctrCommand []string
738                 switch diag.dockerImage {
739                 case "":
740                         if collection.UUID == "" {
741                                 return fmt.Errorf("skipping, no test collection to use as docker image")
742                         }
743                         diag.dockerImage = collection.PortableDataHash
744                         ctrCommand = []string{"/arvados-client", "diagnostics",
745                                 "-priority=0", // don't run a container
746                                 "-log-level=" + diag.logLevel,
747                                 "-internal-client=true"}
748                 case "hello-world":
749                         if collection.UUID == "" {
750                                 return fmt.Errorf("skipping, no test collection to use as docker image")
751                         }
752                         diag.dockerImage = collection.PortableDataHash
753                         ctrCommand = []string{"/hello"}
754                 default:
755                         ctrCommand = []string{"echo", timestamp}
756                 }
757
758                 var cr arvados.ContainerRequest
759                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
760                 defer cancel()
761
762                 err := client.RequestAndDecodeContext(ctx, &cr, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{"container_request": map[string]interface{}{
763                         "owner_uuid":      project.UUID,
764                         "name":            fmt.Sprintf("diagnostics container request %s", timestamp),
765                         "container_image": diag.dockerImage,
766                         "command":         ctrCommand,
767                         "use_existing":    false,
768                         "output_path":     "/mnt/output",
769                         "output_name":     fmt.Sprintf("diagnostics output %s", timestamp),
770                         "priority":        diag.priority,
771                         "state":           arvados.ContainerRequestStateCommitted,
772                         "mounts": map[string]map[string]interface{}{
773                                 "/mnt/output": {
774                                         "kind":     "collection",
775                                         "writable": true,
776                                 },
777                         },
778                         "runtime_constraints": arvados.RuntimeConstraints{
779                                 API:          true,
780                                 VCPUs:        1,
781                                 RAM:          128 << 20,
782                                 KeepCacheRAM: 64 << 20,
783                         },
784                 }})
785                 if err != nil {
786                         return err
787                 }
788                 diag.infof("container request uuid = %s", cr.UUID)
789                 diag.verbosef("container uuid = %s", cr.ContainerUUID)
790
791                 timeout := 10 * time.Minute
792                 diag.infof("container request submitted, waiting up to %v for container to run", arvados.Duration(timeout))
793                 deadline := time.Now().Add(timeout)
794
795                 var c arvados.Container
796                 for ; cr.State != arvados.ContainerRequestStateFinal && time.Now().Before(deadline); time.Sleep(2 * time.Second) {
797                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
798                         defer cancel()
799
800                         crStateWas := cr.State
801                         err := client.RequestAndDecodeContext(ctx, &cr, "GET", "arvados/v1/container_requests/"+cr.UUID, nil, nil)
802                         if err != nil {
803                                 return err
804                         }
805                         if cr.State != crStateWas {
806                                 diag.debugf("container request state = %s", cr.State)
807                         }
808
809                         cStateWas := c.State
810                         err = client.RequestAndDecodeContext(ctx, &c, "GET", "arvados/v1/containers/"+cr.ContainerUUID, nil, nil)
811                         if err != nil {
812                                 return err
813                         }
814                         if c.State != cStateWas {
815                                 diag.debugf("container state = %s", c.State)
816                         }
817
818                         cancel()
819                 }
820
821                 if cr.State != arvados.ContainerRequestStateFinal {
822                         err := client.RequestAndDecodeContext(context.Background(), &cr, "PATCH", "arvados/v1/container_requests/"+cr.UUID, nil, map[string]interface{}{
823                                 "container_request": map[string]interface{}{
824                                         "priority": 0,
825                                 }})
826                         if err != nil {
827                                 diag.infof("error canceling container request %s: %s", cr.UUID, err)
828                         } else {
829                                 diag.debugf("canceled container request %s", cr.UUID)
830                         }
831                         return fmt.Errorf("timed out waiting for container to finish; container request %s state was %q, container %s state was %q", cr.UUID, cr.State, c.UUID, c.State)
832                 }
833                 if c.State != arvados.ContainerStateComplete {
834                         return fmt.Errorf("container request %s is final but container %s did not complete: container state = %q", cr.UUID, cr.ContainerUUID, c.State)
835                 }
836                 if c.ExitCode != 0 {
837                         return fmt.Errorf("container exited %d", c.ExitCode)
838                 }
839                 return nil
840         })
841 }