Merge branch '21904-no-unqueueable-reqs'
[arvados.git] / lib / diagnostics / cmd.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package diagnostics
6
7 import (
8         "archive/tar"
9         "bytes"
10         "context"
11         "crypto/sha256"
12         _ "embed"
13         "encoding/json"
14         "flag"
15         "fmt"
16         "io"
17         "io/ioutil"
18         "net"
19         "net/http"
20         "net/url"
21         "os"
22         "os/exec"
23         "regexp"
24         "strings"
25         "time"
26
27         "git.arvados.org/arvados.git/lib/cmd"
28         "git.arvados.org/arvados.git/lib/config"
29         "git.arvados.org/arvados.git/sdk/go/arvados"
30         "git.arvados.org/arvados.git/sdk/go/ctxlog"
31         "git.arvados.org/arvados.git/sdk/go/health"
32         "github.com/sirupsen/logrus"
33 )
34
35 type Command struct{}
36
37 func (Command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
38         var diag diagnoser
39         f := flag.NewFlagSet(prog, flag.ContinueOnError)
40         f.StringVar(&diag.projectName, "project-name", "scratch area for diagnostics", "`name` of project to find/create in home project and use for temporary/test objects")
41         f.StringVar(&diag.logLevel, "log-level", "info", "logging `level` (debug, info, warning, error)")
42         f.StringVar(&diag.dockerImage, "docker-image", "", "`image` (tag or portable data hash) to use when running a test container, or \"hello-world\" to use embedded hello-world image (default: build a custom image containing this executable, and run diagnostics inside the container too)")
43         f.StringVar(&diag.dockerImageFrom, "docker-image-from", "debian:stable-slim", "`base` image to use when building a custom image (see https://doc.arvados.org/main/admin/diagnostics.html#container-options)")
44         f.BoolVar(&diag.checkInternal, "internal-client", false, "check that this host is considered an \"internal\" client")
45         f.BoolVar(&diag.checkExternal, "external-client", false, "check that this host is considered an \"external\" client")
46         f.BoolVar(&diag.verbose, "v", false, "verbose: include more information in report")
47         f.IntVar(&diag.priority, "priority", 500, "priority for test container (1..1000, or 0 to skip)")
48         f.DurationVar(&diag.timeout, "timeout", 10*time.Second, "timeout for http requests")
49         if ok, code := cmd.ParseFlags(f, prog, args, "", stderr); !ok {
50                 return code
51         }
52         diag.stdout = stdout
53         diag.stderr = stderr
54         diag.logger = ctxlog.New(stdout, "text", diag.logLevel)
55         diag.logger.SetFormatter(&logrus.TextFormatter{DisableTimestamp: true, DisableLevelTruncation: true, PadLevelText: true})
56         diag.runtests()
57         if len(diag.errors) == 0 {
58                 diag.logger.Info("--- no errors ---")
59                 return 0
60         } else {
61                 if diag.logger.Level > logrus.ErrorLevel {
62                         fmt.Fprint(stdout, "\n--- cut here --- error summary ---\n\n")
63                         for _, e := range diag.errors {
64                                 diag.logger.Error(e)
65                         }
66                 }
67                 return 1
68         }
69 }
70
71 // docker save hello-world > hello-world.tar
72 //
73 //go:embed hello-world.tar
74 var HelloWorldDockerImage []byte
75
76 type diagnoser struct {
77         stdout          io.Writer
78         stderr          io.Writer
79         logLevel        string
80         priority        int
81         projectName     string
82         dockerImage     string
83         dockerImageFrom string
84         checkInternal   bool
85         checkExternal   bool
86         verbose         bool
87         timeout         time.Duration
88         logger          *logrus.Logger
89         errors          []string
90         done            map[int]bool
91 }
92
93 func (diag *diagnoser) debugf(f string, args ...interface{}) {
94         diag.logger.Debugf("  ... "+f, args...)
95 }
96
97 func (diag *diagnoser) infof(f string, args ...interface{}) {
98         diag.logger.Infof("  ... "+f, args...)
99 }
100
101 func (diag *diagnoser) verbosef(f string, args ...interface{}) {
102         if diag.verbose {
103                 diag.logger.Infof("  ... "+f, args...)
104         }
105 }
106
107 func (diag *diagnoser) warnf(f string, args ...interface{}) {
108         diag.logger.Warnf("  ... "+f, args...)
109 }
110
111 func (diag *diagnoser) errorf(f string, args ...interface{}) {
112         diag.logger.Errorf(f, args...)
113         diag.errors = append(diag.errors, fmt.Sprintf(f, args...))
114 }
115
116 // Run the given func, logging appropriate messages before and after,
117 // adding timing info, etc.
118 //
119 // The id argument should be unique among tests, and shouldn't change
120 // when other tests are added/removed.
121 func (diag *diagnoser) dotest(id int, title string, fn func() error) {
122         if diag.done == nil {
123                 diag.done = map[int]bool{}
124         } else if diag.done[id] {
125                 diag.errorf("(bug) reused test id %d", id)
126         }
127         diag.done[id] = true
128
129         diag.logger.Infof("%4d: %s", id, title)
130         t0 := time.Now()
131         err := fn()
132         elapsed := fmt.Sprintf("%d ms", time.Now().Sub(t0)/time.Millisecond)
133         if err != nil {
134                 diag.errorf("%4d: %s (%s): %s", id, title, elapsed, err)
135         } else {
136                 diag.logger.Debugf("%4d: %s (%s): ok", id, title, elapsed)
137         }
138 }
139
140 func (diag *diagnoser) runtests() {
141         client := arvados.NewClientFromEnv()
142         // Disable auto-retry, use context instead
143         client.Timeout = 0
144
145         if client.APIHost == "" || client.AuthToken == "" {
146                 diag.errorf("ARVADOS_API_HOST and ARVADOS_API_TOKEN environment variables are not set -- aborting without running any tests")
147                 return
148         }
149
150         hostname, err := os.Hostname()
151         if err != nil {
152                 diag.warnf("error getting hostname: %s")
153         } else {
154                 diag.verbosef("hostname = %s", hostname)
155         }
156
157         diag.dotest(5, "running health check (same as `arvados-server check`)", func() error {
158                 ldr := config.NewLoader(&bytes.Buffer{}, ctxlog.New(&bytes.Buffer{}, "text", "info"))
159                 ldr.SetupFlags(flag.NewFlagSet("diagnostics", flag.ContinueOnError))
160                 cfg, err := ldr.Load()
161                 if err != nil {
162                         diag.infof("skipping because config could not be loaded: %s", err)
163                         return nil
164                 }
165                 cluster, err := cfg.GetCluster("")
166                 if err != nil {
167                         return err
168                 }
169                 if cluster.SystemRootToken != os.Getenv("ARVADOS_API_TOKEN") {
170                         return fmt.Errorf("diagnostics usage error: %s is readable but SystemRootToken does not match $ARVADOS_API_TOKEN (to fix, either run 'arvados-client sudo diagnostics' to load everything from config file, or set ARVADOS_CONFIG=- to load nothing from config file)", ldr.Path)
171                 }
172                 agg := &health.Aggregator{Cluster: cluster}
173                 resp := agg.ClusterHealth()
174                 for _, e := range resp.Errors {
175                         diag.errorf("health check: %s", e)
176                 }
177                 if len(resp.Errors) > 0 {
178                         diag.infof("consider running `arvados-server check -yaml` for a comprehensive report")
179                 }
180                 diag.verbosef("reported clock skew = %v", resp.ClockSkew)
181                 reported := map[string]bool{}
182                 for _, result := range resp.Checks {
183                         version := strings.SplitN(result.Metrics.Version, " (go", 2)[0]
184                         if version != "" && !reported[version] {
185                                 diag.verbosef("arvados version = %s", version)
186                                 reported[version] = true
187                         }
188                 }
189                 reported = map[string]bool{}
190                 for _, result := range resp.Checks {
191                         if result.Server != "" && !reported[result.Server] {
192                                 diag.verbosef("http frontend version = %s", result.Server)
193                                 reported[result.Server] = true
194                         }
195                 }
196                 reported = map[string]bool{}
197                 for _, result := range resp.Checks {
198                         if sha := result.ConfigSourceSHA256; sha != "" && !reported[sha] {
199                                 diag.verbosef("config file sha256 = %s", sha)
200                                 reported[sha] = true
201                         }
202                 }
203                 return nil
204         })
205
206         var dd arvados.DiscoveryDocument
207         ddpath := "discovery/v1/apis/arvados/v1/rest"
208         diag.dotest(10, fmt.Sprintf("getting discovery document from https://%s/%s", client.APIHost, ddpath), func() error {
209                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
210                 defer cancel()
211                 err := client.RequestAndDecodeContext(ctx, &dd, "GET", ddpath, nil, nil)
212                 if err != nil {
213                         return err
214                 }
215                 diag.verbosef("BlobSignatureTTL = %d", dd.BlobSignatureTTL)
216                 return nil
217         })
218
219         var cluster arvados.Cluster
220         cfgpath := "arvados/v1/config"
221         cfgOK := false
222         diag.dotest(20, fmt.Sprintf("getting exported config from https://%s/%s", client.APIHost, cfgpath), func() error {
223                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
224                 defer cancel()
225                 err := client.RequestAndDecodeContext(ctx, &cluster, "GET", cfgpath, nil, nil)
226                 if err != nil {
227                         return err
228                 }
229                 diag.verbosef("Collections.BlobSigning = %v", cluster.Collections.BlobSigning)
230                 cfgOK = true
231                 return nil
232         })
233
234         var user arvados.User
235         diag.dotest(30, "getting current user record", func() error {
236                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
237                 defer cancel()
238                 err := client.RequestAndDecodeContext(ctx, &user, "GET", "arvados/v1/users/current", nil, nil)
239                 if err != nil {
240                         return err
241                 }
242                 diag.verbosef("user uuid = %s", user.UUID)
243                 return nil
244         })
245
246         if !cfgOK {
247                 diag.errorf("cannot proceed without cluster config -- aborting without running any further tests")
248                 return
249         }
250
251         // uncomment to create some spurious errors
252         // cluster.Services.WebDAVDownload.ExternalURL.Host = "0.0.0.0:9"
253
254         // TODO: detect routing errors here, like finding wb2 at the
255         // wb1 address.
256         for i, svc := range []*arvados.Service{
257                 &cluster.Services.Keepproxy,
258                 &cluster.Services.WebDAV,
259                 &cluster.Services.WebDAVDownload,
260                 &cluster.Services.Websocket,
261                 &cluster.Services.Workbench1,
262                 &cluster.Services.Workbench2,
263         } {
264                 diag.dotest(40+i, fmt.Sprintf("connecting to service endpoint %s", svc.ExternalURL), func() error {
265                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
266                         defer cancel()
267                         u := svc.ExternalURL
268                         if strings.HasPrefix(u.Scheme, "ws") {
269                                 // We can do a real websocket test elsewhere,
270                                 // but for now we'll just check the https
271                                 // connection.
272                                 u.Scheme = "http" + u.Scheme[2:]
273                         }
274                         if svc == &cluster.Services.WebDAV && strings.HasPrefix(u.Host, "*") {
275                                 u.Host = "d41d8cd98f00b204e9800998ecf8427e-0" + u.Host[1:]
276                         }
277                         req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
278                         if err != nil {
279                                 return err
280                         }
281                         resp, err := http.DefaultClient.Do(req)
282                         if err != nil {
283                                 return err
284                         }
285                         resp.Body.Close()
286                         return nil
287                 })
288         }
289
290         for i, url := range []string{
291                 cluster.Services.Controller.ExternalURL.String(),
292                 cluster.Services.Keepproxy.ExternalURL.String() + "d41d8cd98f00b204e9800998ecf8427e+0",
293                 cluster.Services.WebDAVDownload.ExternalURL.String(),
294         } {
295                 diag.dotest(50+i, fmt.Sprintf("checking CORS headers at %s", url), func() error {
296                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
297                         defer cancel()
298                         req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
299                         if err != nil {
300                                 return err
301                         }
302                         req.Header.Set("Origin", "https://example.com")
303                         resp, err := http.DefaultClient.Do(req)
304                         if err != nil {
305                                 return err
306                         }
307                         if hdr := resp.Header.Get("Access-Control-Allow-Origin"); hdr != "*" {
308                                 return fmt.Errorf("expected \"Access-Control-Allow-Origin: *\", got %q", hdr)
309                         }
310                         return nil
311                 })
312         }
313
314         var keeplist arvados.KeepServiceList
315         diag.dotest(60, "checking internal/external client detection", func() error {
316                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
317                 defer cancel()
318                 err := client.RequestAndDecodeContext(ctx, &keeplist, "GET", "arvados/v1/keep_services/accessible", nil, arvados.ListOptions{Limit: 999999})
319                 if err != nil {
320                         return fmt.Errorf("error getting keep services list: %s", err)
321                 } else if len(keeplist.Items) == 0 {
322                         return fmt.Errorf("controller did not return any keep services")
323                 }
324                 found := map[string]int{}
325                 for _, ks := range keeplist.Items {
326                         found[ks.ServiceType]++
327                 }
328                 isInternal := found["proxy"] == 0 && len(keeplist.Items) > 0
329                 isExternal := found["proxy"] > 0 && found["proxy"] == len(keeplist.Items)
330                 if isExternal {
331                         diag.infof("controller returned only proxy services, this host is treated as \"external\"")
332                 } else if isInternal {
333                         diag.infof("controller returned only non-proxy services, this host is treated as \"internal\"")
334                 }
335                 if (diag.checkInternal && !isInternal) || (diag.checkExternal && !isExternal) {
336                         return fmt.Errorf("expecting internal=%v external=%v, but found internal=%v external=%v", diag.checkInternal, diag.checkExternal, isInternal, isExternal)
337                 }
338                 return nil
339         })
340
341         for i, ks := range keeplist.Items {
342                 u := url.URL{
343                         Scheme: "http",
344                         Host:   net.JoinHostPort(ks.ServiceHost, fmt.Sprintf("%d", ks.ServicePort)),
345                         Path:   "/",
346                 }
347                 if ks.ServiceSSLFlag {
348                         u.Scheme = "https"
349                 }
350                 diag.dotest(61+i, fmt.Sprintf("reading+writing via keep service at %s", u.String()), func() error {
351                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
352                         defer cancel()
353                         req, err := http.NewRequestWithContext(ctx, "PUT", u.String()+"d41d8cd98f00b204e9800998ecf8427e", nil)
354                         if err != nil {
355                                 return err
356                         }
357                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
358                         resp, err := http.DefaultClient.Do(req)
359                         if err != nil {
360                                 return err
361                         }
362                         defer resp.Body.Close()
363                         body, err := ioutil.ReadAll(resp.Body)
364                         if err != nil {
365                                 return fmt.Errorf("reading response body: %s", err)
366                         }
367                         loc := strings.TrimSpace(string(body))
368                         if !strings.HasPrefix(loc, "d41d8") {
369                                 return fmt.Errorf("unexpected response from write: %q", body)
370                         }
371
372                         req, err = http.NewRequestWithContext(ctx, "GET", u.String()+loc, nil)
373                         if err != nil {
374                                 return err
375                         }
376                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
377                         resp, err = http.DefaultClient.Do(req)
378                         if err != nil {
379                                 return err
380                         }
381                         defer resp.Body.Close()
382                         body, err = ioutil.ReadAll(resp.Body)
383                         if err != nil {
384                                 return fmt.Errorf("reading response body: %s", err)
385                         }
386                         if len(body) != 0 {
387                                 return fmt.Errorf("unexpected response from read: %q", body)
388                         }
389
390                         return nil
391                 })
392         }
393
394         var project arvados.Group
395         diag.dotest(80, fmt.Sprintf("finding/creating %q project", diag.projectName), func() error {
396                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
397                 defer cancel()
398                 var grplist arvados.GroupList
399                 err := client.RequestAndDecodeContext(ctx, &grplist, "GET", "arvados/v1/groups", nil, arvados.ListOptions{
400                         Filters: []arvados.Filter{
401                                 {"name", "=", diag.projectName},
402                                 {"group_class", "=", "project"},
403                                 {"owner_uuid", "=", user.UUID}},
404                         Limit: 999999})
405                 if err != nil {
406                         return fmt.Errorf("list groups: %s", err)
407                 }
408                 if len(grplist.Items) > 0 {
409                         project = grplist.Items[0]
410                         diag.verbosef("using existing project, uuid = %s", project.UUID)
411                         return nil
412                 }
413                 diag.debugf("list groups: ok, no results")
414                 err = client.RequestAndDecodeContext(ctx, &project, "POST", "arvados/v1/groups", nil, map[string]interface{}{"group": map[string]interface{}{
415                         "name":        diag.projectName,
416                         "group_class": "project",
417                 }})
418                 if err != nil {
419                         return fmt.Errorf("create project: %s", err)
420                 }
421                 diag.verbosef("created project, uuid = %s", project.UUID)
422                 return nil
423         })
424
425         var collection arvados.Collection
426         diag.dotest(90, "creating temporary collection", func() error {
427                 if project.UUID == "" {
428                         return fmt.Errorf("skipping, no project to work in")
429                 }
430                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
431                 defer cancel()
432                 err := client.RequestAndDecodeContext(ctx, &collection, "POST", "arvados/v1/collections", nil, map[string]interface{}{
433                         "ensure_unique_name": true,
434                         "collection": map[string]interface{}{
435                                 "owner_uuid": project.UUID,
436                                 "name":       "test collection",
437                                 "trash_at":   time.Now().Add(time.Hour)}})
438                 if err != nil {
439                         return err
440                 }
441                 diag.verbosef("ok, uuid = %s", collection.UUID)
442                 return nil
443         })
444
445         if collection.UUID != "" {
446                 defer func() {
447                         diag.dotest(9990, "deleting temporary collection", func() error {
448                                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
449                                 defer cancel()
450                                 return client.RequestAndDecodeContext(ctx, nil, "DELETE", "arvados/v1/collections/"+collection.UUID, nil, nil)
451                         })
452                 }()
453         }
454
455         tempdir, err := ioutil.TempDir("", "arvados-diagnostics")
456         if err != nil {
457                 diag.errorf("error creating temp dir: %s", err)
458                 return
459         }
460         defer os.RemoveAll(tempdir)
461
462         var imageSHA2 string
463         var dockerImageData []byte
464         if diag.dockerImage != "" || diag.priority < 1 {
465                 // We won't be using the self-built docker image, so
466                 // don't build it.  But we will write the embedded
467                 // "hello-world" image to our test collection to test
468                 // upload/download, whether or not we're using it as a
469                 // docker image.
470                 dockerImageData = HelloWorldDockerImage
471
472                 if diag.priority > 0 {
473                         imageSHA2, err = getSHA2FromImageData(dockerImageData)
474                         if err != nil {
475                                 diag.errorf("internal error/bug: %s", err)
476                                 return
477                         }
478                 }
479         } else if selfbin, err := os.Readlink("/proc/self/exe"); err != nil {
480                 diag.errorf("readlink /proc/self/exe: %s", err)
481                 return
482         } else if selfbindata, err := os.ReadFile(selfbin); err != nil {
483                 diag.errorf("error reading %s: %s", selfbin, err)
484                 return
485         } else {
486                 selfbinSha := fmt.Sprintf("%x", sha256.Sum256(selfbindata))
487                 tag := "arvados-client-diagnostics:" + selfbinSha[:9]
488                 err := os.WriteFile(tempdir+"/arvados-client", selfbindata, 0777)
489                 if err != nil {
490                         diag.errorf("error writing %s: %s", tempdir+"/arvados-client", err)
491                         return
492                 }
493
494                 dockerfile := "FROM " + diag.dockerImageFrom + "\n"
495                 dockerfile += "RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends libfuse2 ca-certificates && apt-get clean\n"
496                 dockerfile += "COPY /arvados-client /arvados-client\n"
497                 cmd := exec.Command("docker", "build", "--tag", tag, "-f", "-", tempdir)
498                 cmd.Stdin = strings.NewReader(dockerfile)
499                 cmd.Stdout = diag.stderr
500                 cmd.Stderr = diag.stderr
501                 err = cmd.Run()
502                 if err != nil {
503                         diag.errorf("error building docker image: %s", err)
504                         return
505                 }
506                 checkversion, err := exec.Command("docker", "run", tag, "/arvados-client", "version").CombinedOutput()
507                 if err != nil {
508                         diag.errorf("docker image does not seem to work: %s", err)
509                         return
510                 }
511                 diag.infof("arvados-client version: %s", checkversion)
512
513                 buf, err := exec.Command("docker", "inspect", "--format={{.Id}}", tag).Output()
514                 if err != nil {
515                         diag.errorf("docker inspect --format={{.Id}} %s: %s", tag, err)
516                         return
517                 }
518                 imageSHA2 = min64HexDigits.FindString(string(buf))
519                 if len(imageSHA2) != 64 {
520                         diag.errorf("docker inspect --format={{.Id}} output %q does not seem to contain sha256 digest", buf)
521                         return
522                 }
523
524                 buf, err = exec.Command("docker", "save", tag).Output()
525                 if err != nil {
526                         diag.errorf("docker save %s: %s", tag, err)
527                         return
528                 }
529                 diag.infof("docker image size is %d", len(buf))
530                 dockerImageData = buf
531         }
532
533         tarfilename := "sha256:" + imageSHA2 + ".tar"
534
535         diag.dotest(100, "uploading file via webdav", func() error {
536                 timeout := diag.timeout
537                 if len(dockerImageData) > 10<<20 && timeout < time.Minute {
538                         // Extend the normal http timeout if we're
539                         // uploading a substantial docker image.
540                         timeout = time.Minute
541                 }
542                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(timeout))
543                 defer cancel()
544                 if collection.UUID == "" {
545                         return fmt.Errorf("skipping, no test collection")
546                 }
547                 t0 := time.Now()
548                 req, err := http.NewRequestWithContext(ctx, "PUT", cluster.Services.WebDAVDownload.ExternalURL.String()+"c="+collection.UUID+"/"+tarfilename, bytes.NewReader(dockerImageData))
549                 if err != nil {
550                         return fmt.Errorf("BUG? http.NewRequest: %s", err)
551                 }
552                 req.Header.Set("Authorization", "Bearer "+client.AuthToken)
553                 resp, err := http.DefaultClient.Do(req)
554                 if err != nil {
555                         return fmt.Errorf("error performing http request: %s", err)
556                 }
557                 resp.Body.Close()
558                 if resp.StatusCode != http.StatusCreated {
559                         return fmt.Errorf("status %s", resp.Status)
560                 }
561                 diag.verbosef("upload ok, status %s, %f MB/s", resp.Status, float64(len(dockerImageData))/time.Since(t0).Seconds()/1000000)
562                 err = client.RequestAndDecodeContext(ctx, &collection, "GET", "arvados/v1/collections/"+collection.UUID, nil, nil)
563                 if err != nil {
564                         return fmt.Errorf("get updated collection: %s", err)
565                 }
566                 diag.verbosef("upload pdh %s", collection.PortableDataHash)
567                 return nil
568         })
569
570         davurl := cluster.Services.WebDAV.ExternalURL
571         davWildcard := strings.HasPrefix(davurl.Host, "*--") || strings.HasPrefix(davurl.Host, "*.")
572         diag.dotest(110, fmt.Sprintf("checking WebDAV ExternalURL wildcard (%s)", davurl), func() error {
573                 if davurl.Host == "" {
574                         return fmt.Errorf("host missing - content previews will not work")
575                 }
576                 if !davWildcard && !cluster.Collections.TrustAllContent {
577                         diag.warnf("WebDAV ExternalURL has no leading wildcard and TrustAllContent==false - content previews will not work")
578                 }
579                 return nil
580         })
581
582         for i, trial := range []struct {
583                 needcoll     bool
584                 needWildcard bool
585                 status       int
586                 fileurl      string
587         }{
588                 {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + "foo"},
589                 {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + tarfilename},
590                 {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/foo"},
591                 {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/" + tarfilename},
592                 {true, true, http.StatusOK, strings.Replace(davurl.String(), "*", strings.Replace(collection.PortableDataHash, "+", "-", -1), 1) + tarfilename},
593                 {true, false, http.StatusOK, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=" + collection.UUID + "/_/" + tarfilename},
594         } {
595                 diag.dotest(120+i, fmt.Sprintf("downloading from webdav (%s)", trial.fileurl), func() error {
596                         if trial.needWildcard && !davWildcard {
597                                 diag.warnf("skipping collection-id-in-vhost test because WebDAV ExternalURL has no leading wildcard")
598                                 return nil
599                         }
600                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
601                         defer cancel()
602                         if trial.needcoll && collection.UUID == "" {
603                                 return fmt.Errorf("skipping, no test collection")
604                         }
605                         req, err := http.NewRequestWithContext(ctx, "GET", trial.fileurl, nil)
606                         if err != nil {
607                                 return err
608                         }
609                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
610                         resp, err := http.DefaultClient.Do(req)
611                         if err != nil {
612                                 return err
613                         }
614                         defer resp.Body.Close()
615                         body, err := ioutil.ReadAll(resp.Body)
616                         if err != nil {
617                                 return fmt.Errorf("reading response: %s", err)
618                         }
619                         if resp.StatusCode != trial.status {
620                                 return fmt.Errorf("unexpected response status: %s", resp.Status)
621                         }
622                         if trial.status == http.StatusOK && !bytes.Equal(body, dockerImageData) {
623                                 excerpt := body
624                                 if len(excerpt) > 128 {
625                                         excerpt = append([]byte(nil), body[:128]...)
626                                         excerpt = append(excerpt, []byte("[...]")...)
627                                 }
628                                 return fmt.Errorf("unexpected response content: len %d, %q", len(body), excerpt)
629                         }
630                         return nil
631                 })
632         }
633
634         var vm arvados.VirtualMachine
635         diag.dotest(130, "getting list of virtual machines", func() error {
636                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
637                 defer cancel()
638                 var vmlist arvados.VirtualMachineList
639                 err := client.RequestAndDecodeContext(ctx, &vmlist, "GET", "arvados/v1/virtual_machines", nil, arvados.ListOptions{Limit: 999999})
640                 if err != nil {
641                         return err
642                 }
643                 if len(vmlist.Items) < 1 {
644                         diag.warnf("no VMs found")
645                 } else {
646                         vm = vmlist.Items[0]
647                 }
648                 return nil
649         })
650
651         diag.dotest(150, "connecting to webshell service", func() error {
652                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
653                 defer cancel()
654                 if vm.UUID == "" {
655                         diag.warnf("skipping, no vm available")
656                         return nil
657                 }
658                 u := cluster.Services.WebShell.ExternalURL
659                 webshellurl := u.String() + vm.Hostname + "?"
660                 if strings.HasPrefix(u.Host, "*") {
661                         u.Host = vm.Hostname + u.Host[1:]
662                         webshellurl = u.String() + "?"
663                 }
664                 diag.debugf("url %s", webshellurl)
665                 req, err := http.NewRequestWithContext(ctx, "POST", webshellurl, bytes.NewBufferString(url.Values{
666                         "width":   {"80"},
667                         "height":  {"25"},
668                         "session": {"xyzzy"},
669                         "rooturl": {webshellurl},
670                 }.Encode()))
671                 if err != nil {
672                         return err
673                 }
674                 req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
675                 resp, err := http.DefaultClient.Do(req)
676                 if err != nil {
677                         return err
678                 }
679                 defer resp.Body.Close()
680                 diag.debugf("response status %s", resp.Status)
681                 body, err := ioutil.ReadAll(resp.Body)
682                 if err != nil {
683                         return fmt.Errorf("reading response: %s", err)
684                 }
685                 diag.debugf("response body %q", body)
686                 // We don't speak the protocol, so we get a 400 error
687                 // from the webshell server even if everything is
688                 // OK. Anything else (404, 502, ???) indicates a
689                 // problem.
690                 if resp.StatusCode != http.StatusBadRequest {
691                         return fmt.Errorf("unexpected response status: %s, %q", resp.Status, body)
692                 }
693                 return nil
694         })
695
696         diag.dotest(160, "running a container", func() error {
697                 if diag.priority < 1 {
698                         diag.infof("skipping (use priority > 0 if you want to run a container)")
699                         return nil
700                 }
701                 if project.UUID == "" {
702                         return fmt.Errorf("skipping, no project to work in")
703                 }
704
705                 timestamp := time.Now().Format(time.RFC3339)
706
707                 var ctrCommand []string
708                 switch diag.dockerImage {
709                 case "":
710                         if collection.UUID == "" {
711                                 return fmt.Errorf("skipping, no test collection to use as docker image")
712                         }
713                         diag.dockerImage = collection.PortableDataHash
714                         ctrCommand = []string{"/arvados-client", "diagnostics",
715                                 "-priority=0", // don't run a container
716                                 "-log-level=" + diag.logLevel,
717                                 "-internal-client=true"}
718                 case "hello-world":
719                         if collection.UUID == "" {
720                                 return fmt.Errorf("skipping, no test collection to use as docker image")
721                         }
722                         diag.dockerImage = collection.PortableDataHash
723                         ctrCommand = []string{"/hello"}
724                 default:
725                         ctrCommand = []string{"echo", timestamp}
726                 }
727
728                 var cr arvados.ContainerRequest
729                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
730                 defer cancel()
731
732                 err := client.RequestAndDecodeContext(ctx, &cr, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{"container_request": map[string]interface{}{
733                         "owner_uuid":      project.UUID,
734                         "name":            fmt.Sprintf("diagnostics container request %s", timestamp),
735                         "container_image": diag.dockerImage,
736                         "command":         ctrCommand,
737                         "use_existing":    false,
738                         "output_path":     "/mnt/output",
739                         "output_name":     fmt.Sprintf("diagnostics output %s", timestamp),
740                         "priority":        diag.priority,
741                         "state":           arvados.ContainerRequestStateCommitted,
742                         "mounts": map[string]map[string]interface{}{
743                                 "/mnt/output": {
744                                         "kind":     "collection",
745                                         "writable": true,
746                                 },
747                         },
748                         "runtime_constraints": arvados.RuntimeConstraints{
749                                 API:          true,
750                                 VCPUs:        1,
751                                 RAM:          128 << 20,
752                                 KeepCacheRAM: 64 << 20,
753                         },
754                 }})
755                 if err != nil {
756                         return err
757                 }
758                 diag.infof("container request uuid = %s", cr.UUID)
759                 diag.verbosef("container uuid = %s", cr.ContainerUUID)
760
761                 timeout := 10 * time.Minute
762                 diag.infof("container request submitted, waiting up to %v for container to run", arvados.Duration(timeout))
763                 deadline := time.Now().Add(timeout)
764
765                 var c arvados.Container
766                 for ; cr.State != arvados.ContainerRequestStateFinal && time.Now().Before(deadline); time.Sleep(2 * time.Second) {
767                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
768                         defer cancel()
769
770                         crStateWas := cr.State
771                         err := client.RequestAndDecodeContext(ctx, &cr, "GET", "arvados/v1/container_requests/"+cr.UUID, nil, nil)
772                         if err != nil {
773                                 return err
774                         }
775                         if cr.State != crStateWas {
776                                 diag.debugf("container request state = %s", cr.State)
777                         }
778
779                         cStateWas := c.State
780                         err = client.RequestAndDecodeContext(ctx, &c, "GET", "arvados/v1/containers/"+cr.ContainerUUID, nil, nil)
781                         if err != nil {
782                                 return err
783                         }
784                         if c.State != cStateWas {
785                                 diag.debugf("container state = %s", c.State)
786                         }
787
788                         cancel()
789                 }
790
791                 if cr.State != arvados.ContainerRequestStateFinal {
792                         err := client.RequestAndDecodeContext(context.Background(), &cr, "PATCH", "arvados/v1/container_requests/"+cr.UUID, nil, map[string]interface{}{
793                                 "container_request": map[string]interface{}{
794                                         "priority": 0,
795                                 }})
796                         if err != nil {
797                                 diag.infof("error canceling container request %s: %s", cr.UUID, err)
798                         } else {
799                                 diag.debugf("canceled container request %s", cr.UUID)
800                         }
801                         return fmt.Errorf("timed out waiting for container to finish; container request %s state was %q, container %s state was %q", cr.UUID, cr.State, c.UUID, c.State)
802                 }
803                 if c.State != arvados.ContainerStateComplete {
804                         return fmt.Errorf("container request %s is final but container %s did not complete: container state = %q", cr.UUID, cr.ContainerUUID, c.State)
805                 }
806                 if c.ExitCode != 0 {
807                         return fmt.Errorf("container exited %d", c.ExitCode)
808                 }
809                 return nil
810         })
811 }
812
813 func getSHA2FromImageData(dockerImageData []byte) (string, error) {
814         tr := tar.NewReader(bytes.NewReader(dockerImageData))
815         for {
816                 hdr, err := tr.Next()
817                 if err == io.EOF {
818                         return "", fmt.Errorf("cannot find manifest.json in docker image tar file")
819                 }
820                 if err != nil {
821                         return "", fmt.Errorf("cannot read docker image tar file: %s", err)
822                 }
823                 if hdr.Name != "manifest.json" {
824                         continue
825                 }
826                 var manifest []struct {
827                         Config string
828                 }
829                 err = json.NewDecoder(tr).Decode(&manifest)
830                 if err != nil {
831                         return "", fmt.Errorf("cannot read manifest.json from docker image tar file: %s", err)
832                 }
833                 if len(manifest) == 0 {
834                         return "", fmt.Errorf("manifest.json is empty")
835                 }
836                 s := min64HexDigits.FindString(manifest[0].Config)
837                 if len(s) != 64 {
838                         return "", fmt.Errorf("found manifest.json but .[0].Config %q does not seem to contain sha256 digest", manifest[0].Config)
839                 }
840                 return s, nil
841         }
842 }
843
844 var min64HexDigits = regexp.MustCompile(`[0-9a-f]{64,}`)