]> git.arvados.org - arvados.git/blob - lib/diagnostics/cmd.go
Merge branch '23009-multiselect-bug' into main. Closes #23009
[arvados.git] / lib / diagnostics / cmd.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package diagnostics
6
7 import (
8         "archive/tar"
9         "bytes"
10         "context"
11         "crypto/sha256"
12         _ "embed"
13         "encoding/json"
14         "flag"
15         "fmt"
16         "io"
17         "io/ioutil"
18         "net"
19         "net/http"
20         "net/url"
21         "os"
22         "os/exec"
23         "regexp"
24         "strings"
25         "time"
26
27         "git.arvados.org/arvados.git/lib/cmd"
28         "git.arvados.org/arvados.git/lib/config"
29         "git.arvados.org/arvados.git/sdk/go/arvados"
30         "git.arvados.org/arvados.git/sdk/go/ctxlog"
31         "git.arvados.org/arvados.git/sdk/go/health"
32         "github.com/sirupsen/logrus"
33 )
34
// Command is the entry point for the diagnostics subcommand. It
// carries no state; all per-run state lives in a diagnoser value
// created by RunCommand.
type Command struct{}
36
37 func (Command) RunCommand(prog string, args []string, stdin io.Reader, stdout, stderr io.Writer) int {
38         var diag diagnoser
39         f := flag.NewFlagSet(prog, flag.ContinueOnError)
40         f.StringVar(&diag.projectName, "project-name", "scratch area for diagnostics", "`name` of project to find/create in home project and use for temporary/test objects")
41         f.StringVar(&diag.logLevel, "log-level", "info", "logging `level` (debug, info, warning, error)")
42         f.StringVar(&diag.dockerImage, "docker-image", "", "`image` (tag or portable data hash) to use when running a test container, or \"hello-world\" to use embedded hello-world image (default: build a custom image containing this executable, and run diagnostics inside the container too)")
43         f.StringVar(&diag.dockerImageFrom, "docker-image-from", "debian:stable-slim", "`base` image to use when building a custom image (see https://doc.arvados.org/main/admin/diagnostics.html#container-options)")
44         f.BoolVar(&diag.checkInternal, "internal-client", false, "check that this host is considered an \"internal\" client")
45         f.BoolVar(&diag.checkExternal, "external-client", false, "check that this host is considered an \"external\" client")
46         f.BoolVar(&diag.verbose, "v", false, "verbose: include more information in report")
47         f.IntVar(&diag.priority, "priority", 500, "priority for test container (1..1000, or 0 to skip)")
48         f.DurationVar(&diag.timeout, "timeout", 10*time.Second, "timeout for http requests")
49         if ok, code := cmd.ParseFlags(f, prog, args, "", stderr); !ok {
50                 return code
51         }
52         diag.stdout = stdout
53         diag.stderr = stderr
54         diag.logger = ctxlog.New(stdout, "text", diag.logLevel)
55         diag.logger.SetFormatter(&logrus.TextFormatter{DisableTimestamp: true, DisableLevelTruncation: true, PadLevelText: true})
56         diag.runtests()
57         if len(diag.errors) == 0 {
58                 diag.logger.Info("--- no errors ---")
59                 return 0
60         } else {
61                 if diag.logger.Level > logrus.ErrorLevel {
62                         fmt.Fprint(stdout, "\n--- cut here --- error summary ---\n\n")
63                         for _, e := range diag.errors {
64                                 diag.logger.Error(e)
65                         }
66                 }
67                 return 1
68         }
69 }
70
// HelloWorldDockerImage is the content of hello-world.tar, embedded
// in the binary at build time. It is used as test data for
// upload/download checks and, optionally, as the docker image for
// the test container.
//
// The tar file was generated with:
//
//	docker save hello-world > hello-world.tar
//
//go:embed hello-world.tar
var HelloWorldDockerImage []byte
75
// diagnoser holds the options and accumulated state for a single
// diagnostics run. Option fields are populated from command line
// flags by RunCommand.
type diagnoser struct {
	// stdout is the RunCommand stdout writer; the logger writes
	// here.
	stdout          io.Writer
	// stderr is the RunCommand stderr writer; output of the
	// "docker build" subprocess is sent here.
	stderr          io.Writer
	// logLevel is the -log-level flag value (debug, info,
	// warning, error).
	logLevel        string
	// priority is the -priority flag value: priority for the
	// test container, where a value below 1 means skip it.
	priority        int
	// projectName is the -project-name flag value: the project
	// to find/create for temporary/test objects.
	projectName     string
	// dockerImage is the -docker-image flag value. If empty (and
	// priority > 0), a custom image containing this executable
	// is built.
	dockerImage     string
	// dockerImageFrom is the -docker-image-from flag value: base
	// image to use when building a custom image.
	dockerImageFrom string
	// checkInternal is the -internal-client flag value: fail if
	// this host is not treated as an "internal" client.
	checkInternal   bool
	// checkExternal is the -external-client flag value: fail if
	// this host is not treated as an "external" client.
	checkExternal   bool
	// verbose is the -v flag value; enables verbosef output.
	verbose         bool
	// timeout is the -timeout flag value, applied to http
	// requests.
	timeout         time.Duration
	// logger receives all report messages.
	logger          *logrus.Logger
	// errors collects every message reported via errorf. A
	// non-empty list causes RunCommand to return 1.
	errors          []string
	// done records the test ids already passed to dotest, so
	// reuse of an id can be reported.
	done            map[int]bool
}
92
93 func (diag *diagnoser) debugf(f string, args ...interface{}) {
94         diag.logger.Debugf("  ... "+f, args...)
95 }
96
97 func (diag *diagnoser) infof(f string, args ...interface{}) {
98         diag.logger.Infof("  ... "+f, args...)
99 }
100
101 func (diag *diagnoser) verbosef(f string, args ...interface{}) {
102         if diag.verbose {
103                 diag.logger.Infof("  ... "+f, args...)
104         }
105 }
106
107 func (diag *diagnoser) warnf(f string, args ...interface{}) {
108         diag.logger.Warnf("  ... "+f, args...)
109 }
110
111 func (diag *diagnoser) errorf(f string, args ...interface{}) {
112         diag.logger.Errorf(f, args...)
113         diag.errors = append(diag.errors, fmt.Sprintf(f, args...))
114 }
115
116 // Run the given func, logging appropriate messages before and after,
117 // adding timing info, etc.
118 //
119 // The id argument should be unique among tests, and shouldn't change
120 // when other tests are added/removed.
121 func (diag *diagnoser) dotest(id int, title string, fn func() error) {
122         if diag.done == nil {
123                 diag.done = map[int]bool{}
124         } else if diag.done[id] {
125                 diag.errorf("(bug) reused test id %d", id)
126         }
127         diag.done[id] = true
128
129         diag.logger.Infof("%4d: %s", id, title)
130         t0 := time.Now()
131         err := fn()
132         elapsed := fmt.Sprintf("%d ms", time.Now().Sub(t0)/time.Millisecond)
133         if err != nil {
134                 diag.errorf("%4d: %s (%s): %s", id, title, elapsed, err)
135         } else {
136                 diag.logger.Debugf("%4d: %s (%s): ok", id, title, elapsed)
137         }
138 }
139
140 func (diag *diagnoser) runtests() {
141         client := arvados.NewClientFromEnv()
142         // Disable auto-retry, use context instead
143         client.Timeout = 0
144
145         if client.APIHost == "" || client.AuthToken == "" {
146                 diag.errorf("ARVADOS_API_HOST and ARVADOS_API_TOKEN environment variables are not set -- aborting without running any tests")
147                 return
148         }
149
150         hostname, err := os.Hostname()
151         if err != nil {
152                 diag.warnf("error getting hostname: %s")
153         } else {
154                 diag.verbosef("hostname = %s", hostname)
155         }
156
157         diag.dotest(5, "running health check (same as `arvados-server check`)", func() error {
158                 ldr := config.NewLoader(&bytes.Buffer{}, ctxlog.New(&bytes.Buffer{}, "text", "info"))
159                 ldr.SetupFlags(flag.NewFlagSet("diagnostics", flag.ContinueOnError))
160                 cfg, err := ldr.Load()
161                 if err != nil {
162                         diag.infof("skipping because config could not be loaded: %s", err)
163                         return nil
164                 }
165                 cluster, err := cfg.GetCluster("")
166                 if err != nil {
167                         return err
168                 }
169                 if cluster.SystemRootToken != os.Getenv("ARVADOS_API_TOKEN") {
170                         return fmt.Errorf("diagnostics usage error: %s is readable but SystemRootToken does not match $ARVADOS_API_TOKEN (to fix, either run 'arvados-client sudo diagnostics' to load everything from config file, or set ARVADOS_CONFIG=- to load nothing from config file)", ldr.Path)
171                 }
172                 agg := &health.Aggregator{Cluster: cluster}
173                 resp := agg.ClusterHealth()
174                 for _, e := range resp.Errors {
175                         diag.errorf("health check: %s", e)
176                 }
177                 if len(resp.Errors) > 0 {
178                         diag.infof("consider running `arvados-server check -yaml` for a comprehensive report")
179                 }
180                 diag.verbosef("reported clock skew = %v", resp.ClockSkew)
181                 reported := map[string]bool{}
182                 for _, result := range resp.Checks {
183                         version := strings.SplitN(result.Metrics.Version, " (go", 2)[0]
184                         if version != "" && !reported[version] {
185                                 diag.verbosef("arvados version = %s", version)
186                                 reported[version] = true
187                         }
188                 }
189                 reported = map[string]bool{}
190                 for _, result := range resp.Checks {
191                         if result.Server != "" && !reported[result.Server] {
192                                 diag.verbosef("http frontend version = %s", result.Server)
193                                 reported[result.Server] = true
194                         }
195                 }
196                 reported = map[string]bool{}
197                 for _, result := range resp.Checks {
198                         if sha := result.ConfigSourceSHA256; sha != "" && !reported[sha] {
199                                 diag.verbosef("config file sha256 = %s", sha)
200                                 reported[sha] = true
201                         }
202                 }
203                 return nil
204         })
205
206         var dd arvados.DiscoveryDocument
207         ddpath := "discovery/v1/apis/arvados/v1/rest"
208         diag.dotest(10, fmt.Sprintf("getting discovery document from https://%s/%s", client.APIHost, ddpath), func() error {
209                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
210                 defer cancel()
211                 err := client.RequestAndDecodeContext(ctx, &dd, "GET", ddpath, nil, nil)
212                 if err != nil {
213                         return err
214                 }
215                 diag.verbosef("BlobSignatureTTL = %d", dd.BlobSignatureTTL)
216                 return nil
217         })
218
219         var cluster arvados.Cluster
220         cfgpath := "arvados/v1/config"
221         cfgOK := false
222         diag.dotest(20, fmt.Sprintf("getting exported config from https://%s/%s", client.APIHost, cfgpath), func() error {
223                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
224                 defer cancel()
225                 err := client.RequestAndDecodeContext(ctx, &cluster, "GET", cfgpath, nil, nil)
226                 if err != nil {
227                         return err
228                 }
229                 diag.verbosef("Collections.BlobSigning = %v", cluster.Collections.BlobSigning)
230                 cfgOK = true
231                 return nil
232         })
233
234         var user arvados.User
235         diag.dotest(30, "getting current user record", func() error {
236                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
237                 defer cancel()
238                 err := client.RequestAndDecodeContext(ctx, &user, "GET", "arvados/v1/users/current", nil, nil)
239                 if err != nil {
240                         return err
241                 }
242                 diag.verbosef("user uuid = %s", user.UUID)
243                 return nil
244         })
245
246         if !cfgOK {
247                 diag.errorf("cannot proceed without cluster config -- aborting without running any further tests")
248                 return
249         }
250
251         // uncomment to create some spurious errors
252         // cluster.Services.WebDAVDownload.ExternalURL.Host = "0.0.0.0:9"
253
254         // TODO: detect routing errors here, like finding wb2 at the
255         // wb1 address.
256         for i, svc := range []struct {
257                 name   string
258                 config *arvados.Service
259         }{
260                 {"Keepproxy", &cluster.Services.Keepproxy},
261                 {"WebDAV", &cluster.Services.WebDAV},
262                 {"WebDAVDownload", &cluster.Services.WebDAVDownload},
263                 {"Websocket", &cluster.Services.Websocket},
264                 {"Workbench1", &cluster.Services.Workbench1},
265                 {"Workbench2", &cluster.Services.Workbench2},
266         } {
267                 u := url.URL(svc.config.ExternalURL)
268                 diag.dotest(40+i, fmt.Sprintf("connecting to %s endpoint %s", svc.name, u.String()), func() error {
269                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
270                         defer cancel()
271                         if strings.HasPrefix(u.Scheme, "ws") {
272                                 // We can do a real websocket test elsewhere,
273                                 // but for now we'll just check the https
274                                 // connection.
275                                 u.Scheme = "http" + u.Scheme[2:]
276                         }
277                         if svc.config == &cluster.Services.WebDAV && strings.HasPrefix(u.Host, "*") {
278                                 u.Host = "d41d8cd98f00b204e9800998ecf8427e-0" + u.Host[1:]
279                         }
280                         req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
281                         if err != nil {
282                                 return err
283                         }
284                         resp, err := http.DefaultClient.Do(req)
285                         if err != nil {
286                                 return err
287                         }
288                         resp.Body.Close()
289                         return nil
290                 })
291         }
292
293         for i, url := range []string{
294                 cluster.Services.Controller.ExternalURL.String(),
295                 cluster.Services.Keepproxy.ExternalURL.String() + "d41d8cd98f00b204e9800998ecf8427e+0",
296                 cluster.Services.WebDAVDownload.ExternalURL.String(),
297         } {
298                 diag.dotest(50+i, fmt.Sprintf("checking CORS headers at %s", url), func() error {
299                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
300                         defer cancel()
301                         req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
302                         if err != nil {
303                                 return err
304                         }
305                         req.Header.Set("Origin", "https://example.com")
306                         resp, err := http.DefaultClient.Do(req)
307                         if err != nil {
308                                 return err
309                         }
310                         if hdr := resp.Header.Get("Access-Control-Allow-Origin"); hdr != "*" {
311                                 return fmt.Errorf("expected \"Access-Control-Allow-Origin: *\", got %q", hdr)
312                         }
313                         return nil
314                 })
315         }
316
317         var keeplist arvados.KeepServiceList
318         diag.dotest(60, "checking internal/external client detection", func() error {
319                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
320                 defer cancel()
321                 err := client.RequestAndDecodeContext(ctx, &keeplist, "GET", "arvados/v1/keep_services/accessible", nil, arvados.ListOptions{Limit: 999999})
322                 if err != nil {
323                         return fmt.Errorf("error getting keep services list: %s", err)
324                 } else if len(keeplist.Items) == 0 {
325                         return fmt.Errorf("controller did not return any keep services")
326                 }
327                 found := map[string]int{}
328                 for _, ks := range keeplist.Items {
329                         found[ks.ServiceType]++
330                 }
331                 isInternal := found["proxy"] == 0 && len(keeplist.Items) > 0
332                 isExternal := found["proxy"] > 0 && found["proxy"] == len(keeplist.Items)
333                 if isExternal {
334                         diag.infof("controller returned only proxy services, this host is treated as \"external\"")
335                 } else if isInternal {
336                         diag.infof("controller returned only non-proxy services, this host is treated as \"internal\"")
337                 }
338                 if (diag.checkInternal && !isInternal) || (diag.checkExternal && !isExternal) {
339                         return fmt.Errorf("expecting internal=%v external=%v, but found internal=%v external=%v", diag.checkInternal, diag.checkExternal, isInternal, isExternal)
340                 }
341                 return nil
342         })
343
344         for i, ks := range keeplist.Items {
345                 u := url.URL{
346                         Scheme: "http",
347                         Host:   net.JoinHostPort(ks.ServiceHost, fmt.Sprintf("%d", ks.ServicePort)),
348                         Path:   "/",
349                 }
350                 if ks.ServiceSSLFlag {
351                         u.Scheme = "https"
352                 }
353                 diag.dotest(61+i, fmt.Sprintf("reading+writing via keep service at %s", u.String()), func() error {
354                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
355                         defer cancel()
356                         req, err := http.NewRequestWithContext(ctx, "PUT", u.String()+"d41d8cd98f00b204e9800998ecf8427e", nil)
357                         if err != nil {
358                                 return err
359                         }
360                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
361                         resp, err := http.DefaultClient.Do(req)
362                         if err != nil {
363                                 return err
364                         }
365                         defer resp.Body.Close()
366                         body, err := ioutil.ReadAll(resp.Body)
367                         if err != nil {
368                                 return fmt.Errorf("reading response body: %s", err)
369                         }
370                         loc := strings.TrimSpace(string(body))
371                         if !strings.HasPrefix(loc, "d41d8") {
372                                 return fmt.Errorf("unexpected response from write: %q", body)
373                         }
374
375                         req, err = http.NewRequestWithContext(ctx, "GET", u.String()+loc, nil)
376                         if err != nil {
377                                 return err
378                         }
379                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
380                         resp, err = http.DefaultClient.Do(req)
381                         if err != nil {
382                                 return err
383                         }
384                         defer resp.Body.Close()
385                         body, err = ioutil.ReadAll(resp.Body)
386                         if err != nil {
387                                 return fmt.Errorf("reading response body: %s", err)
388                         }
389                         if len(body) != 0 {
390                                 return fmt.Errorf("unexpected response from read: %q", body)
391                         }
392
393                         return nil
394                 })
395         }
396
397         var project arvados.Group
398         diag.dotest(80, fmt.Sprintf("finding/creating %q project", diag.projectName), func() error {
399                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
400                 defer cancel()
401                 var grplist arvados.GroupList
402                 err := client.RequestAndDecodeContext(ctx, &grplist, "GET", "arvados/v1/groups", nil, arvados.ListOptions{
403                         Filters: []arvados.Filter{
404                                 {"name", "=", diag.projectName},
405                                 {"group_class", "=", "project"},
406                                 {"owner_uuid", "=", user.UUID}},
407                         Limit: 999999})
408                 if err != nil {
409                         return fmt.Errorf("list groups: %s", err)
410                 }
411                 if len(grplist.Items) > 0 {
412                         project = grplist.Items[0]
413                         diag.verbosef("using existing project, uuid = %s", project.UUID)
414                         return nil
415                 }
416                 diag.debugf("list groups: ok, no results")
417                 err = client.RequestAndDecodeContext(ctx, &project, "POST", "arvados/v1/groups", nil, map[string]interface{}{"group": map[string]interface{}{
418                         "name":        diag.projectName,
419                         "group_class": "project",
420                 }})
421                 if err != nil {
422                         return fmt.Errorf("create project: %s", err)
423                 }
424                 diag.verbosef("created project, uuid = %s", project.UUID)
425                 return nil
426         })
427
428         var collection arvados.Collection
429         diag.dotest(90, "creating temporary collection", func() error {
430                 if project.UUID == "" {
431                         return fmt.Errorf("skipping, no project to work in")
432                 }
433                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
434                 defer cancel()
435                 err := client.RequestAndDecodeContext(ctx, &collection, "POST", "arvados/v1/collections", nil, map[string]interface{}{
436                         "ensure_unique_name": true,
437                         "collection": map[string]interface{}{
438                                 "owner_uuid": project.UUID,
439                                 "name":       "test collection",
440                                 "trash_at":   time.Now().Add(time.Hour)}})
441                 if err != nil {
442                         return err
443                 }
444                 diag.verbosef("ok, uuid = %s", collection.UUID)
445                 return nil
446         })
447
448         if collection.UUID != "" {
449                 defer func() {
450                         diag.dotest(9990, "deleting temporary collection", func() error {
451                                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
452                                 defer cancel()
453                                 return client.RequestAndDecodeContext(ctx, nil, "DELETE", "arvados/v1/collections/"+collection.UUID, nil, nil)
454                         })
455                 }()
456         }
457
458         tempdir, err := ioutil.TempDir("", "arvados-diagnostics")
459         if err != nil {
460                 diag.errorf("error creating temp dir: %s", err)
461                 return
462         }
463         defer os.RemoveAll(tempdir)
464
465         var imageSHA2 string
466         var dockerImageData []byte
467         if diag.dockerImage != "" || diag.priority < 1 {
468                 // We won't be using the self-built docker image, so
469                 // don't build it.  But we will write the embedded
470                 // "hello-world" image to our test collection to test
471                 // upload/download, whether or not we're using it as a
472                 // docker image.
473                 dockerImageData = HelloWorldDockerImage
474
475                 if diag.priority > 0 {
476                         imageSHA2, err = getSHA2FromImageData(dockerImageData)
477                         if err != nil {
478                                 diag.errorf("internal error/bug: %s", err)
479                                 return
480                         }
481                 }
482         } else if selfbin, err := os.Readlink("/proc/self/exe"); err != nil {
483                 diag.errorf("readlink /proc/self/exe: %s", err)
484                 return
485         } else if selfbindata, err := os.ReadFile(selfbin); err != nil {
486                 diag.errorf("error reading %s: %s", selfbin, err)
487                 return
488         } else {
489                 selfbinSha := fmt.Sprintf("%x", sha256.Sum256(selfbindata))
490                 tag := "arvados-client-diagnostics:" + selfbinSha[:9]
491                 err := os.WriteFile(tempdir+"/arvados-client", selfbindata, 0777)
492                 if err != nil {
493                         diag.errorf("error writing %s: %s", tempdir+"/arvados-client", err)
494                         return
495                 }
496
497                 dockerfile := "FROM " + diag.dockerImageFrom + "\n"
498                 dockerfile += "RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends libfuse2 ca-certificates && apt-get clean\n"
499                 dockerfile += "COPY /arvados-client /arvados-client\n"
500                 cmd := exec.Command("docker", "build", "--tag", tag, "-f", "-", tempdir)
501                 cmd.Stdin = strings.NewReader(dockerfile)
502                 cmd.Stdout = diag.stderr
503                 cmd.Stderr = diag.stderr
504                 err = cmd.Run()
505                 if err != nil {
506                         diag.errorf("error building docker image: %s", err)
507                         return
508                 }
509                 checkversion, err := exec.Command("docker", "run", tag, "/arvados-client", "version").CombinedOutput()
510                 if err != nil {
511                         diag.errorf("docker image does not seem to work: %s", err)
512                         return
513                 }
514                 diag.infof("arvados-client version: %s", checkversion)
515
516                 buf, err := exec.Command("docker", "inspect", "--format={{.Id}}", tag).Output()
517                 if err != nil {
518                         diag.errorf("docker inspect --format={{.Id}} %s: %s", tag, err)
519                         return
520                 }
521                 imageSHA2 = min64HexDigits.FindString(string(buf))
522                 if len(imageSHA2) != 64 {
523                         diag.errorf("docker inspect --format={{.Id}} output %q does not seem to contain sha256 digest", buf)
524                         return
525                 }
526
527                 buf, err = exec.Command("docker", "save", tag).Output()
528                 if err != nil {
529                         diag.errorf("docker save %s: %s", tag, err)
530                         return
531                 }
532                 diag.infof("docker image size is %d", len(buf))
533                 dockerImageData = buf
534         }
535
536         tarfilename := "sha256:" + imageSHA2 + ".tar"
537
538         diag.dotest(100, "uploading file via webdav", func() error {
539                 timeout := diag.timeout
540                 if len(dockerImageData) > 10<<20 && timeout < time.Minute {
541                         // Extend the normal http timeout if we're
542                         // uploading a substantial docker image.
543                         timeout = time.Minute
544                 }
545                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(timeout))
546                 defer cancel()
547                 if collection.UUID == "" {
548                         return fmt.Errorf("skipping, no test collection")
549                 }
550                 t0 := time.Now()
551                 req, err := http.NewRequestWithContext(ctx, "PUT", cluster.Services.WebDAVDownload.ExternalURL.String()+"c="+collection.UUID+"/"+tarfilename, bytes.NewReader(dockerImageData))
552                 if err != nil {
553                         return fmt.Errorf("BUG? http.NewRequest: %s", err)
554                 }
555                 req.Header.Set("Authorization", "Bearer "+client.AuthToken)
556                 resp, err := http.DefaultClient.Do(req)
557                 if err != nil {
558                         return fmt.Errorf("error performing http request: %s", err)
559                 }
560                 resp.Body.Close()
561                 if resp.StatusCode != http.StatusCreated {
562                         return fmt.Errorf("status %s", resp.Status)
563                 }
564                 diag.verbosef("upload ok, status %s, %f MB/s", resp.Status, float64(len(dockerImageData))/time.Since(t0).Seconds()/1000000)
565                 err = client.RequestAndDecodeContext(ctx, &collection, "GET", "arvados/v1/collections/"+collection.UUID, nil, nil)
566                 if err != nil {
567                         return fmt.Errorf("get updated collection: %s", err)
568                 }
569                 diag.verbosef("upload pdh %s", collection.PortableDataHash)
570                 return nil
571         })
572
573         davurl := cluster.Services.WebDAV.ExternalURL
574         davWildcard := strings.HasPrefix(davurl.Host, "*--") || strings.HasPrefix(davurl.Host, "*.")
575         diag.dotest(110, fmt.Sprintf("checking WebDAV ExternalURL wildcard (%s)", davurl), func() error {
576                 if davurl.Host == "" {
577                         return fmt.Errorf("host missing - content previews will not work")
578                 }
579                 if !davWildcard && !cluster.Collections.TrustAllContent {
580                         diag.warnf("WebDAV ExternalURL has no leading wildcard and TrustAllContent==false - content previews will not work")
581                 }
582                 return nil
583         })
584
585         for i, trial := range []struct {
586                 needcoll     bool
587                 needWildcard bool
588                 status       int
589                 fileurl      string
590         }{
591                 {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + "foo"},
592                 {false, false, http.StatusNotFound, strings.Replace(davurl.String(), "*", "d41d8cd98f00b204e9800998ecf8427e-0", 1) + tarfilename},
593                 {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/foo"},
594                 {false, false, http.StatusNotFound, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=d41d8cd98f00b204e9800998ecf8427e+0/_/" + tarfilename},
595                 {true, true, http.StatusOK, strings.Replace(davurl.String(), "*", strings.Replace(collection.PortableDataHash, "+", "-", -1), 1) + tarfilename},
596                 {true, false, http.StatusOK, cluster.Services.WebDAVDownload.ExternalURL.String() + "c=" + collection.UUID + "/_/" + tarfilename},
597         } {
598                 diag.dotest(120+i, fmt.Sprintf("downloading from webdav (%s)", trial.fileurl), func() error {
599                         if trial.needWildcard && !davWildcard {
600                                 diag.warnf("skipping collection-id-in-vhost test because WebDAV ExternalURL has no leading wildcard")
601                                 return nil
602                         }
603                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
604                         defer cancel()
605                         if trial.needcoll && collection.UUID == "" {
606                                 return fmt.Errorf("skipping, no test collection")
607                         }
608                         req, err := http.NewRequestWithContext(ctx, "GET", trial.fileurl, nil)
609                         if err != nil {
610                                 return err
611                         }
612                         req.Header.Set("Authorization", "Bearer "+client.AuthToken)
613                         resp, err := http.DefaultClient.Do(req)
614                         if err != nil {
615                                 return err
616                         }
617                         defer resp.Body.Close()
618                         body, err := ioutil.ReadAll(resp.Body)
619                         if err != nil {
620                                 return fmt.Errorf("reading response: %s", err)
621                         }
622                         if resp.StatusCode != trial.status {
623                                 return fmt.Errorf("unexpected response status: %s", resp.Status)
624                         }
625                         if trial.status == http.StatusOK && !bytes.Equal(body, dockerImageData) {
626                                 excerpt := body
627                                 if len(excerpt) > 128 {
628                                         excerpt = append([]byte(nil), body[:128]...)
629                                         excerpt = append(excerpt, []byte("[...]")...)
630                                 }
631                                 return fmt.Errorf("unexpected response content: len %d, %q", len(body), excerpt)
632                         }
633                         return nil
634                 })
635         }
636
637         var vm arvados.VirtualMachine
638         diag.dotest(130, "getting list of virtual machines", func() error {
639                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
640                 defer cancel()
641                 var vmlist arvados.VirtualMachineList
642                 err := client.RequestAndDecodeContext(ctx, &vmlist, "GET", "arvados/v1/virtual_machines", nil, arvados.ListOptions{Limit: 999999})
643                 if err != nil {
644                         return err
645                 }
646                 if len(vmlist.Items) < 1 {
647                         diag.warnf("no VMs found")
648                 } else {
649                         vm = vmlist.Items[0]
650                 }
651                 return nil
652         })
653
654         diag.dotest(150, "connecting to webshell service", func() error {
655                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
656                 defer cancel()
657                 u := cluster.Services.WebShell.ExternalURL
658                 if u == (arvados.URL{}) {
659                         diag.infof("skipping, webshell not configured")
660                         return nil
661                 }
662                 if vm.UUID == "" {
663                         diag.warnf("skipping, no vm available")
664                         return nil
665                 }
666                 webshellurl := u.String() + vm.Hostname + "?"
667                 if strings.HasPrefix(u.Host, "*") {
668                         u.Host = vm.Hostname + u.Host[1:]
669                         webshellurl = u.String() + "?"
670                 }
671                 diag.debugf("url %s", webshellurl)
672                 req, err := http.NewRequestWithContext(ctx, "POST", webshellurl, bytes.NewBufferString(url.Values{
673                         "width":   {"80"},
674                         "height":  {"25"},
675                         "session": {"xyzzy"},
676                         "rooturl": {webshellurl},
677                 }.Encode()))
678                 if err != nil {
679                         return err
680                 }
681                 req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
682                 resp, err := http.DefaultClient.Do(req)
683                 if err != nil {
684                         return err
685                 }
686                 defer resp.Body.Close()
687                 diag.debugf("response status %s", resp.Status)
688                 body, err := ioutil.ReadAll(resp.Body)
689                 if err != nil {
690                         return fmt.Errorf("reading response: %s", err)
691                 }
692                 diag.debugf("response body %q", body)
693                 // We don't speak the protocol, so we get a 400 error
694                 // from the webshell server even if everything is
695                 // OK. Anything else (404, 502, ???) indicates a
696                 // problem.
697                 if resp.StatusCode != http.StatusBadRequest {
698                         return fmt.Errorf("unexpected response status: %s, %q", resp.Status, body)
699                 }
700                 return nil
701         })
702
703         diag.dotest(160, "running a container", func() error {
704                 if diag.priority < 1 {
705                         diag.infof("skipping (use priority > 0 if you want to run a container)")
706                         return nil
707                 }
708                 if project.UUID == "" {
709                         return fmt.Errorf("skipping, no project to work in")
710                 }
711
712                 timestamp := time.Now().Format(time.RFC3339)
713
714                 var ctrCommand []string
715                 switch diag.dockerImage {
716                 case "":
717                         if collection.UUID == "" {
718                                 return fmt.Errorf("skipping, no test collection to use as docker image")
719                         }
720                         diag.dockerImage = collection.PortableDataHash
721                         ctrCommand = []string{"/arvados-client", "diagnostics",
722                                 "-priority=0", // don't run a container
723                                 "-log-level=" + diag.logLevel,
724                                 "-internal-client=true"}
725                 case "hello-world":
726                         if collection.UUID == "" {
727                                 return fmt.Errorf("skipping, no test collection to use as docker image")
728                         }
729                         diag.dockerImage = collection.PortableDataHash
730                         ctrCommand = []string{"/hello"}
731                 default:
732                         ctrCommand = []string{"echo", timestamp}
733                 }
734
735                 var cr arvados.ContainerRequest
736                 ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
737                 defer cancel()
738
739                 err := client.RequestAndDecodeContext(ctx, &cr, "POST", "arvados/v1/container_requests", nil, map[string]interface{}{"container_request": map[string]interface{}{
740                         "owner_uuid":      project.UUID,
741                         "name":            fmt.Sprintf("diagnostics container request %s", timestamp),
742                         "container_image": diag.dockerImage,
743                         "command":         ctrCommand,
744                         "use_existing":    false,
745                         "output_path":     "/mnt/output",
746                         "output_name":     fmt.Sprintf("diagnostics output %s", timestamp),
747                         "priority":        diag.priority,
748                         "state":           arvados.ContainerRequestStateCommitted,
749                         "mounts": map[string]map[string]interface{}{
750                                 "/mnt/output": {
751                                         "kind":     "collection",
752                                         "writable": true,
753                                 },
754                         },
755                         "runtime_constraints": arvados.RuntimeConstraints{
756                                 API:          true,
757                                 VCPUs:        1,
758                                 RAM:          128 << 20,
759                                 KeepCacheRAM: 64 << 20,
760                         },
761                 }})
762                 if err != nil {
763                         return err
764                 }
765                 diag.infof("container request uuid = %s", cr.UUID)
766                 diag.verbosef("container uuid = %s", cr.ContainerUUID)
767
768                 timeout := 10 * time.Minute
769                 diag.infof("container request submitted, waiting up to %v for container to run", arvados.Duration(timeout))
770                 deadline := time.Now().Add(timeout)
771
772                 var c arvados.Container
773                 for ; cr.State != arvados.ContainerRequestStateFinal && time.Now().Before(deadline); time.Sleep(2 * time.Second) {
774                         ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(diag.timeout))
775                         defer cancel()
776
777                         crStateWas := cr.State
778                         err := client.RequestAndDecodeContext(ctx, &cr, "GET", "arvados/v1/container_requests/"+cr.UUID, nil, nil)
779                         if err != nil {
780                                 return err
781                         }
782                         if cr.State != crStateWas {
783                                 diag.debugf("container request state = %s", cr.State)
784                         }
785
786                         cStateWas := c.State
787                         err = client.RequestAndDecodeContext(ctx, &c, "GET", "arvados/v1/containers/"+cr.ContainerUUID, nil, nil)
788                         if err != nil {
789                                 return err
790                         }
791                         if c.State != cStateWas {
792                                 diag.debugf("container state = %s", c.State)
793                         }
794
795                         cancel()
796                 }
797
798                 if cr.State != arvados.ContainerRequestStateFinal {
799                         err := client.RequestAndDecodeContext(context.Background(), &cr, "PATCH", "arvados/v1/container_requests/"+cr.UUID, nil, map[string]interface{}{
800                                 "container_request": map[string]interface{}{
801                                         "priority": 0,
802                                 }})
803                         if err != nil {
804                                 diag.infof("error canceling container request %s: %s", cr.UUID, err)
805                         } else {
806                                 diag.debugf("canceled container request %s", cr.UUID)
807                         }
808                         return fmt.Errorf("timed out waiting for container to finish; container request %s state was %q, container %s state was %q", cr.UUID, cr.State, c.UUID, c.State)
809                 }
810                 if c.State != arvados.ContainerStateComplete {
811                         return fmt.Errorf("container request %s is final but container %s did not complete: container state = %q", cr.UUID, cr.ContainerUUID, c.State)
812                 }
813                 if c.ExitCode != 0 {
814                         return fmt.Errorf("container exited %d", c.ExitCode)
815                 }
816                 return nil
817         })
818 }
819
// getSHA2FromImageData reads dockerImageData as a tar archive, looks
// for an entry named exactly "manifest.json", decodes it as a JSON
// array, and returns the 64-digit lowercase hex string found in the
// Config field of the array's first element.
//
// It returns an error if the archive cannot be read, if no
// manifest.json entry is found before the end of the archive, if the
// manifest cannot be decoded or is an empty array, or if Config does
// not contain a run of exactly 64 hex digits.
//
// Errors originating in the tar reader or the JSON decoder are
// wrapped with %w, so callers can inspect the underlying cause with
// errors.Is / errors.As (e.g., tar.ErrHeader for a corrupt archive).
func getSHA2FromImageData(dockerImageData []byte) (string, error) {
	tr := tar.NewReader(bytes.NewReader(dockerImageData))
	for {
		hdr, err := tr.Next()
		if err == io.EOF {
			// Reached the end of the archive without
			// seeing a manifest.json entry.
			return "", fmt.Errorf("cannot find manifest.json in docker image tar file")
		}
		if err != nil {
			return "", fmt.Errorf("cannot read docker image tar file: %w", err)
		}
		if hdr.Name != "manifest.json" {
			continue
		}
		// Only the Config field of each manifest element is
		// needed; other fields are ignored by the decoder.
		var manifest []struct {
			Config string
		}
		err = json.NewDecoder(tr).Decode(&manifest)
		if err != nil {
			return "", fmt.Errorf("cannot read manifest.json from docker image tar file: %w", err)
		}
		if len(manifest) == 0 {
			return "", fmt.Errorf("manifest.json is empty")
		}
		// The regexp matches runs of 64 *or more* hex digits,
		// so a longer run is returned whole and rejected here
		// instead of being silently truncated to 64 digits.
		s := min64HexDigits.FindString(manifest[0].Config)
		if len(s) != 64 {
			return "", fmt.Errorf("found manifest.json but .[0].Config %q does not seem to contain sha256 digest", manifest[0].Config)
		}
		return s, nil
	}
}

// min64HexDigits matches a run of at least 64 lowercase hexadecimal
// digits. Callers that need exactly 64 digits must check the length
// of the match themselves.
var min64HexDigits = regexp.MustCompile(`[0-9a-f]{64,}`)