21700: Install Bundler system-wide in Rails postinst
[arvados.git] / lib / controller / localdb / container_gateway.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package localdb
6
7 import (
8         "bufio"
9         "bytes"
10         "context"
11         "crypto/hmac"
12         "crypto/sha256"
13         "crypto/subtle"
14         "crypto/tls"
15         "crypto/x509"
16         "errors"
17         "fmt"
18         "io"
19         "io/ioutil"
20         "net"
21         "net/http"
22         "net/http/httputil"
23         "net/url"
24         "os"
25         "strings"
26
27         "git.arvados.org/arvados.git/lib/controller/rpc"
28         "git.arvados.org/arvados.git/lib/service"
29         "git.arvados.org/arvados.git/lib/webdavfs"
30         "git.arvados.org/arvados.git/sdk/go/arvados"
31         "git.arvados.org/arvados.git/sdk/go/auth"
32         "git.arvados.org/arvados.git/sdk/go/ctxlog"
33         "git.arvados.org/arvados.git/sdk/go/httpserver"
34         keepweb "git.arvados.org/arvados.git/services/keep-web"
35         "github.com/hashicorp/yamux"
36         "golang.org/x/net/webdav"
37 )
38
var (
	// forceProxyForTest is a test hook read by findGateway: when
	// true, a tunnel registered on this process is ignored unless
	// the caller passed noForward=true, which emulates a gateway
	// whose tunnel terminates at a different controller process.
	forceProxyForTest       = false
	// forceInternalURLForTest is a test hook read by
	// ContainerGatewayTunnel: if non-nil, it is reported in the
	// X-Arvados-Internal-Url response header when the request
	// context does not carry a service URL.
	forceInternalURLForTest *arvados.URL
)
43
44 // ContainerRequestLog returns a WebDAV handler that reads logs from
45 // the indicated container request. It works by proxying the incoming
46 // HTTP request to
47 //
48 //   - the container gateway, if there is an associated container that
49 //     is running
50 //
51 //   - a different controller process, if there is a running container
52 //     whose gateway is accessible through a tunnel to a different
53 //     controller process
54 //
55 //   - keep-web, if saved logs exist and there is no gateway (or the
56 //     associated container is finished)
57 //
58 //   - an empty-collection stub, if there is no gateway and no saved
59 //     log
60 //
61 // For an incoming request
62 //
63 //      GET /arvados/v1/container_requests/{cr_uuid}/log/{c_uuid}{/c_log_path}
64 //
65 // The upstream request may be to {c_uuid}'s container gateway
66 //
67 //      GET /arvados/v1/container_requests/{cr_uuid}/log/{c_uuid}{/c_log_path}
68 //      X-Webdav-Prefix: /arvados/v1/container_requests/{cr_uuid}/log/{c_uuid}
69 //      X-Webdav-Source: /log
70 //
71 // ...or the upstream request may be to keep-web (where {cr_log_uuid}
72 // is the container request log collection UUID)
73 //
74 //      GET /arvados/v1/container_requests/{cr_uuid}/log/{c_uuid}{/c_log_path}
75 //      Host: {cr_log_uuid}.internal
76 //      X-Webdav-Prefix: /arvados/v1/container_requests/{cr_uuid}/log
77 //      X-Arvados-Container-Uuid: {c_uuid}
78 //
79 // ...or the request may be handled locally using an empty-collection
80 // stub.
81 func (conn *Conn) ContainerRequestLog(ctx context.Context, opts arvados.ContainerLogOptions) (http.Handler, error) {
82         if opts.Method == "OPTIONS" && opts.Header.Get("Access-Control-Request-Method") != "" {
83                 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
84                         if !keepweb.ServeCORSPreflight(w, opts.Header) {
85                                 // Inconceivable.  We already checked
86                                 // for the only condition where
87                                 // ServeCORSPreflight returns false.
88                                 httpserver.Error(w, "unhandled CORS preflight request", http.StatusInternalServerError)
89                         }
90                 }), nil
91         }
92         cr, err := conn.railsProxy.ContainerRequestGet(ctx, arvados.GetOptions{UUID: opts.UUID, Select: []string{"uuid", "container_uuid", "log_uuid"}})
93         if err != nil {
94                 if se := httpserver.HTTPStatusError(nil); errors.As(err, &se) && se.HTTPStatus() == http.StatusUnauthorized {
95                         // Hint to WebDAV client that we accept HTTP basic auth.
96                         return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
97                                 w.Header().Set("Www-Authenticate", "Basic realm=\"collections\"")
98                                 w.WriteHeader(http.StatusUnauthorized)
99                         }), nil
100                 }
101                 return nil, err
102         }
103         ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{UUID: cr.ContainerUUID, Select: []string{"uuid", "state", "gateway_address"}})
104         if err != nil {
105                 return nil, err
106         }
107         // .../log/{ctr.UUID} is a directory where the currently
108         // assigned container's log data [will] appear (as opposed to
109         // previous attempts in .../log/{previous_ctr_uuid}). Requests
110         // that are outside that directory, and requests on a
111         // non-running container, are proxied to keep-web instead of
112         // going through the container gateway system.
113         //
114         // Side note: a depth>1 directory tree listing starting at
115         // .../{cr_uuid}/log will only include subdirectories for
116         // finished containers, i.e., will not include a subdirectory
117         // with log data for a current (unfinished) container UUID.
118         // In order to access live logs, a client must look up the
119         // container_uuid field of the container request record, and
120         // explicitly request a path under .../{cr_uuid}/log/{c_uuid}.
121         if ctr.GatewayAddress == "" ||
122                 (ctr.State != arvados.ContainerStateLocked && ctr.State != arvados.ContainerStateRunning) ||
123                 !(opts.Path == "/"+ctr.UUID || strings.HasPrefix(opts.Path, "/"+ctr.UUID+"/")) {
124                 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
125                         conn.serveContainerRequestLogViaKeepWeb(opts, cr, w, r)
126                 }), nil
127         }
128         dial, arpc, err := conn.findGateway(ctx, ctr, opts.NoForward)
129         if err != nil {
130                 return nil, err
131         }
132         if arpc != nil {
133                 opts.NoForward = true
134                 return arpc.ContainerRequestLog(ctx, opts)
135         }
136         return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
137                 r = r.WithContext(ctx)
138                 var proxyReq *http.Request
139                 var proxyErr error
140                 var expectRespondAuth string
141                 proxy := &httputil.ReverseProxy{
142                         // Our custom Transport:
143                         //
144                         // - Uses a custom dialer to connect to the
145                         // gateway (either directly or through a
146                         // tunnel set up though ContainerTunnel)
147                         //
148                         // - Verifies the gateway's TLS certificate
149                         // using X-Arvados-Authorization headers.
150                         //
151                         // This involves modifying the outgoing
152                         // request header in DialTLSContext.
153                         // (ReverseProxy certainly doesn't expect us
154                         // to do this, but it works.)
155                         Transport: &http.Transport{
156                                 DialTLSContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
157                                         tlsconn, requestAuth, respondAuth, err := dial()
158                                         if err != nil {
159                                                 return nil, err
160                                         }
161                                         proxyReq.Header.Set("X-Arvados-Authorization", requestAuth)
162                                         expectRespondAuth = respondAuth
163                                         return tlsconn, nil
164                                 },
165                         },
166                         Director: func(r *http.Request) {
167                                 // Scheme/host of incoming r.URL are
168                                 // irrelevant now, and may even be
169                                 // missing. Host is ignored by our
170                                 // DialTLSContext, but we need a
171                                 // generic syntactically correct URL
172                                 // for net/http to work with.
173                                 r.URL.Scheme = "https"
174                                 r.URL.Host = "0.0.0.0:0"
175                                 r.Header.Set("X-Arvados-Container-Gateway-Uuid", ctr.UUID)
176                                 r.Header.Set("X-Webdav-Prefix", "/arvados/v1/container_requests/"+cr.UUID+"/log/"+ctr.UUID)
177                                 r.Header.Set("X-Webdav-Source", "/log")
178                                 proxyReq = r
179                         },
180                         ModifyResponse: func(resp *http.Response) error {
181                                 if resp.Header.Get("X-Arvados-Authorization-Response") != expectRespondAuth {
182                                         // Note this is how we detect
183                                         // an attacker-in-the-middle.
184                                         return httpserver.ErrorWithStatus(errors.New("bad X-Arvados-Authorization-Response header"), http.StatusBadGateway)
185                                 }
186                                 resp.Header.Del("X-Arvados-Authorization-Response")
187                                 preemptivelyDeduplicateHeaders(w.Header(), resp.Header)
188                                 return nil
189                         },
190                         ErrorHandler: func(w http.ResponseWriter, r *http.Request, err error) {
191                                 proxyErr = err
192                         },
193                 }
194                 proxy.ServeHTTP(w, r)
195                 if proxyErr == nil {
196                         // proxy succeeded
197                         return
198                 }
199                 // If proxying to the container gateway fails, it
200                 // might be caused by a race where crunch-run exited
201                 // after we decided (above) the log was not final.
202                 // In that case we should proxy to keep-web.
203                 ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{
204                         UUID:   ctr.UUID,
205                         Select: []string{"uuid", "state", "gateway_address", "log"},
206                 })
207                 if err != nil {
208                         // Lost access to the container record?
209                         httpserver.Error(w, "error re-fetching container record: "+err.Error(), http.StatusServiceUnavailable)
210                 } else if ctr.State == arvados.ContainerStateLocked || ctr.State == arvados.ContainerStateRunning {
211                         // No race, proxyErr was the best we can do
212                         httpserver.Error(w, "proxy error: "+proxyErr.Error(), http.StatusServiceUnavailable)
213                 } else {
214                         conn.serveContainerRequestLogViaKeepWeb(opts, cr, w, r)
215                 }
216         }), nil
217 }
218
219 // serveContainerLogViaKeepWeb handles a request for saved container
220 // log content by proxying to one of the configured keep-web servers.
221 //
222 // It tries to choose a keep-web server that is running on this host.
223 func (conn *Conn) serveContainerRequestLogViaKeepWeb(opts arvados.ContainerLogOptions, cr arvados.ContainerRequest, w http.ResponseWriter, r *http.Request) {
224         if cr.LogUUID == "" {
225                 // Special case: if no log data exists yet, we serve
226                 // an empty collection by ourselves instead of
227                 // proxying to keep-web.
228                 conn.serveEmptyDir("/arvados/v1/container_requests/"+cr.UUID+"/log", w, r)
229                 return
230         }
231         myURL, _ := service.URLFromContext(r.Context())
232         u := url.URL(myURL)
233         myHostname := u.Hostname()
234         var webdavBase arvados.URL
235         var ok bool
236         for webdavBase = range conn.cluster.Services.WebDAV.InternalURLs {
237                 ok = true
238                 u := url.URL(webdavBase)
239                 if h := u.Hostname(); h == "127.0.0.1" || h == "0.0.0.0" || h == "::1" || h == myHostname {
240                         // Prefer a keep-web service running on the
241                         // same host as us. (If we don't find one, we
242                         // pick one arbitrarily.)
243                         break
244                 }
245         }
246         if !ok {
247                 httpserver.Error(w, "no internalURLs configured for WebDAV service", http.StatusInternalServerError)
248                 return
249         }
250         proxy := &httputil.ReverseProxy{
251                 Director: func(r *http.Request) {
252                         r.URL.Scheme = webdavBase.Scheme
253                         r.URL.Host = webdavBase.Host
254                         // Outgoing Host header specifies the
255                         // collection ID.
256                         r.Host = cr.LogUUID + ".internal"
257                         // We already checked permission on the
258                         // container, so we can use a root token here
259                         // instead of counting on the "access to log
260                         // via container request and container"
261                         // permission check, which can be racy when a
262                         // request gets retried with a new container.
263                         r.Header.Set("Authorization", "Bearer "+conn.cluster.SystemRootToken)
264                         // We can't change r.URL.Path without
265                         // confusing WebDAV (request body and response
266                         // headers refer to the same paths) so we tell
267                         // keep-web to map the log collection onto the
268                         // containers/X/log/ namespace.
269                         r.Header.Set("X-Webdav-Prefix", "/arvados/v1/container_requests/"+cr.UUID+"/log")
270                         if len(opts.Path) >= 28 && opts.Path[6:13] == "-dz642-" {
271                                 // "/arvados/v1/container_requests/{crUUID}/log/{cUUID}..."
272                                 // proxies to
273                                 // "/log for container {cUUID}..."
274                                 r.Header.Set("X-Webdav-Prefix", "/arvados/v1/container_requests/"+cr.UUID+"/log/"+opts.Path[1:28])
275                                 r.Header.Set("X-Webdav-Source", "/log for container "+opts.Path[1:28]+"/")
276                         }
277                 },
278                 ModifyResponse: func(resp *http.Response) error {
279                         preemptivelyDeduplicateHeaders(w.Header(), resp.Header)
280                         return nil
281                 },
282         }
283         if conn.cluster.TLS.Insecure {
284                 proxy.Transport = &http.Transport{
285                         TLSClientConfig: &tls.Config{
286                                 InsecureSkipVerify: conn.cluster.TLS.Insecure,
287                         },
288                 }
289         }
290         proxy.ServeHTTP(w, r)
291 }
292
// preemptivelyDeduplicateHeaders removes from dst every header that
// is present in src.
//
// It exists because httputil.ReverseProxy copies upstream response
// headers to the downstream ResponseWriter with (http.Header)Add().
// If a header was already set downstream -- for example a CORS
// header that the upstream also sets -- the client would receive it
// twice:
//
//	Access-Control-Allow-Origin: *
//	Access-Control-Allow-Origin: *
//
// ...which is incorrect. Calling this function from a ModifyResponse
// hook clears the conflicting downstream entries first, so the
// subsequent Add() leaves only the upstream values.
//
// dst is the downstream ResponseWriter's Header(). src is the
// upstream resp.Header, which is not modified.
func preemptivelyDeduplicateHeaders(dst, src http.Header) {
	for name := range src {
		// Header maps are keyed by canonical name.
		delete(dst, http.CanonicalHeaderKey(name))
	}
}
317
318 // serveEmptyDir handles read-only webdav requests as if there was an
319 // empty collection rooted at the given path. It's equivalent to
320 // proxying to an empty collection in keep-web, but avoids the extra
321 // hop.
322 func (conn *Conn) serveEmptyDir(path string, w http.ResponseWriter, r *http.Request) {
323         wh := webdav.Handler{
324                 Prefix:     path,
325                 FileSystem: webdav.NewMemFS(),
326                 LockSystem: webdavfs.NoLockSystem,
327                 Logger: func(r *http.Request, err error) {
328                         if err != nil && !os.IsNotExist(err) {
329                                 ctxlog.FromContext(r.Context()).WithError(err).Info("webdav error on empty collection fs")
330                         }
331                 },
332         }
333         wh.ServeHTTP(w, r)
334 }
335
336 // ContainerSSH returns a connection to the SSH server in the
337 // appropriate crunch-run process on the worker node where the
338 // specified container is running.
339 //
340 // If the returned error is nil, the caller is responsible for closing
341 // sshconn.Conn.
342 func (conn *Conn) ContainerSSH(ctx context.Context, opts arvados.ContainerSSHOptions) (sshconn arvados.ConnectionResponse, err error) {
343         user, err := conn.railsProxy.UserGetCurrent(ctx, arvados.GetOptions{})
344         if err != nil {
345                 return sshconn, err
346         }
347         ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{UUID: opts.UUID, Select: []string{"uuid", "state", "gateway_address", "interactive_session_started"}})
348         if err != nil {
349                 return sshconn, err
350         }
351         ctxRoot := auth.NewContext(ctx, &auth.Credentials{Tokens: []string{conn.cluster.SystemRootToken}})
352         if !user.IsAdmin || !conn.cluster.Containers.ShellAccess.Admin {
353                 if !conn.cluster.Containers.ShellAccess.User {
354                         return sshconn, httpserver.ErrorWithStatus(errors.New("shell access is disabled in config"), http.StatusServiceUnavailable)
355                 }
356                 crs, err := conn.railsProxy.ContainerRequestList(ctxRoot, arvados.ListOptions{Limit: -1, Filters: []arvados.Filter{{"container_uuid", "=", opts.UUID}}})
357                 if err != nil {
358                         return sshconn, err
359                 }
360                 for _, cr := range crs.Items {
361                         if cr.ModifiedByUserUUID != user.UUID {
362                                 return sshconn, httpserver.ErrorWithStatus(errors.New("permission denied: container is associated with requests submitted by other users"), http.StatusForbidden)
363                         }
364                 }
365                 if crs.ItemsAvailable != len(crs.Items) {
366                         return sshconn, httpserver.ErrorWithStatus(errors.New("incomplete response while checking permission"), http.StatusInternalServerError)
367                 }
368         }
369
370         if ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked {
371                 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("container is not running yet (state is %q)", ctr.State), http.StatusServiceUnavailable)
372         } else if ctr.State != arvados.ContainerStateRunning {
373                 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("container has ended (state is %q)", ctr.State), http.StatusGone)
374         }
375
376         dial, arpc, err := conn.findGateway(ctx, ctr, opts.NoForward)
377         if err != nil {
378                 return sshconn, err
379         }
380         if arpc != nil {
381                 opts.NoForward = true
382                 return arpc.ContainerSSH(ctx, opts)
383         }
384
385         tlsconn, requestAuth, respondAuth, err := dial()
386         if err != nil {
387                 return sshconn, err
388         }
389         bufr := bufio.NewReader(tlsconn)
390         bufw := bufio.NewWriter(tlsconn)
391
392         u := url.URL{
393                 Scheme: "http",
394                 Host:   tlsconn.RemoteAddr().String(),
395                 Path:   "/ssh",
396         }
397         postform := url.Values{
398                 // uuid is only needed for older crunch-run versions
399                 // (current version uses X-Arvados-* header below)
400                 "uuid":           {opts.UUID},
401                 "detach_keys":    {opts.DetachKeys},
402                 "login_username": {opts.LoginUsername},
403                 "no_forward":     {fmt.Sprintf("%v", opts.NoForward)},
404         }
405         postdata := postform.Encode()
406         bufw.WriteString("POST " + u.String() + " HTTP/1.1\r\n")
407         bufw.WriteString("Host: " + u.Host + "\r\n")
408         bufw.WriteString("Upgrade: ssh\r\n")
409         bufw.WriteString("X-Arvados-Container-Gateway-Uuid: " + opts.UUID + "\r\n")
410         bufw.WriteString("X-Arvados-Authorization: " + requestAuth + "\r\n")
411         bufw.WriteString("Content-Type: application/x-www-form-urlencoded\r\n")
412         fmt.Fprintf(bufw, "Content-Length: %d\r\n", len(postdata))
413         bufw.WriteString("\r\n")
414         bufw.WriteString(postdata)
415         bufw.Flush()
416         resp, err := http.ReadResponse(bufr, &http.Request{Method: "POST"})
417         if err != nil {
418                 tlsconn.Close()
419                 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("error reading http response from gateway: %w", err), http.StatusBadGateway)
420         }
421         defer resp.Body.Close()
422         if resp.StatusCode != http.StatusSwitchingProtocols {
423                 body, _ := ioutil.ReadAll(io.LimitReader(resp.Body, 1000))
424                 tlsconn.Close()
425                 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("unexpected status %s %q", resp.Status, body), http.StatusBadGateway)
426         }
427         if strings.ToLower(resp.Header.Get("Upgrade")) != "ssh" ||
428                 strings.ToLower(resp.Header.Get("Connection")) != "upgrade" {
429                 tlsconn.Close()
430                 return sshconn, httpserver.ErrorWithStatus(errors.New("bad upgrade"), http.StatusBadGateway)
431         }
432         if resp.Header.Get("X-Arvados-Authorization-Response") != respondAuth {
433                 tlsconn.Close()
434                 return sshconn, httpserver.ErrorWithStatus(errors.New("bad X-Arvados-Authorization-Response header"), http.StatusBadGateway)
435         }
436
437         if !ctr.InteractiveSessionStarted {
438                 _, err = conn.railsProxy.ContainerUpdate(ctxRoot, arvados.UpdateOptions{
439                         UUID: opts.UUID,
440                         Attrs: map[string]interface{}{
441                                 "interactive_session_started": true,
442                         },
443                 })
444                 if err != nil {
445                         tlsconn.Close()
446                         return sshconn, httpserver.ErrorWithStatus(err, http.StatusInternalServerError)
447                 }
448         }
449
450         sshconn.Conn = tlsconn
451         sshconn.Bufrw = &bufio.ReadWriter{Reader: bufr, Writer: bufw}
452         sshconn.Logger = ctxlog.FromContext(ctx)
453         sshconn.Header = http.Header{"Upgrade": {"ssh"}}
454         return sshconn, nil
455 }
456
457 // ContainerGatewayTunnel sets up a tunnel enabling us (controller) to
458 // connect to the caller's (crunch-run's) gateway server.
459 func (conn *Conn) ContainerGatewayTunnel(ctx context.Context, opts arvados.ContainerGatewayTunnelOptions) (resp arvados.ConnectionResponse, err error) {
460         h := hmac.New(sha256.New, []byte(conn.cluster.SystemRootToken))
461         fmt.Fprint(h, opts.UUID)
462         authSecret := fmt.Sprintf("%x", h.Sum(nil))
463         if subtle.ConstantTimeCompare([]byte(authSecret), []byte(opts.AuthSecret)) != 1 {
464                 ctxlog.FromContext(ctx).Info("received incorrect auth_secret")
465                 return resp, httpserver.ErrorWithStatus(errors.New("authentication error"), http.StatusUnauthorized)
466         }
467
468         muxconn, clientconn := net.Pipe()
469         tunnel, err := yamux.Server(muxconn, nil)
470         if err != nil {
471                 clientconn.Close()
472                 return resp, httpserver.ErrorWithStatus(err, http.StatusInternalServerError)
473         }
474
475         conn.gwTunnelsLock.Lock()
476         if conn.gwTunnels == nil {
477                 conn.gwTunnels = map[string]*yamux.Session{opts.UUID: tunnel}
478         } else {
479                 conn.gwTunnels[opts.UUID] = tunnel
480         }
481         conn.gwTunnelsLock.Unlock()
482
483         go func() {
484                 <-tunnel.CloseChan()
485                 conn.gwTunnelsLock.Lock()
486                 if conn.gwTunnels[opts.UUID] == tunnel {
487                         delete(conn.gwTunnels, opts.UUID)
488                 }
489                 conn.gwTunnelsLock.Unlock()
490         }()
491
492         // Assuming we're acting as the backend of an http server,
493         // lib/controller/router will call resp's ServeHTTP handler,
494         // which upgrades the incoming http connection to a raw socket
495         // and connects it to our yamux.Server through our net.Pipe().
496         resp.Conn = clientconn
497         resp.Bufrw = &bufio.ReadWriter{Reader: bufio.NewReader(&bytes.Buffer{}), Writer: bufio.NewWriter(&bytes.Buffer{})}
498         resp.Logger = ctxlog.FromContext(ctx)
499         resp.Header = http.Header{"Upgrade": {"tunnel"}}
500         if u, ok := service.URLFromContext(ctx); ok {
501                 resp.Header.Set("X-Arvados-Internal-Url", u.String())
502         } else if forceInternalURLForTest != nil {
503                 resp.Header.Set("X-Arvados-Internal-Url", forceInternalURLForTest.String())
504         }
505         return
506 }
507
// gatewayDialer opens a new authenticated connection to a container
// gateway. On success it returns the connection, the token to send to
// the gateway in the X-Arvados-Authorization request header
// (requestAuth), and the token the gateway is expected to return in
// the X-Arvados-Authorization-Response header (respondAuth).
type gatewayDialer func() (conn net.Conn, requestAuth, respondAuth string, err error)
509
510 // findGateway figures out how to connect to ctr's gateway.
511 //
512 // If the gateway can be contacted directly or through a tunnel on
513 // this instance, the first return value is a non-nil dialer.
514 //
515 // If the gateway is only accessible through a tunnel through a
516 // different controller process, the second return value is a non-nil
517 // *rpc.Conn for that controller.
518 func (conn *Conn) findGateway(ctx context.Context, ctr arvados.Container, noForward bool) (gatewayDialer, *rpc.Conn, error) {
519         conn.gwTunnelsLock.Lock()
520         tunnel := conn.gwTunnels[ctr.UUID]
521         conn.gwTunnelsLock.Unlock()
522
523         myURL, _ := service.URLFromContext(ctx)
524
525         if host, _, splitErr := net.SplitHostPort(ctr.GatewayAddress); splitErr == nil && host != "" && host != "127.0.0.1" {
526                 // If crunch-run provided a GatewayAddress like
527                 // "ipaddr:port", that means "ipaddr" is one of the
528                 // external interfaces where the gateway is
529                 // listening. In that case, it's the most
530                 // reliable/direct option, so we use it even if a
531                 // tunnel might also be available.
532                 return func() (net.Conn, string, string, error) {
533                         rawconn, err := (&net.Dialer{}).DialContext(ctx, "tcp", ctr.GatewayAddress)
534                         if err != nil {
535                                 return nil, "", "", httpserver.ErrorWithStatus(err, http.StatusServiceUnavailable)
536                         }
537                         return conn.dialGatewayTLS(ctx, ctr, rawconn)
538                 }, nil, nil
539         }
540         if tunnel != nil && !(forceProxyForTest && !noForward) {
541                 // If we can't connect directly, and the gateway has
542                 // established a yamux tunnel with us, connect through
543                 // the tunnel.
544                 //
545                 // ...except: forceProxyForTest means we are emulating
546                 // a situation where the gateway has established a
547                 // yamux tunnel with controller B, and the
548                 // ContainerSSH request arrives at controller A. If
549                 // noForward==false then we are acting as A, so
550                 // we pretend not to have a tunnel, and fall through
551                 // to the "tunurl" case below. If noForward==true
552                 // then the client is A and we are acting as B, so we
553                 // connect to our tunnel.
554                 return func() (net.Conn, string, string, error) {
555                         rawconn, err := tunnel.Open()
556                         if err != nil {
557                                 return nil, "", "", httpserver.ErrorWithStatus(err, http.StatusServiceUnavailable)
558                         }
559                         return conn.dialGatewayTLS(ctx, ctr, rawconn)
560                 }, nil, nil
561         }
562         if tunurl := strings.TrimPrefix(ctr.GatewayAddress, "tunnel "); tunurl != ctr.GatewayAddress &&
563                 tunurl != "" &&
564                 tunurl != myURL.String() &&
565                 !noForward {
566                 // If crunch-run provided a GatewayAddress like
567                 // "tunnel https://10.0.0.10:1010/", that means the
568                 // gateway has established a yamux tunnel with the
569                 // controller process at the indicated InternalURL
570                 // (which isn't us, otherwise we would have had
571                 // "tunnel != nil" above). We need to proxy through to
572                 // the other controller process in order to use the
573                 // tunnel.
574                 for u := range conn.cluster.Services.Controller.InternalURLs {
575                         if u.String() == tunurl {
576                                 ctxlog.FromContext(ctx).Debugf("connecting to container gateway through other controller at %s", u)
577                                 u := url.URL(u)
578                                 return nil, rpc.NewConn(conn.cluster.ClusterID, &u, conn.cluster.TLS.Insecure, rpc.PassthroughTokenProvider), nil
579                         }
580                 }
581                 ctxlog.FromContext(ctx).Warnf("container gateway provided a tunnel endpoint %s that is not one of Services.Controller.InternalURLs", tunurl)
582                 return nil, nil, httpserver.ErrorWithStatus(errors.New("container gateway is running but tunnel endpoint is invalid"), http.StatusServiceUnavailable)
583         }
584         if ctr.GatewayAddress == "" {
585                 return nil, nil, httpserver.ErrorWithStatus(errors.New("container is running but gateway is not available"), http.StatusServiceUnavailable)
586         } else {
587                 return nil, nil, httpserver.ErrorWithStatus(errors.New("container is running but tunnel is down"), http.StatusServiceUnavailable)
588         }
589 }
590
591 // dialGatewayTLS negotiates a TLS connection to a container gateway
592 // over the given raw connection.
593 func (conn *Conn) dialGatewayTLS(ctx context.Context, ctr arvados.Container, rawconn net.Conn) (*tls.Conn, string, string, error) {
594         // crunch-run uses a self-signed / unverifiable TLS
595         // certificate, so we use the following scheme to ensure we're
596         // not talking to an attacker-in-the-middle.
597         //
598         // 1. Compute ctrKey = HMAC-SHA256(sysRootToken,ctrUUID) --
599         // this will be the same ctrKey that a-d-c supplied to
600         // crunch-run in the GatewayAuthSecret env var.
601         //
602         // 2. Compute requestAuth = HMAC-SHA256(ctrKey,serverCert) and
603         // send it to crunch-run as the X-Arvados-Authorization
604         // header, proving that we know ctrKey. (Note a MITM cannot
605         // replay the proof to a real crunch-run server, because the
606         // real crunch-run server would have a different cert.)
607         //
608         // 3. Compute respondAuth = HMAC-SHA256(ctrKey,requestAuth)
609         // and ensure the server returns it in the
610         // X-Arvados-Authorization-Response header, proving that the
611         // server knows ctrKey.
612         var requestAuth, respondAuth string
613         tlsconn := tls.Client(rawconn, &tls.Config{
614                 InsecureSkipVerify: true,
615                 VerifyPeerCertificate: func(rawCerts [][]byte, verifiedChains [][]*x509.Certificate) error {
616                         if len(rawCerts) == 0 {
617                                 return errors.New("no certificate received, cannot compute authorization header")
618                         }
619                         h := hmac.New(sha256.New, []byte(conn.cluster.SystemRootToken))
620                         fmt.Fprint(h, ctr.UUID)
621                         authKey := fmt.Sprintf("%x", h.Sum(nil))
622                         h = hmac.New(sha256.New, []byte(authKey))
623                         h.Write(rawCerts[0])
624                         requestAuth = fmt.Sprintf("%x", h.Sum(nil))
625                         h.Reset()
626                         h.Write([]byte(requestAuth))
627                         respondAuth = fmt.Sprintf("%x", h.Sum(nil))
628                         return nil
629                 },
630         })
631         err := tlsconn.HandshakeContext(ctx)
632         if err != nil {
633                 return nil, "", "", httpserver.ErrorWithStatus(fmt.Errorf("TLS handshake failed: %w", err), http.StatusBadGateway)
634         }
635         if respondAuth == "" {
636                 tlsconn.Close()
637                 return nil, "", "", httpserver.ErrorWithStatus(errors.New("BUG: no respondAuth"), http.StatusInternalServerError)
638         }
639         return tlsconn, requestAuth, respondAuth, nil
640 }