1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
27 "git.arvados.org/arvados.git/lib/controller/rpc"
28 "git.arvados.org/arvados.git/lib/service"
29 "git.arvados.org/arvados.git/lib/webdavfs"
30 "git.arvados.org/arvados.git/sdk/go/arvados"
31 "git.arvados.org/arvados.git/sdk/go/auth"
32 "git.arvados.org/arvados.git/sdk/go/ctxlog"
33 "git.arvados.org/arvados.git/sdk/go/httpserver"
34 "github.com/hashicorp/yamux"
35 "golang.org/x/net/webdav"
// forceProxyForTest is a test hook. When true, findGateway skips a
// locally registered tunnel unless noForward is set, so that the
// "forward to another controller" path can be exercised.
39 forceProxyForTest = false
// forceInternalURLForTest is a test hook. When non-nil,
// ContainerGatewayTunnel reports it in the X-Arvados-Internal-Url
// response header if the request context carries no service URL.
40 forceInternalURLForTest *arvados.URL
43 // ContainerRequestLog returns a WebDAV handler that reads logs from
44 // the indicated container request. It works by proxying the incoming
47 // - the container gateway, if there is an associated container that
50 // - a different controller process, if there is a running container
51 // whose gateway is accessible through a tunnel to a different
54 // - keep-web, if saved logs exist and there is no gateway (or the
55 // associated container is finished)
57 // - an empty-collection stub, if there is no gateway and no saved
60 // For an incoming request
62 // GET /arvados/v1/container_requests/{cr_uuid}/log/{c_uuid}{/c_log_path}
64 // The upstream request may be to {c_uuid}'s container gateway
66 // GET /arvados/v1/container_requests/{cr_uuid}/log/{c_uuid}{/c_log_path}
67 // X-Webdav-Prefix: /arvados/v1/container_requests/{cr_uuid}/log/{c_uuid}
68 // X-Webdav-Source: /log
70 // ...or the upstream request may be to keep-web (where {cr_log_uuid}
71 // is the container request log collection UUID)
73 // GET /arvados/v1/container_requests/{cr_uuid}/log/{c_uuid}{/c_log_path}
74 // Host: {cr_log_uuid}.internal
75 // X-Webdav-Prefix: /arvados/v1/container_requests/{cr_uuid}/log
76 // X-Arvados-Container-Uuid: {c_uuid}
78 // ...or the request may be handled locally using an empty-collection
80 func (conn *Conn) ContainerRequestLog(ctx context.Context, opts arvados.ContainerLogOptions) (http.Handler, error) {
// Fetch only the container request fields that are used below.
81 cr, err := conn.railsProxy.ContainerRequestGet(ctx, arvados.GetOptions{UUID: opts.UUID, Select: []string{"uuid", "container_uuid", "log_uuid"}})
// A 401 from the lookup is converted into a handler that answers
// 401 with a Www-Authenticate header.
83 if se := httpserver.HTTPStatusError(nil); errors.As(err, &se) && se.HTTPStatus() == http.StatusUnauthorized {
84 // Hint to WebDAV client that we accept HTTP basic auth.
85 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
86 w.Header().Set("Www-Authenticate", "Basic realm=\"collections\"")
87 w.WriteHeader(http.StatusUnauthorized)
// Look up the container currently assigned to the request; its
// state and gateway address decide where the request is served.
92 ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{UUID: cr.ContainerUUID, Select: []string{"uuid", "state", "gateway_address"}})
96 // .../log/{ctr.UUID} is a directory where the currently
97 // assigned container's log data [will] appear (as opposed to
98 // previous attempts in .../log/{previous_ctr_uuid}). Requests
99 // that are outside that directory, and requests on a
100 // non-running container, are proxied to keep-web instead of
101 // going through the container gateway system.
103 // Side note: a depth>1 directory tree listing starting at
104 // .../{cr_uuid}/log will only include subdirectories for
105 // finished containers, i.e., will not include a subdirectory
106 // with log data for a current (unfinished) container UUID.
107 // In order to access live logs, a client must look up the
108 // container_uuid field of the container request record, and
109 // explicitly request a path under .../{cr_uuid}/log/{c_uuid}.
// Use keep-web when any of these holds: no gateway address, state
// is neither Locked nor Running, or the path is not "/{ctr.UUID}"
// or something below it.
110 if ctr.GatewayAddress == "" ||
111 (ctr.State != arvados.ContainerStateLocked && ctr.State != arvados.ContainerStateRunning) ||
112 !(opts.Path == "/"+ctr.UUID || strings.HasPrefix(opts.Path, "/"+ctr.UUID+"/")) {
113 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
114 conn.serveContainerRequestLogViaKeepWeb(opts, cr, w, r)
// findGateway returns either a dialer (dial) for a gateway we can
// reach ourselves, or an rpc connection (arpc) to another
// controller.
117 dial, arpc, err := conn.findGateway(ctx, ctr, opts.NoForward)
// NOTE(review): the condition guarding the next two lines is not
// visible here; presumably they run only when arpc is non-nil.
// NoForward is set on the options passed to the other controller.
122 opts.NoForward = true
123 return arpc.ContainerRequestLog(ctx, opts)
125 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Serve the request under ctx (the context given to
// ContainerRequestLog) rather than the request's own context.
126 r = r.WithContext(ctx)
// NOTE(review): proxyReq is written to in DialTLSContext below, but
// its assignment is not visible here; presumably Director stores
// the outgoing request in it -- confirm.
127 var proxyReq *http.Request
// expectRespondAuth is set in DialTLSContext and compared with the
// gateway's response header in ModifyResponse.
129 var expectRespondAuth string
130 proxy := &httputil.ReverseProxy{
131 // Our custom Transport:
133 // - Uses a custom dialer to connect to the
134 // gateway (either directly or through a
135 // tunnel set up through ContainerTunnel)
137 // - Verifies the gateway's TLS certificate
138 // using X-Arvados-Authorization headers.
140 // This involves modifying the outgoing
141 // request header in DialTLSContext.
142 // (ReverseProxy certainly doesn't expect us
143 // to do this, but it works.)
144 Transport: &http.Transport{
145 DialTLSContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
// network and addr are not passed to dial(); the connection comes
// from the dialer returned by findGateway.
146 tlsconn, requestAuth, respondAuth, err := dial()
150 proxyReq.Header.Set("X-Arvados-Authorization", requestAuth)
151 expectRespondAuth = respondAuth
155 Director: func(r *http.Request) {
156 // Scheme/host of incoming r.URL are
157 // irrelevant now, and may even be
158 // missing. Host is ignored by our
159 // DialTLSContext, but we need a
160 // generic syntactically correct URL
161 // for net/http to work with.
162 r.URL.Scheme = "https"
163 r.URL.Host = "0.0.0.0:0"
// Headers for the gateway: the target container UUID, the URL
// prefix the client used, and the source path "/log".
164 r.Header.Set("X-Arvados-Container-Gateway-Uuid", ctr.UUID)
165 r.Header.Set("X-Webdav-Prefix", "/arvados/v1/container_requests/"+cr.UUID+"/log/"+ctr.UUID)
166 r.Header.Set("X-Webdav-Source", "/log")
169 ModifyResponse: func(resp *http.Response) error {
170 if resp.Header.Get("X-Arvados-Authorization-Response") != expectRespondAuth {
171 // Note this is how we detect
172 // an attacker-in-the-middle.
173 return httpserver.ErrorWithStatus(errors.New("bad X-Arvados-Authorization-Response header"), http.StatusBadGateway)
// NOTE(review): the body of ErrorHandler is not visible here;
// presumably it records err as proxyErr (used below) -- confirm.
177 ErrorHandler: func(w http.ResponseWriter, r *http.Request, err error) {
181 proxy.ServeHTTP(w, r)
186 // If proxying to the container gateway fails, it
187 // might be caused by a race where crunch-run exited
188 // after we decided (above) the log was not final.
189 // In that case we should proxy to keep-web.
// Re-fetch the container record, this time also selecting "log".
190 ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{
192 Select: []string{"uuid", "state", "gateway_address", "log"},
195 // Lost access to the container record?
196 httpserver.Error(w, "error re-fetching container record: "+err.Error(), http.StatusServiceUnavailable)
197 } else if ctr.State == arvados.ContainerStateLocked || ctr.State == arvados.ContainerStateRunning {
198 // No race, proxyErr was the best we can do
199 httpserver.Error(w, "proxy error: "+proxyErr.Error(), http.StatusServiceUnavailable)
// The container is no longer Locked/Running, so serve through
// keep-web instead.
201 conn.serveContainerRequestLogViaKeepWeb(opts, cr, w, r)
206 // serveContainerRequestLogViaKeepWeb handles a request for saved container
207 // log content by proxying to one of the configured keep-web servers.
209 // It tries to choose a keep-web server that is running on this host.
210 func (conn *Conn) serveContainerRequestLogViaKeepWeb(opts arvados.ContainerLogOptions, cr arvados.ContainerRequest, w http.ResponseWriter, r *http.Request) {
211 if cr.LogUUID == "" {
212 // Special case: if no log data exists yet, we serve
213 // an empty collection by ourselves instead of
214 // proxying to keep-web.
215 conn.serveEmptyDir("/arvados/v1/container_requests/"+cr.UUID+"/log", w, r)
// Find our own hostname (from the service URL in the request
// context) so a keep-web on the same host can be recognized.
// NOTE(review): the definition of u used on the hostname line is
// not visible here; presumably it is url.URL(myURL).
218 myURL, _ := service.URLFromContext(r.Context())
220 myHostname := u.Hostname()
221 var webdavBase arvados.URL
// Iterate over the configured WebDAV internal URLs; webdavBase is
// the loop variable, so it holds the last URL visited when the
// loop ends.
223 for webdavBase = range conn.cluster.Services.WebDAV.InternalURLs {
225 u := url.URL(webdavBase)
226 if h := u.Hostname(); h == "127.0.0.1" || h == "0.0.0.0" || h == "::1" || h == myHostname {
227 // Prefer a keep-web service running on the
228 // same host as us. (If we don't find one, we
229 // pick one arbitrarily.)
// NOTE(review): the condition guarding this error is not visible
// here; presumably it fires when InternalURLs is empty.
234 httpserver.Error(w, "no internalURLs configured for WebDAV service", http.StatusInternalServerError)
237 proxy := &httputil.ReverseProxy{
238 Director: func(r *http.Request) {
// Send the request to the chosen keep-web instance.
239 r.URL.Scheme = webdavBase.Scheme
240 r.URL.Host = webdavBase.Host
241 // Outgoing Host header specifies the
243 r.Host = cr.LogUUID + ".internal"
244 // We already checked permission on the
245 // container, so we can use a root token here
246 // instead of counting on the "access to log
247 // via container request and container"
248 // permission check, which can be racy when a
249 // request gets retried with a new container.
250 r.Header.Set("Authorization", "Bearer "+conn.cluster.SystemRootToken)
251 // We can't change r.URL.Path without
252 // confusing WebDAV (request body and response
253 // headers refer to the same paths) so we tell
254 // keep-web to map the log collection onto the
255 // container_requests/X/log/ namespace.
256 r.Header.Set("X-Webdav-Prefix", "/arvados/v1/container_requests/"+cr.UUID+"/log")
// opts.Path[1:28] is the 27-character first path component after
// the leading "/". If its characters 5-11 are "-dz642-" (five
// characters, the infix, then 15 more) it is treated as a
// container UUID, and both headers are narrowed to that
// container's subdirectory of the log collection.
257 if len(opts.Path) >= 28 && opts.Path[6:13] == "-dz642-" {
258 // "/arvados/v1/container_requests/{crUUID}/log/{cUUID}..."
260 // "/log for container {cUUID}..."
261 r.Header.Set("X-Webdav-Prefix", "/arvados/v1/container_requests/"+cr.UUID+"/log/"+opts.Path[1:28])
262 r.Header.Set("X-Webdav-Source", "/log for container "+opts.Path[1:28]+"/")
// When the cluster is configured with TLS.Insecure, use a
// transport that skips certificate verification for keep-web.
266 if conn.cluster.TLS.Insecure {
267 proxy.Transport = &http.Transport{
268 TLSClientConfig: &tls.Config{
269 InsecureSkipVerify: conn.cluster.TLS.Insecure,
273 proxy.ServeHTTP(w, r)
276 // serveEmptyDir handles read-only webdav requests as if there was an
277 // empty collection rooted at the given path. It's equivalent to
278 // proxying to an empty collection in keep-web, but avoids the extra
280 func (conn *Conn) serveEmptyDir(path string, w http.ResponseWriter, r *http.Request) {
// Build a WebDAV handler over a fresh in-memory filesystem.
// NOTE(review): the line that would consume the path argument
// (presumably the handler's Prefix field) is not visible here.
281 wh := webdav.Handler{
283 FileSystem: webdav.NewMemFS(),
284 LockSystem: webdavfs.NoLockSystem,
285 Logger: func(r *http.Request, err error) {
// Log at Info level any error other than "does not exist".
286 if err != nil && !os.IsNotExist(err) {
287 ctxlog.FromContext(r.Context()).WithError(err).Info("webdav error on empty collection fs")
294 // ContainerSSH returns a connection to the SSH server in the
295 // appropriate crunch-run process on the worker node where the
296 // specified container is running.
298 // If the returned error is nil, the caller is responsible for closing
300 func (conn *Conn) ContainerSSH(ctx context.Context, opts arvados.ContainerSSHOptions) (sshconn arvados.ConnectionResponse, err error) {
// Identify the calling user for the permission checks below.
301 user, err := conn.railsProxy.UserGetCurrent(ctx, arvados.GetOptions{})
305 ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{UUID: opts.UUID, Select: []string{"uuid", "state", "gateway_address", "interactive_session_started"}})
// ctxRoot carries the system root token instead of the caller's
// credentials.
309 ctxRoot := auth.NewContext(ctx, &auth.Credentials{Tokens: []string{conn.cluster.SystemRootToken}})
// The per-user rules apply unless the caller is an admin and
// ShellAccess.Admin is enabled.
310 if !user.IsAdmin || !conn.cluster.Containers.ShellAccess.Admin {
311 if !conn.cluster.Containers.ShellAccess.User {
312 return sshconn, httpserver.ErrorWithStatus(errors.New("shell access is disabled in config"), http.StatusServiceUnavailable)
// List the container requests that refer to this container, using
// the root context.
314 crs, err := conn.railsProxy.ContainerRequestList(ctxRoot, arvados.ListOptions{Limit: -1, Filters: []arvados.Filter{{"container_uuid", "=", opts.UUID}}})
// Refuse if any listed request has a ModifiedByUserUUID other
// than the caller.
318 for _, cr := range crs.Items {
319 if cr.ModifiedByUserUUID != user.UUID {
320 return sshconn, httpserver.ErrorWithStatus(errors.New("permission denied: container is associated with requests submitted by other users"), http.StatusForbidden)
// Refuse if the listing did not return every matching request,
// since the check above would then be incomplete.
323 if crs.ItemsAvailable != len(crs.Items) {
324 return sshconn, httpserver.ErrorWithStatus(errors.New("incomplete response while checking permission"), http.StatusInternalServerError)
// Only a Running container is accepted: 503 while Queued or
// Locked, 410 for any other state.
328 if ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked {
329 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("container is not running yet (state is %q)", ctr.State), http.StatusServiceUnavailable)
330 } else if ctr.State != arvados.ContainerStateRunning {
331 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("container has ended (state is %q)", ctr.State), http.StatusGone)
334 dial, arpc, err := conn.findGateway(ctx, ctr, opts.NoForward)
// NOTE(review): the condition guarding the next two lines is not
// visible here; presumably they run only when findGateway
// returned a non-nil arpc. NoForward is set on the options passed
// to the other controller.
339 opts.NoForward = true
340 return arpc.ContainerSSH(ctx, opts)
// Open the TLS connection to the gateway, then speak HTTP over it
// by hand through buffered reader/writer wrappers.
343 tlsconn, requestAuth, respondAuth, err := dial()
347 bufr := bufio.NewReader(tlsconn)
348 bufw := bufio.NewWriter(tlsconn)
// NOTE(review): this field belongs to a literal whose opening
// line is not visible; the result is used below as u (u.String(),
// u.Host).
352 Host: tlsconn.RemoteAddr().String(),
355 postform := url.Values{
356 // uuid is only needed for older crunch-run versions
357 // (current version uses X-Arvados-* header below)
359 "detach_keys": {opts.DetachKeys},
360 "login_username": {opts.LoginUsername},
361 "no_forward": {fmt.Sprintf("%v", opts.NoForward)},
// Write a POST request asking to upgrade the connection to
// "ssh", with the form above as a urlencoded body.
363 postdata := postform.Encode()
364 bufw.WriteString("POST " + u.String() + " HTTP/1.1\r\n")
365 bufw.WriteString("Host: " + u.Host + "\r\n")
366 bufw.WriteString("Upgrade: ssh\r\n")
367 bufw.WriteString("X-Arvados-Container-Gateway-Uuid: " + opts.UUID + "\r\n")
368 bufw.WriteString("X-Arvados-Authorization: " + requestAuth + "\r\n")
369 bufw.WriteString("Content-Type: application/x-www-form-urlencoded\r\n")
370 fmt.Fprintf(bufw, "Content-Length: %d\r\n", len(postdata))
371 bufw.WriteString("\r\n")
372 bufw.WriteString(postdata)
// Parse the gateway's reply as the response to a POST request.
374 resp, err := http.ReadResponse(bufr, &http.Request{Method: "POST"})
377 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("error reading http response from gateway: %w", err), http.StatusBadGateway)
379 defer resp.Body.Close()
// Anything other than 101 Switching Protocols is an error; up to
// 1000 bytes of the response body are included in the message.
380 if resp.StatusCode != http.StatusSwitchingProtocols {
381 body, _ := ioutil.ReadAll(io.LimitReader(resp.Body, 1000))
383 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("unexpected status %s %q", resp.Status, body), http.StatusBadGateway)
// The reply must carry "Upgrade: ssh" and "Connection: upgrade"
// (compared case-insensitively).
385 if strings.ToLower(resp.Header.Get("Upgrade")) != "ssh" ||
386 strings.ToLower(resp.Header.Get("Connection")) != "upgrade" {
388 return sshconn, httpserver.ErrorWithStatus(errors.New("bad upgrade"), http.StatusBadGateway)
// The gateway must return the respondAuth value supplied by dial().
390 if resp.Header.Get("X-Arvados-Authorization-Response") != respondAuth {
392 return sshconn, httpserver.ErrorWithStatus(errors.New("bad X-Arvados-Authorization-Response header"), http.StatusBadGateway)
// Set interactive_session_started on the container record (using
// the root context) unless it was already set.
395 if !ctr.InteractiveSessionStarted {
396 _, err = conn.railsProxy.ContainerUpdate(ctxRoot, arvados.UpdateOptions{
398 Attrs: map[string]interface{}{
399 "interactive_session_started": true,
404 return sshconn, httpserver.ErrorWithStatus(err, http.StatusInternalServerError)
// Hand the upgraded connection to the caller, together with the
// buffered reader/writer already attached to it.
408 sshconn.Conn = tlsconn
409 sshconn.Bufrw = &bufio.ReadWriter{Reader: bufr, Writer: bufw}
410 sshconn.Logger = ctxlog.FromContext(ctx)
411 sshconn.Header = http.Header{"Upgrade": {"ssh"}}
415 // ContainerGatewayTunnel sets up a tunnel enabling us (controller) to
416 // connect to the caller's (crunch-run's) gateway server.
417 func (conn *Conn) ContainerGatewayTunnel(ctx context.Context, opts arvados.ContainerGatewayTunnelOptions) (resp arvados.ConnectionResponse, err error) {
// The expected secret is the hex-encoded HMAC-SHA256 of the
// container UUID, keyed with the system root token. It is compared
// with the caller's secret in constant time.
418 h := hmac.New(sha256.New, []byte(conn.cluster.SystemRootToken))
419 fmt.Fprint(h, opts.UUID)
420 authSecret := fmt.Sprintf("%x", h.Sum(nil))
421 if subtle.ConstantTimeCompare([]byte(authSecret), []byte(opts.AuthSecret)) != 1 {
422 ctxlog.FromContext(ctx).Info("received incorrect auth_secret")
423 return resp, httpserver.ErrorWithStatus(errors.New("authentication error"), http.StatusUnauthorized)
// Create an in-memory pipe: the yamux server session runs on one
// end (muxconn), and the other end (clientconn) is returned to
// the caller in resp.Conn.
426 muxconn, clientconn := net.Pipe()
427 tunnel, err := yamux.Server(muxconn, nil)
430 return resp, httpserver.ErrorWithStatus(err, http.StatusInternalServerError)
// Register the session under the container UUID, creating the map
// on first use. An existing entry for the same UUID is replaced.
433 conn.gwTunnelsLock.Lock()
434 if conn.gwTunnels == nil {
435 conn.gwTunnels = map[string]*yamux.Session{opts.UUID: tunnel}
437 conn.gwTunnels[opts.UUID] = tunnel
439 conn.gwTunnelsLock.Unlock()
// Remove the map entry, but only if it still refers to this
// session.
// NOTE(review): the code that schedules this cleanup is not
// visible here; presumably it runs after the session ends.
443 conn.gwTunnelsLock.Lock()
444 if conn.gwTunnels[opts.UUID] == tunnel {
445 delete(conn.gwTunnels, opts.UUID)
447 conn.gwTunnelsLock.Unlock()
450 // Assuming we're acting as the backend of an http server,
451 // lib/controller/router will call resp's ServeHTTP handler,
452 // which upgrades the incoming http connection to a raw socket
453 // and connects it to our yamux.Server through our net.Pipe().
454 resp.Conn = clientconn
// Bufrw is backed by empty buffers, not by the connection.
455 resp.Bufrw = &bufio.ReadWriter{Reader: bufio.NewReader(&bytes.Buffer{}), Writer: bufio.NewWriter(&bytes.Buffer{})}
456 resp.Logger = ctxlog.FromContext(ctx)
457 resp.Header = http.Header{"Upgrade": {"tunnel"}}
// Report our internal URL in a response header: taken from the
// request context when available, otherwise from the test
// override if one is set.
458 if u, ok := service.URLFromContext(ctx); ok {
459 resp.Header.Set("X-Arvados-Internal-Url", u.String())
460 } else if forceInternalURLForTest != nil {
461 resp.Header.Set("X-Arvados-Internal-Url", forceInternalURLForTest.String())
// gatewayDialer opens a connection to a container gateway. Besides
// the connection it returns requestAuth, which callers send in the
// X-Arvados-Authorization request header, and respondAuth, which
// callers compare with the gateway's
// X-Arvados-Authorization-Response header.
466 type gatewayDialer func() (conn net.Conn, requestAuth, respondAuth string, err error)
468 // findGateway figures out how to connect to ctr's gateway.
470 // If the gateway can be contacted directly or through a tunnel on
471 // this instance, the first return value is a non-nil dialer.
473 // If the gateway is only accessible through a tunnel through a
474 // different controller process, the second return value is a non-nil
475 // *rpc.Conn for that controller.
476 func (conn *Conn) findGateway(ctx context.Context, ctr arvados.Container, noForward bool) (gatewayDialer, *rpc.Conn, error) {
// Look up the tunnel registered for this container, if any, while
// holding the tunnel map lock.
477 conn.gwTunnelsLock.Lock()
478 tunnel := conn.gwTunnels[ctr.UUID]
479 conn.gwTunnelsLock.Unlock()
// myURL is our own service URL; it is compared with the tunnel
// endpoint further down.
481 myURL, _ := service.URLFromContext(ctx)
// Case 1: GatewayAddress parses as host:port with a non-empty host
// other than 127.0.0.1, so dial it directly over TCP.
483 if host, _, splitErr := net.SplitHostPort(ctr.GatewayAddress); splitErr == nil && host != "" && host != "127.0.0.1" {
484 // If crunch-run provided a GatewayAddress like
485 // "ipaddr:port", that means "ipaddr" is one of the
486 // external interfaces where the gateway is
487 // listening. In that case, it's the most
488 // reliable/direct option, so we use it even if a
489 // tunnel might also be available.
490 return func() (net.Conn, string, string, error) {
491 rawconn, err := (&net.Dialer{}).DialContext(ctx, "tcp", ctr.GatewayAddress)
493 return nil, "", "", httpserver.ErrorWithStatus(err, http.StatusServiceUnavailable)
495 return conn.dialGatewayTLS(ctx, ctr, rawconn)
// Case 2: a tunnel is registered with this process. It is skipped
// only when forceProxyForTest is set and noForward is false.
498 if tunnel != nil && !(forceProxyForTest && !noForward) {
499 // If we can't connect directly, and the gateway has
500 // established a yamux tunnel with us, connect through
503 // ...except: forceProxyForTest means we are emulating
504 // a situation where the gateway has established a
505 // yamux tunnel with controller B, and the
506 // ContainerSSH request arrives at controller A. If
507 // noForward==false then we are acting as A, so
508 // we pretend not to have a tunnel, and fall through
509 // to the "tunurl" case below. If noForward==true
510 // then the client is A and we are acting as B, so we
511 // connect to our tunnel.
512 return func() (net.Conn, string, string, error) {
// Open a new stream on the yamux session and run the TLS
// handshake over it.
513 rawconn, err := tunnel.Open()
515 return nil, "", "", httpserver.ErrorWithStatus(err, http.StatusServiceUnavailable)
517 return conn.dialGatewayTLS(ctx, ctr, rawconn)
// Case 3: GatewayAddress has the form "tunnel {url}" and the url
// is not our own.
// NOTE(review): further conditions of this if statement are not
// visible here.
520 if tunurl := strings.TrimPrefix(ctr.GatewayAddress, "tunnel "); tunurl != ctr.GatewayAddress &&
522 tunurl != myURL.String() &&
524 // If crunch-run provided a GatewayAddress like
525 // "tunnel https://10.0.0.10:1010/", that means the
526 // gateway has established a yamux tunnel with the
527 // controller process at the indicated InternalURL
528 // (which isn't us, otherwise we would have had
529 // "tunnel != nil" above). We need to proxy through to
530 // the other controller process in order to use the
// Accept the endpoint only if it exactly matches one of the
// configured controller internal URLs, and return an rpc
// connection to it using the passthrough token provider.
532 for u := range conn.cluster.Services.Controller.InternalURLs {
533 if u.String() == tunurl {
534 ctxlog.FromContext(ctx).Debugf("connecting to container gateway through other controller at %s", u)
536 return nil, rpc.NewConn(conn.cluster.ClusterID, &u, conn.cluster.TLS.Insecure, rpc.PassthroughTokenProvider), nil
// No configured controller URL matched the tunnel endpoint.
539 ctxlog.FromContext(ctx).Warnf("container gateway provided a tunnel endpoint %s that is not one of Services.Controller.InternalURLs", tunurl)
540 return nil, nil, httpserver.ErrorWithStatus(errors.New("container gateway is running but tunnel endpoint is invalid"), http.StatusServiceUnavailable)
// None of the cases applied: report 503, with a message that
// depends on whether a gateway address was recorded at all.
542 if ctr.GatewayAddress == "" {
543 return nil, nil, httpserver.ErrorWithStatus(errors.New("container is running but gateway is not available"), http.StatusServiceUnavailable)
545 return nil, nil, httpserver.ErrorWithStatus(errors.New("container is running but tunnel is down"), http.StatusServiceUnavailable)
549 // dialGatewayTLS negotiates a TLS connection to a container gateway
550 // over the given raw connection.
551 func (conn *Conn) dialGatewayTLS(ctx context.Context, ctr arvados.Container, rawconn net.Conn) (*tls.Conn, string, string, error) {
552 // crunch-run uses a self-signed / unverifiable TLS
553 // certificate, so we use the following scheme to ensure we're
554 // not talking to an attacker-in-the-middle.
556 // 1. Compute ctrKey = HMAC-SHA256(sysRootToken,ctrUUID) --
557 // this will be the same ctrKey that a-d-c supplied to
558 // crunch-run in the GatewayAuthSecret env var.
560 // 2. Compute requestAuth = HMAC-SHA256(ctrKey,serverCert) and
561 // send it to crunch-run as the X-Arvados-Authorization
562 // header, proving that we know ctrKey. (Note a MITM cannot
563 // replay the proof to a real crunch-run server, because the
564 // real crunch-run server would have a different cert.)
566 // 3. Compute respondAuth = HMAC-SHA256(ctrKey,requestAuth)
567 // and ensure the server returns it in the
568 // X-Arvados-Authorization-Response header, proving that the
569 // server knows ctrKey.
// requestAuth and respondAuth are filled in by the
// VerifyPeerCertificate callback below.
570 var requestAuth, respondAuth string
571 tlsconn := tls.Client(rawconn, &tls.Config{
// Standard chain verification is disabled; the callback below
// only derives the auth strings from the presented certificate.
572 InsecureSkipVerify: true,
573 VerifyPeerCertificate: func(rawCerts [][]byte, verifiedChains [][]*x509.Certificate) error {
574 if len(rawCerts) == 0 {
575 return errors.New("no certificate received, cannot compute authorization header")
// authKey is the hex-encoded HMAC-SHA256 of the container UUID,
// keyed with the system root token (ctrKey in the steps above).
577 h := hmac.New(sha256.New, []byte(conn.cluster.SystemRootToken))
578 fmt.Fprint(h, ctr.UUID)
579 authKey := fmt.Sprintf("%x", h.Sum(nil))
// A second HMAC, keyed with authKey, produces both auth strings.
// NOTE(review): the statements that feed the certificate into h
// before requestAuth, and reset h before respondAuth, are not
// visible here; see steps 2 and 3 above for the intended inputs.
580 h = hmac.New(sha256.New, []byte(authKey))
582 requestAuth = fmt.Sprintf("%x", h.Sum(nil))
584 h.Write([]byte(requestAuth))
585 respondAuth = fmt.Sprintf("%x", h.Sum(nil))
// Run the handshake explicitly so the callback has been invoked
// before the auth strings are returned.
589 err := tlsconn.HandshakeContext(ctx)
591 return nil, "", "", httpserver.ErrorWithStatus(fmt.Errorf("TLS handshake failed: %w", err), http.StatusBadGateway)
// An empty respondAuth after the handshake is treated as an
// internal bug (presumably the callback did not run).
593 if respondAuth == "" {
595 return nil, "", "", httpserver.ErrorWithStatus(errors.New("BUG: no respondAuth"), http.StatusInternalServerError)
597 return tlsconn, requestAuth, respondAuth, nil