1 // Copyright (C) The Arvados Authors. All rights reserved.
3 // SPDX-License-Identifier: AGPL-3.0
26 "git.arvados.org/arvados.git/lib/controller/rpc"
27 "git.arvados.org/arvados.git/lib/service"
28 "git.arvados.org/arvados.git/lib/webdavfs"
29 "git.arvados.org/arvados.git/sdk/go/arvados"
30 "git.arvados.org/arvados.git/sdk/go/auth"
31 "git.arvados.org/arvados.git/sdk/go/ctxlog"
32 "git.arvados.org/arvados.git/sdk/go/httpserver"
33 "github.com/hashicorp/yamux"
34 "golang.org/x/net/webdav"
// forceProxyForTest is a test hook: when true, findGateway skips a
// locally registered tunnel unless noForward is set (see the
// explanation inside findGateway).
38 forceProxyForTest = false
// forceInternalURLForTest is a test hook: when non-nil, it supplies
// the X-Arvados-Internal-Url response header in
// ContainerGatewayTunnel if the request context has no service URL.
39 forceInternalURLForTest *arvados.URL
42 // ContainerLog returns a WebDAV handler that reads logs from the
43 // indicated container. It works by proxying the request to
45 // - the container gateway, if the container is running
47 // - a different controller process, if the container is running and
48 // the gateway is accessible through a tunnel to a different
51 // - keep-web, if saved logs exist and there is no gateway (or the
52 // container is finished)
54 // - an empty-collection stub, if there is no gateway and no saved
56 func (conn *Conn) ContainerLog(ctx context.Context, opts arvados.ContainerLogOptions) (http.Handler, error) {
57 ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{UUID: opts.UUID, Select: []string{"uuid", "state", "gateway_address", "log"}})
// With no gateway address, or a container that is neither Locked
// nor Running, the log is served by serveContainerLogViaKeepWeb.
61 if ctr.GatewayAddress == "" ||
62 (ctr.State != arvados.ContainerStateLocked && ctr.State != arvados.ContainerStateRunning) {
63 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
64 conn.serveContainerLogViaKeepWeb(opts, ctr, w, r)
// findGateway returns either a dialer (dial) for reaching the
// gateway from this process, or an RPC client (arpc) for another
// controller process.
67 dial, arpc, err := conn.findGateway(ctx, ctr, opts.NoForward)
// Hand the request to the other controller's ContainerLog.
73 return arpc.ContainerLog(ctx, opts)
75 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Replace the request's context with the ctx that was passed to
// ContainerLog.
76 r = r.WithContext(ctx)
77 var proxyReq *http.Request
// expectRespondAuth is set in DialTLSContext and compared with the
// X-Arvados-Authorization-Response header in ModifyResponse.
79 var expectRespondAuth string
80 proxy := &httputil.ReverseProxy{
81 // Our custom Transport:
83 // - Uses a custom dialer to connect to the
84 // gateway (either directly or through a
85 // tunnel set up through ContainerGatewayTunnel)
87 // - Verifies the gateway's TLS certificate
88 // using X-Arvados-Authorization headers.
90 // This involves modifying the outgoing
91 // request header in DialTLSContext.
92 // (ReverseProxy certainly doesn't expect us
93 // to do this, but it works.)
94 Transport: &http.Transport{
95 DialTLSContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
// dial() ignores network/addr; it returns the connection plus the
// request/response auth strings.
96 tlsconn, requestAuth, respondAuth, err := dial()
100 proxyReq.Header.Set("X-Arvados-Authorization", requestAuth)
101 expectRespondAuth = respondAuth
105 Director: func(r *http.Request) {
106 // Scheme/host of incoming r.URL are
107 // irrelevant now, and may even be
108 // missing. Host is ignored by our
109 // DialTLSContext, but we need a
110 // generic syntactically correct URL
111 // for net/http to work with.
112 r.URL.Scheme = "https"
113 r.URL.Host = "0.0.0.0:0"
114 r.Header.Set("X-Arvados-Container-Gateway-Uuid", opts.UUID)
117 ModifyResponse: func(resp *http.Response) error {
118 if resp.Header.Get("X-Arvados-Authorization-Response") != expectRespondAuth {
119 // Note this is how we detect
120 // an attacker-in-the-middle.
121 return httpserver.ErrorWithStatus(errors.New("bad X-Arvados-Authorization-Response header"), http.StatusBadGateway)
125 ErrorHandler: func(w http.ResponseWriter, r *http.Request, err error) {
129 proxy.ServeHTTP(w, r)
134 // If proxying to the container gateway fails, it
135 // might be caused by a race where crunch-run exited
136 // after we decided (above) the log was not final.
137 // In that case we should proxy to keep-web.
// This ctr is a new variable that shadows the record fetched at
// the top of ContainerLog.
138 ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{
140 Select: []string{"uuid", "state", "gateway_address", "log"},
143 // Lost access to the container record?
144 httpserver.Error(w, "error re-fetching container record: "+err.Error(), http.StatusServiceUnavailable)
145 } else if ctr.State == arvados.ContainerStateLocked || ctr.State == arvados.ContainerStateRunning {
146 // No race, proxyErr was the best we can do
147 httpserver.Error(w, "proxy error: "+proxyErr.Error(), http.StatusServiceUnavailable)
// The container is no longer Locked/Running: serve the log via
// keep-web using the re-fetched record.
149 conn.serveContainerLogViaKeepWeb(opts, ctr, w, r)
154 // serveContainerLogViaKeepWeb handles a request for saved container
155 // log content by proxying to one of the configured keep-web servers.
157 // It tries to choose a keep-web server that is running on this host.
158 func (conn *Conn) serveContainerLogViaKeepWeb(opts arvados.ContainerLogOptions, ctr arvados.Container, w http.ResponseWriter, r *http.Request) {
160 // Special case: if no log data exists yet, we serve
161 // an empty collection by ourselves instead of
162 // proxying to keep-web.
163 conn.serveEmptyDir("/arvados/v1/containers/"+ctr.UUID+"/log", w, r)
// The second result of URLFromContext (whether a URL was found) is
// discarded here.
166 myURL, _ := service.URLFromContext(r.Context())
168 myHostname := u.Hostname()
169 var webdavBase arvados.URL
// webdavBase is assigned by the range clause itself (= rather than
// :=), so it retains the last value visited once the loop ends.
171 for webdavBase = range conn.cluster.Services.WebDAVDownload.InternalURLs {
173 u := url.URL(webdavBase)
174 if h := u.Hostname(); h == "127.0.0.1" || h == "0.0.0.0" || h == "::1" || h == myHostname {
175 // Prefer a keep-web service running on the
176 // same host as us. (If we don't find one, we
177 // pick one arbitrarily.)
182 httpserver.Error(w, "no internalURLs configured for WebDAV service", http.StatusInternalServerError)
185 proxy := &httputil.ReverseProxy{
186 Director: func(r *http.Request) {
188 Scheme: webdavBase.Scheme,
189 Host: webdavBase.Host,
// ctr.Log is path-escaped; opts.Path is appended without escaping.
190 Path: "/by_id/" + url.PathEscape(ctr.Log) + opts.Path,
192 // Our outgoing Host header must match
193 // WebDAVDownload.ExternalURL, otherwise
194 // keep-web does not accept an auth token.
195 r.Host = conn.cluster.Services.WebDAVDownload.ExternalURL.Host
196 // We already checked permission on the
197 // container, so we can use a root token here
198 // instead of counting on the "access to log
199 // via container request and container"
200 // permission check, which can be racy when a
201 // request gets retried with a new container.
202 r.Header.Set("Authorization", "Bearer "+conn.cluster.SystemRootToken)
// A custom transport is installed only when the cluster config
// marks TLS as insecure; it then skips certificate verification.
205 if conn.cluster.TLS.Insecure {
206 proxy.Transport = &http.Transport{
207 TLSClientConfig: &tls.Config{
208 InsecureSkipVerify: conn.cluster.TLS.Insecure,
212 proxy.ServeHTTP(w, r)
215 // serveEmptyDir handles read-only webdav requests as if there was an
216 // empty collection rooted at the given path. It's equivalent to
217 // proxying to an empty collection in keep-web, but avoids the extra
219 func (conn *Conn) serveEmptyDir(path string, w http.ResponseWriter, r *http.Request) {
220 wh := webdav.Handler{
// A freshly created in-memory filesystem backs the handler.
222 FileSystem: webdav.NewMemFS(),
223 LockSystem: webdavfs.NoLockSystem,
224 Logger: func(r *http.Request, err error) {
// Handler errors are logged at Info level through the logger
// attached to the request context.
226 ctxlog.FromContext(r.Context()).WithError(err).Info("webdav error on empty collection fs")
233 // ContainerSSH returns a connection to the SSH server in the
234 // appropriate crunch-run process on the worker node where the
235 // specified container is running.
237 // If the returned error is nil, the caller is responsible for closing
239 func (conn *Conn) ContainerSSH(ctx context.Context, opts arvados.ContainerSSHOptions) (sshconn arvados.ConnectionResponse, err error) {
240 user, err := conn.railsProxy.UserGetCurrent(ctx, arvados.GetOptions{})
244 ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{UUID: opts.UUID, Select: []string{"uuid", "state", "gateway_address", "interactive_session_started"}})
// ctxRoot carries the system root token; it is used below for the
// ContainerRequestList and ContainerUpdate calls.
248 ctxRoot := auth.NewContext(ctx, &auth.Credentials{Tokens: []string{conn.cluster.SystemRootToken}})
// This branch applies unless the caller is an admin and admin shell
// access is enabled. It requires user shell access to be enabled,
// and every container request that references this container to
// have ModifiedByUserUUID equal to the caller's UUID.
249 if !user.IsAdmin || !conn.cluster.Containers.ShellAccess.Admin {
250 if !conn.cluster.Containers.ShellAccess.User {
251 return sshconn, httpserver.ErrorWithStatus(errors.New("shell access is disabled in config"), http.StatusServiceUnavailable)
253 crs, err := conn.railsProxy.ContainerRequestList(ctxRoot, arvados.ListOptions{Limit: -1, Filters: []arvados.Filter{{"container_uuid", "=", opts.UUID}}})
257 for _, cr := range crs.Items {
258 if cr.ModifiedByUserUUID != user.UUID {
259 return sshconn, httpserver.ErrorWithStatus(errors.New("permission denied: container is associated with requests submitted by other users"), http.StatusForbidden)
// A count mismatch means the list above was incomplete, so the
// permission check cannot be completed.
262 if crs.ItemsAvailable != len(crs.Items) {
263 return sshconn, httpserver.ErrorWithStatus(errors.New("incomplete response while checking permission"), http.StatusInternalServerError)
// Only a Running container can be connected to: Queued or Locked
// yields 503, any other non-Running state yields 410.
267 if ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked {
268 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("container is not running yet (state is %q)", ctr.State), http.StatusServiceUnavailable)
269 } else if ctr.State != arvados.ContainerStateRunning {
270 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("container has ended (state is %q)", ctr.State), http.StatusGone)
273 dial, arpc, err := conn.findGateway(ctx, ctr, opts.NoForward)
// Set NoForward on the request handed to the other controller.
278 opts.NoForward = true
279 return arpc.ContainerSSH(ctx, opts)
282 tlsconn, requestAuth, respondAuth, err := dial()
286 bufr := bufio.NewReader(tlsconn)
287 bufw := bufio.NewWriter(tlsconn)
291 Host: tlsconn.RemoteAddr().String(),
294 postform := url.Values{
295 // uuid is only needed for older crunch-run versions
296 // (current version uses X-Arvados-* header below)
298 "detach_keys": {opts.DetachKeys},
299 "login_username": {opts.LoginUsername},
300 "no_forward": {fmt.Sprintf("%v", opts.NoForward)},
302 postdata := postform.Encode()
// Write the HTTP upgrade request by hand onto the TLS connection.
// The return values of these WriteString/Fprintf calls are not
// checked.
303 bufw.WriteString("POST " + u.String() + " HTTP/1.1\r\n")
304 bufw.WriteString("Host: " + u.Host + "\r\n")
305 bufw.WriteString("Upgrade: ssh\r\n")
306 bufw.WriteString("X-Arvados-Container-Gateway-Uuid: " + opts.UUID + "\r\n")
307 bufw.WriteString("X-Arvados-Authorization: " + requestAuth + "\r\n")
308 bufw.WriteString("Content-Type: application/x-www-form-urlencoded\r\n")
309 fmt.Fprintf(bufw, "Content-Length: %d\r\n", len(postdata))
310 bufw.WriteString("\r\n")
311 bufw.WriteString(postdata)
// Parse the gateway's reply from the buffered reader on the same
// connection.
313 resp, err := http.ReadResponse(bufr, &http.Request{Method: "POST"})
316 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("error reading http response from gateway: %w", err), http.StatusBadGateway)
318 defer resp.Body.Close()
// Anything other than 101 Switching Protocols is reported as 502,
// quoting at most 1000 bytes of the response body.
319 if resp.StatusCode != http.StatusSwitchingProtocols {
320 body, _ := ioutil.ReadAll(io.LimitReader(resp.Body, 1000))
322 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("unexpected status %s %q", resp.Status, body), http.StatusBadGateway)
// The Upgrade and Connection header values are compared
// case-insensitively.
324 if strings.ToLower(resp.Header.Get("Upgrade")) != "ssh" ||
325 strings.ToLower(resp.Header.Get("Connection")) != "upgrade" {
327 return sshconn, httpserver.ErrorWithStatus(errors.New("bad upgrade"), http.StatusBadGateway)
// The gateway must return the respondAuth value obtained from
// dial().
329 if resp.Header.Get("X-Arvados-Authorization-Response") != respondAuth {
331 return sshconn, httpserver.ErrorWithStatus(errors.New("bad X-Arvados-Authorization-Response header"), http.StatusBadGateway)
// Record on the container, using the root-token context, that an
// interactive session has started (skipped if already recorded).
334 if !ctr.InteractiveSessionStarted {
335 _, err = conn.railsProxy.ContainerUpdate(ctxRoot, arvados.UpdateOptions{
337 Attrs: map[string]interface{}{
338 "interactive_session_started": true,
343 return sshconn, httpserver.ErrorWithStatus(err, http.StatusInternalServerError)
// Hand the TLS connection and its buffered reader/writer back to
// the caller.
347 sshconn.Conn = tlsconn
348 sshconn.Bufrw = &bufio.ReadWriter{Reader: bufr, Writer: bufw}
349 sshconn.Logger = ctxlog.FromContext(ctx)
350 sshconn.Header = http.Header{"Upgrade": {"ssh"}}
354 // ContainerGatewayTunnel sets up a tunnel enabling us (controller) to
355 // connect to the caller's (crunch-run's) gateway server.
356 func (conn *Conn) ContainerGatewayTunnel(ctx context.Context, opts arvados.ContainerGatewayTunnelOptions) (resp arvados.ConnectionResponse, err error) {
// The expected secret is the hex-encoded HMAC-SHA256 of the
// container UUID, keyed with the system root token.
357 h := hmac.New(sha256.New, []byte(conn.cluster.SystemRootToken))
358 fmt.Fprint(h, opts.UUID)
359 authSecret := fmt.Sprintf("%x", h.Sum(nil))
// Compare in constant time; a mismatch is logged and rejected with
// 401.
360 if subtle.ConstantTimeCompare([]byte(authSecret), []byte(opts.AuthSecret)) != 1 {
361 ctxlog.FromContext(ctx).Info("received incorrect auth_secret")
362 return resp, httpserver.ErrorWithStatus(errors.New("authentication error"), http.StatusUnauthorized)
// In-memory pipe: muxconn becomes the yamux server side, and
// clientconn is returned to the caller in resp.Conn below.
365 muxconn, clientconn := net.Pipe()
366 tunnel, err := yamux.Server(muxconn, nil)
369 return resp, httpserver.ErrorWithStatus(err, http.StatusInternalServerError)
// Register the tunnel under the container UUID, creating the map
// on first use.
372 conn.gwTunnelsLock.Lock()
373 if conn.gwTunnels == nil {
374 conn.gwTunnels = map[string]*yamux.Session{opts.UUID: tunnel}
376 conn.gwTunnels[opts.UUID] = tunnel
378 conn.gwTunnelsLock.Unlock()
// Remove the map entry only if it still refers to this tunnel, so
// an entry for a different session is left alone.
382 conn.gwTunnelsLock.Lock()
383 if conn.gwTunnels[opts.UUID] == tunnel {
384 delete(conn.gwTunnels, opts.UUID)
386 conn.gwTunnelsLock.Unlock()
389 // Assuming we're acting as the backend of an http server,
390 // lib/controller/router will call resp's ServeHTTP handler,
391 // which upgrades the incoming http connection to a raw socket
392 // and connects it to our yamux.Server through our net.Pipe().
393 resp.Conn = clientconn
// Bufrw wraps empty in-memory buffers rather than the connection.
394 resp.Bufrw = &bufio.ReadWriter{Reader: bufio.NewReader(&bytes.Buffer{}), Writer: bufio.NewWriter(&bytes.Buffer{})}
395 resp.Logger = ctxlog.FromContext(ctx)
396 resp.Header = http.Header{"Upgrade": {"tunnel"}}
// Report our internal URL in a response header; fall back to the
// test-only override when ctx has no service URL.
397 if u, ok := service.URLFromContext(ctx); ok {
398 resp.Header.Set("X-Arvados-Internal-Url", u.String())
399 } else if forceInternalURLForTest != nil {
400 resp.Header.Set("X-Arvados-Internal-Url", forceInternalURLForTest.String())
// gatewayDialer opens a connection to a container gateway and
// returns it together with the requestAuth and respondAuth strings
// used for the X-Arvados-Authorization header exchange.
405 type gatewayDialer func() (conn net.Conn, requestAuth, respondAuth string, err error)
407 // findGateway figures out how to connect to ctr's gateway.
409 // If the gateway can be contacted directly or through a tunnel on
410 // this instance, the first return value is a non-nil dialer.
412 // If the gateway is only accessible through a tunnel through a
413 // different controller process, the second return value is a non-nil
414 // *rpc.Conn for that controller.
415 func (conn *Conn) findGateway(ctx context.Context, ctr arvados.Container, noForward bool) (gatewayDialer, *rpc.Conn, error) {
// Look up a tunnel registered for this container, holding the lock
// only for the map read.
416 conn.gwTunnelsLock.Lock()
417 tunnel := conn.gwTunnels[ctr.UUID]
418 conn.gwTunnelsLock.Unlock()
// The second result of URLFromContext is discarded.
420 myURL, _ := service.URLFromContext(ctx)
422 if host, _, splitErr := net.SplitHostPort(ctr.GatewayAddress); splitErr == nil && host != "" && host != "127.0.0.1" {
423 // If crunch-run provided a GatewayAddress like
424 // "ipaddr:port", that means "ipaddr" is one of the
425 // external interfaces where the gateway is
426 // listening. In that case, it's the most
427 // reliable/direct option, so we use it even if a
428 // tunnel might also be available.
429 return func() (net.Conn, string, string, error) {
// Dial the advertised address over TCP, then negotiate TLS via
// dialGatewayTLS.
430 rawconn, err := (&net.Dialer{}).DialContext(ctx, "tcp", ctr.GatewayAddress)
432 err = httpserver.ErrorWithStatus(err, http.StatusServiceUnavailable)
434 return conn.dialGatewayTLS(ctx, ctr, rawconn)
437 if tunnel != nil && !(forceProxyForTest && !noForward) {
438 // If we can't connect directly, and the gateway has
439 // established a yamux tunnel with us, connect through
442 // ...except: forceProxyForTest means we are emulating
443 // a situation where the gateway has established a
444 // yamux tunnel with controller B, and the
445 // ContainerSSH request arrives at controller A. If
446 // noForward==false then we are acting as A, so
447 // we pretend not to have a tunnel, and fall through
448 // to the "tunurl" case below. If noForward==true
449 // then the client is A and we are acting as B, so we
450 // connect to our tunnel.
451 return func() (net.Conn, string, string, error) {
// Open a new stream on the yamux session, then negotiate TLS over
// it via dialGatewayTLS.
452 rawconn, err := tunnel.Open()
454 err = httpserver.ErrorWithStatus(err, http.StatusServiceUnavailable)
456 return conn.dialGatewayTLS(ctx, ctr, rawconn)
// strings.TrimPrefix returns its argument unchanged when the prefix
// is absent, so tunurl != ctr.GatewayAddress holds only when the
// address begins with "tunnel ".
459 if tunurl := strings.TrimPrefix(ctr.GatewayAddress, "tunnel "); tunurl != ctr.GatewayAddress &&
461 tunurl != myURL.String() &&
463 // If crunch-run provided a GatewayAddress like
464 // "tunnel https://10.0.0.10:1010/", that means the
465 // gateway has established a yamux tunnel with the
466 // controller process at the indicated InternalURL
467 // (which isn't us, otherwise we would have had
468 // "tunnel != nil" above). We need to proxy through to
469 // the other controller process in order to use the
471 for u := range conn.cluster.Services.Controller.InternalURLs {
472 if u.String() == tunurl {
473 ctxlog.FromContext(ctx).Debugf("connecting to container gateway through other controller at %s", u)
// Return an RPC client for that controller instead of a dialer.
475 return nil, rpc.NewConn(conn.cluster.ClusterID, &u, conn.cluster.TLS.Insecure, rpc.PassthroughTokenProvider), nil
478 ctxlog.FromContext(ctx).Warnf("container gateway provided a tunnel endpoint %s that is not one of Services.Controller.InternalURLs", tunurl)
479 return nil, nil, httpserver.ErrorWithStatus(errors.New("container gateway is running but tunnel endpoint is invalid"), http.StatusServiceUnavailable)
// Nothing matched: report a missing gateway address separately from
// an address whose tunnel is unavailable.
481 if ctr.GatewayAddress == "" {
482 return nil, nil, httpserver.ErrorWithStatus(errors.New("container is running but gateway is not available"), http.StatusServiceUnavailable)
484 return nil, nil, httpserver.ErrorWithStatus(errors.New("container is running but tunnel is down"), http.StatusServiceUnavailable)
488 // dialGatewayTLS negotiates a TLS connection to a container gateway
489 // over the given raw connection.
490 func (conn *Conn) dialGatewayTLS(ctx context.Context, ctr arvados.Container, rawconn net.Conn) (*tls.Conn, string, string, error) {
491 // crunch-run uses a self-signed / unverifiable TLS
492 // certificate, so we use the following scheme to ensure we're
493 // not talking to an attacker-in-the-middle.
495 // 1. Compute ctrKey = HMAC-SHA256(sysRootToken,ctrUUID) --
496 // this will be the same ctrKey that a-d-c supplied to
497 // crunch-run in the GatewayAuthSecret env var.
499 // 2. Compute requestAuth = HMAC-SHA256(ctrKey,serverCert) and
500 // send it to crunch-run as the X-Arvados-Authorization
501 // header, proving that we know ctrKey. (Note a MITM cannot
502 // replay the proof to a real crunch-run server, because the
503 // real crunch-run server would have a different cert.)
505 // 3. Compute respondAuth = HMAC-SHA256(ctrKey,requestAuth)
506 // and ensure the server returns it in the
507 // X-Arvados-Authorization-Response header, proving that the
508 // server knows ctrKey.
// Both strings are filled in by the VerifyPeerCertificate callback
// during the handshake.
509 var requestAuth, respondAuth string
510 tlsconn := tls.Client(rawconn, &tls.Config{
// Standard certificate verification is disabled; the callback
// below derives the auth strings instead.
511 InsecureSkipVerify: true,
512 VerifyPeerCertificate: func(rawCerts [][]byte, verifiedChains [][]*x509.Certificate) error {
513 if len(rawCerts) == 0 {
514 return errors.New("no certificate received, cannot compute authorization header")
// authKey is the hex-encoded HMAC-SHA256 of ctr.UUID keyed with the
// system root token, i.e. ctrKey in the scheme above.
516 h := hmac.New(sha256.New, []byte(conn.cluster.SystemRootToken))
517 fmt.Fprint(h, ctr.UUID)
518 authKey := fmt.Sprintf("%x", h.Sum(nil))
519 h = hmac.New(sha256.New, []byte(authKey))
521 requestAuth = fmt.Sprintf("%x", h.Sum(nil))
523 h.Write([]byte(requestAuth))
524 respondAuth = fmt.Sprintf("%x", h.Sum(nil))
528 err := tlsconn.HandshakeContext(ctx)
530 return nil, "", "", httpserver.ErrorWithStatus(fmt.Errorf("TLS handshake failed: %w", err), http.StatusBadGateway)
// respondAuth is assigned only inside VerifyPeerCertificate, so an
// empty value means the callback did not compute it.
532 if respondAuth == "" {
534 return nil, "", "", httpserver.ErrorWithStatus(errors.New("BUG: no respondAuth"), http.StatusInternalServerError)
536 return tlsconn, requestAuth, respondAuth, nil