X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/c0924347a69157d3058a39d238fb0e0bacefa3a2..00c93619f7691c0828f5273bc457e2840dbdc084:/lib/controller/localdb/container_gateway.go diff --git a/lib/controller/localdb/container_gateway.go b/lib/controller/localdb/container_gateway.go index 77c5182e9c..e42a447308 100644 --- a/lib/controller/localdb/container_gateway.go +++ b/lib/controller/localdb/container_gateway.go @@ -19,16 +19,21 @@ import ( "io/ioutil" "net" "net/http" + "net/http/httputil" "net/url" + "os" "strings" "git.arvados.org/arvados.git/lib/controller/rpc" "git.arvados.org/arvados.git/lib/service" + "git.arvados.org/arvados.git/lib/webdavfs" "git.arvados.org/arvados.git/sdk/go/arvados" "git.arvados.org/arvados.git/sdk/go/auth" "git.arvados.org/arvados.git/sdk/go/ctxlog" "git.arvados.org/arvados.git/sdk/go/httpserver" + keepweb "git.arvados.org/arvados.git/services/keep-web" "github.com/hashicorp/yamux" + "golang.org/x/net/webdav" ) var ( @@ -36,6 +41,287 @@ var ( forceInternalURLForTest *arvados.URL ) +// ContainerRequestLog returns a WebDAV handler that reads logs from +// the indicated container request. 
It works by proxying the incoming +// HTTP request to +// +// - the container gateway, if there is an associated container that +// is running +// +// - a different controller process, if there is a running container +// whose gateway is accessible through a tunnel to a different +// controller process +// +// - keep-web, if saved logs exist and there is no gateway (or the +// associated container is finished) +// +// - an empty-collection stub, if there is no gateway and no saved +// log +// +// For an incoming request +// +// GET /arvados/v1/container_requests/{cr_uuid}/log/{c_uuid}{/c_log_path} +// +// The upstream request may be to {c_uuid}'s container gateway +// +// GET /arvados/v1/container_requests/{cr_uuid}/log/{c_uuid}{/c_log_path} +// X-Webdav-Prefix: /arvados/v1/container_requests/{cr_uuid}/log/{c_uuid} +// X-Webdav-Source: /log +// +// ...or the upstream request may be to keep-web (where {cr_log_uuid} +// is the container request log collection UUID) +// +// GET /arvados/v1/container_requests/{cr_uuid}/log/{c_uuid}{/c_log_path} +// Host: {cr_log_uuid}.internal +// X-Webdav-Prefix: /arvados/v1/container_requests/{cr_uuid}/log +// X-Arvados-Container-Uuid: {c_uuid} +// +// ...or the request may be handled locally using an empty-collection +// stub. +func (conn *Conn) ContainerRequestLog(ctx context.Context, opts arvados.ContainerLogOptions) (http.Handler, error) { + if opts.Method == "OPTIONS" && opts.Header.Get("Access-Control-Request-Method") != "" { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !keepweb.ServeCORSPreflight(w, opts.Header) { + // Inconceivable. We already checked + // for the only condition where + // ServeCORSPreflight returns false. 
+ httpserver.Error(w, "unhandled CORS preflight request", http.StatusInternalServerError) + } + }), nil + } + cr, err := conn.railsProxy.ContainerRequestGet(ctx, arvados.GetOptions{UUID: opts.UUID, Select: []string{"uuid", "container_uuid", "log_uuid"}}) + if err != nil { + if se := httpserver.HTTPStatusError(nil); errors.As(err, &se) && se.HTTPStatus() == http.StatusUnauthorized { + // Hint to WebDAV client that we accept HTTP basic auth. + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Www-Authenticate", "Basic realm=\"collections\"") + w.WriteHeader(http.StatusUnauthorized) + }), nil + } + return nil, err + } + ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{UUID: cr.ContainerUUID, Select: []string{"uuid", "state", "gateway_address"}}) + if err != nil { + return nil, err + } + // .../log/{ctr.UUID} is a directory where the currently + // assigned container's log data [will] appear (as opposed to + // previous attempts in .../log/{previous_ctr_uuid}). Requests + // that are outside that directory, and requests on a + // non-running container, are proxied to keep-web instead of + // going through the container gateway system. + // + // Side note: a depth>1 directory tree listing starting at + // .../{cr_uuid}/log will only include subdirectories for + // finished containers, i.e., will not include a subdirectory + // with log data for a current (unfinished) container UUID. + // In order to access live logs, a client must look up the + // container_uuid field of the container request record, and + // explicitly request a path under .../{cr_uuid}/log/{c_uuid}. 
+ if ctr.GatewayAddress == "" || + (ctr.State != arvados.ContainerStateLocked && ctr.State != arvados.ContainerStateRunning) || + !(opts.Path == "/"+ctr.UUID || strings.HasPrefix(opts.Path, "/"+ctr.UUID+"/")) { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + conn.serveContainerRequestLogViaKeepWeb(opts, cr, w, r) + }), nil + } + dial, arpc, err := conn.findGateway(ctx, ctr, opts.NoForward) + if err != nil { + return nil, err + } + if arpc != nil { + opts.NoForward = true + return arpc.ContainerRequestLog(ctx, opts) + } + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + r = r.WithContext(ctx) + var proxyReq *http.Request + var proxyErr error + var expectRespondAuth string + proxy := &httputil.ReverseProxy{ + // Our custom Transport: + // + // - Uses a custom dialer to connect to the + // gateway (either directly or through a + // tunnel set up through ContainerGatewayTunnel) + // + // - Verifies the gateway's TLS certificate + // using X-Arvados-Authorization headers. + // + // This involves modifying the outgoing + // request header in DialTLSContext. + // (ReverseProxy certainly doesn't expect us + // to do this, but it works.) + Transport: &http.Transport{ + DialTLSContext: func(ctx context.Context, network, addr string) (net.Conn, error) { + tlsconn, requestAuth, respondAuth, err := dial() + if err != nil { + return nil, err + } + proxyReq.Header.Set("X-Arvados-Authorization", requestAuth) + expectRespondAuth = respondAuth + return tlsconn, nil + }, + }, + Director: func(r *http.Request) { + // Scheme/host of incoming r.URL are + // irrelevant now, and may even be + // missing. Host is ignored by our + // DialTLSContext, but we need a + // generic syntactically correct URL + // for net/http to work with. 
+ r.URL.Scheme = "https" + r.URL.Host = "0.0.0.0:0" + r.Header.Set("X-Arvados-Container-Gateway-Uuid", ctr.UUID) + r.Header.Set("X-Webdav-Prefix", "/arvados/v1/container_requests/"+cr.UUID+"/log/"+ctr.UUID) + r.Header.Set("X-Webdav-Source", "/log") + proxyReq = r + }, + ModifyResponse: func(resp *http.Response) error { + if resp.Header.Get("X-Arvados-Authorization-Response") != expectRespondAuth { + // Note this is how we detect + // an attacker-in-the-middle. + return httpserver.ErrorWithStatus(errors.New("bad X-Arvados-Authorization-Response header"), http.StatusBadGateway) + } + resp.Header.Del("X-Arvados-Authorization-Response") + for hdr := range resp.Header { + // proxy.ServeHTTP adds each + // resp.Header to w.Header, + // which causes duplicate CORS + // and request-id headers, + // unless we do this. + w.Header().Del(hdr) + } + return nil + }, + ErrorHandler: func(w http.ResponseWriter, r *http.Request, err error) { + proxyErr = err + }, + } + proxy.ServeHTTP(w, r) + if proxyErr == nil { + // proxy succeeded + return + } + // If proxying to the container gateway fails, it + // might be caused by a race where crunch-run exited + // after we decided (above) the log was not final. + // In that case we should proxy to keep-web. + ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{ + UUID: ctr.UUID, + Select: []string{"uuid", "state", "gateway_address", "log"}, + }) + if err != nil { + // Lost access to the container record? 
+ httpserver.Error(w, "error re-fetching container record: "+err.Error(), http.StatusServiceUnavailable) + } else if ctr.State == arvados.ContainerStateLocked || ctr.State == arvados.ContainerStateRunning { + // No race, proxyErr was the best we can do + httpserver.Error(w, "proxy error: "+proxyErr.Error(), http.StatusServiceUnavailable) + } else { + conn.serveContainerRequestLogViaKeepWeb(opts, cr, w, r) + } + }), nil +} + +// serveContainerRequestLogViaKeepWeb handles a request for saved container +// log content by proxying to one of the configured keep-web servers. +// +// It tries to choose a keep-web server that is running on this host. +func (conn *Conn) serveContainerRequestLogViaKeepWeb(opts arvados.ContainerLogOptions, cr arvados.ContainerRequest, w http.ResponseWriter, r *http.Request) { + if cr.LogUUID == "" { + // Special case: if no log data exists yet, we serve + // an empty collection by ourselves instead of + // proxying to keep-web. + conn.serveEmptyDir("/arvados/v1/container_requests/"+cr.UUID+"/log", w, r) + return + } + myURL, _ := service.URLFromContext(r.Context()) + u := url.URL(myURL) + myHostname := u.Hostname() + var webdavBase arvados.URL + var ok bool + for webdavBase = range conn.cluster.Services.WebDAV.InternalURLs { + ok = true + u := url.URL(webdavBase) + if h := u.Hostname(); h == "127.0.0.1" || h == "0.0.0.0" || h == "::1" || h == myHostname { + // Prefer a keep-web service running on the + // same host as us. (If we don't find one, we + // pick one arbitrarily.) + break + } + } + if !ok { + httpserver.Error(w, "no internalURLs configured for WebDAV service", http.StatusInternalServerError) + return + } + proxy := &httputil.ReverseProxy{ + Director: func(r *http.Request) { + r.URL.Scheme = webdavBase.Scheme + r.URL.Host = webdavBase.Host + // Outgoing Host header specifies the + // collection ID. 
+ r.Host = cr.LogUUID + ".internal" + // We already checked permission on the + // container, so we can use a root token here + // instead of counting on the "access to log + // via container request and container" + // permission check, which can be racy when a + // request gets retried with a new container. + r.Header.Set("Authorization", "Bearer "+conn.cluster.SystemRootToken) + // We can't change r.URL.Path without + // confusing WebDAV (request body and response + // headers refer to the same paths) so we tell + // keep-web to map the log collection onto the + // container_requests/X/log/ namespace. + r.Header.Set("X-Webdav-Prefix", "/arvados/v1/container_requests/"+cr.UUID+"/log") + if len(opts.Path) >= 28 && opts.Path[6:13] == "-dz642-" { + // "/arvados/v1/container_requests/{crUUID}/log/{cUUID}..." + // proxies to + // "/log for container {cUUID}..." + r.Header.Set("X-Webdav-Prefix", "/arvados/v1/container_requests/"+cr.UUID+"/log/"+opts.Path[1:28]) + r.Header.Set("X-Webdav-Source", "/log for container "+opts.Path[1:28]+"/") + } + }, + ModifyResponse: func(resp *http.Response) error { + for hdr := range resp.Header { + // proxy.ServeHTTP adds each + // resp.Header to w.Header, which + // causes duplicate CORS and + // request-id headers, unless we do + // this. + w.Header().Del(hdr) + } + return nil + }, + } + if conn.cluster.TLS.Insecure { + proxy.Transport = &http.Transport{ + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: conn.cluster.TLS.Insecure, + }, + } + } + proxy.ServeHTTP(w, r) +} + +// serveEmptyDir handles read-only webdav requests as if there were an +// empty collection rooted at the given path. It's equivalent to +// proxying to an empty collection in keep-web, but avoids the extra +// hop. 
+func (conn *Conn) serveEmptyDir(path string, w http.ResponseWriter, r *http.Request) { + wh := webdav.Handler{ + Prefix: path, + FileSystem: webdav.NewMemFS(), + LockSystem: webdavfs.NoLockSystem, + Logger: func(r *http.Request, err error) { + if err != nil && !os.IsNotExist(err) { + ctxlog.FromContext(r.Context()).WithError(err).Info("webdav error on empty collection fs") + } + }, + } + wh.ServeHTTP(w, r) +} + // ContainerSSH returns a connection to the SSH server in the // appropriate crunch-run process on the worker node where the // specified container is running. @@ -47,7 +333,7 @@ func (conn *Conn) ContainerSSH(ctx context.Context, opts arvados.ContainerSSHOpt if err != nil { return sshconn, err } - ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{UUID: opts.UUID}) + ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{UUID: opts.UUID, Select: []string{"uuid", "state", "gateway_address", "interactive_session_started"}}) if err != nil { return sshconn, err } @@ -70,138 +356,36 @@ func (conn *Conn) ContainerSSH(ctx context.Context, opts arvados.ContainerSSHOpt } } - conn.gwTunnelsLock.Lock() - tunnel := conn.gwTunnels[opts.UUID] - conn.gwTunnelsLock.Unlock() - if ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked { return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("container is not running yet (state is %q)", ctr.State), http.StatusServiceUnavailable) } else if ctr.State != arvados.ContainerStateRunning { return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("container has ended (state is %q)", ctr.State), http.StatusGone) } - // targetHost is the value we'll use in the Host header in our - // "Upgrade: ssh" http request. It's just a placeholder - // "localhost", unless we decide to connect directly, in which - // case we'll set it to the gateway's external ip:host. (The - // gateway doesn't even look at it, but we might as well.) 
- targetHost := "localhost" - myURL, _ := service.URLFromContext(ctx) - - var rawconn net.Conn - if host, _, splitErr := net.SplitHostPort(ctr.GatewayAddress); splitErr == nil && host != "" && host != "127.0.0.1" { - // If crunch-run provided a GatewayAddress like - // "ipaddr:port", that means "ipaddr" is one of the - // external interfaces where the gateway is - // listening. In that case, it's the most - // reliable/direct option, so we use it even if a - // tunnel might also be available. - targetHost = ctr.GatewayAddress - rawconn, err = net.Dial("tcp", ctr.GatewayAddress) - if err != nil { - return sshconn, httpserver.ErrorWithStatus(err, http.StatusServiceUnavailable) - } - } else if tunnel != nil && !(forceProxyForTest && !opts.NoForward) { - // If we can't connect directly, and the gateway has - // established a yamux tunnel with us, connect through - // the tunnel. - // - // ...except: forceProxyForTest means we are emulating - // a situation where the gateway has established a - // yamux tunnel with controller B, and the - // ContainerSSH request arrives at controller A. If - // opts.NoForward==false then we are acting as A, so - // we pretend not to have a tunnel, and fall through - // to the "tunurl" case below. If opts.NoForward==true - // then the client is A and we are acting as B, so we - // connect to our tunnel. 
- rawconn, err = tunnel.Open() - if err != nil { - return sshconn, httpserver.ErrorWithStatus(err, http.StatusServiceUnavailable) - } - } else if ctr.GatewayAddress == "" { - return sshconn, httpserver.ErrorWithStatus(errors.New("container is running but gateway is not available"), http.StatusServiceUnavailable) - } else if tunurl := strings.TrimPrefix(ctr.GatewayAddress, "tunnel "); tunurl != ctr.GatewayAddress && - tunurl != "" && - tunurl != myURL.String() && - !opts.NoForward { - // If crunch-run provided a GatewayAddress like - // "tunnel https://10.0.0.10:1010/", that means the - // gateway has established a yamux tunnel with the - // controller process at the indicated InternalURL - // (which isn't us, otherwise we would have had - // "tunnel != nil" above). We need to proxy through to - // the other controller process in order to use the - // tunnel. - for u := range conn.cluster.Services.Controller.InternalURLs { - if u.String() == tunurl { - ctxlog.FromContext(ctx).Debugf("proxying ContainerSSH request to other controller at %s", u) - u := url.URL(u) - arpc := rpc.NewConn(conn.cluster.ClusterID, &u, conn.cluster.TLS.Insecure, rpc.PassthroughTokenProvider) - opts.NoForward = true - return arpc.ContainerSSH(ctx, opts) - } - } - ctxlog.FromContext(ctx).Warnf("container gateway provided a tunnel endpoint %s that is not one of Services.Controller.InternalURLs", tunurl) - return sshconn, httpserver.ErrorWithStatus(errors.New("container gateway is running but tunnel endpoint is invalid"), http.StatusServiceUnavailable) - } else { - return sshconn, httpserver.ErrorWithStatus(errors.New("container gateway is running but tunnel is down"), http.StatusServiceUnavailable) + dial, arpc, err := conn.findGateway(ctx, ctr, opts.NoForward) + if err != nil { + return sshconn, err + } + if arpc != nil { + opts.NoForward = true + return arpc.ContainerSSH(ctx, opts) } - // crunch-run uses a self-signed / unverifiable TLS - // certificate, so we use the following scheme to 
ensure we're - // not talking to a MITM. - // - // 1. Compute ctrKey = HMAC-SHA256(sysRootToken,ctrUUID) -- - // this will be the same ctrKey that a-d-c supplied to - // crunch-run in the GatewayAuthSecret env var. - // - // 2. Compute requestAuth = HMAC-SHA256(ctrKey,serverCert) and - // send it to crunch-run as the X-Arvados-Authorization - // header, proving that we know ctrKey. (Note a MITM cannot - // replay the proof to a real crunch-run server, because the - // real crunch-run server would have a different cert.) - // - // 3. Compute respondAuth = HMAC-SHA256(ctrKey,requestAuth) - // and ensure the server returns it in the - // X-Arvados-Authorization-Response header, proving that the - // server knows ctrKey. - var requestAuth, respondAuth string - tlsconn := tls.Client(rawconn, &tls.Config{ - InsecureSkipVerify: true, - VerifyPeerCertificate: func(rawCerts [][]byte, verifiedChains [][]*x509.Certificate) error { - if len(rawCerts) == 0 { - return errors.New("no certificate received, cannot compute authorization header") - } - h := hmac.New(sha256.New, []byte(conn.cluster.SystemRootToken)) - fmt.Fprint(h, opts.UUID) - authKey := fmt.Sprintf("%x", h.Sum(nil)) - h = hmac.New(sha256.New, []byte(authKey)) - h.Write(rawCerts[0]) - requestAuth = fmt.Sprintf("%x", h.Sum(nil)) - h.Reset() - h.Write([]byte(requestAuth)) - respondAuth = fmt.Sprintf("%x", h.Sum(nil)) - return nil - }, - }) - err = tlsconn.HandshakeContext(ctx) + tlsconn, requestAuth, respondAuth, err := dial() if err != nil { - return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("TLS handshake failed: %w", err), http.StatusBadGateway) - } - if respondAuth == "" { - tlsconn.Close() - return sshconn, httpserver.ErrorWithStatus(errors.New("BUG: no respondAuth"), http.StatusInternalServerError) + return sshconn, err } bufr := bufio.NewReader(tlsconn) bufw := bufio.NewWriter(tlsconn) u := url.URL{ Scheme: "http", - Host: targetHost, + Host: tlsconn.RemoteAddr().String(), Path: "/ssh", } postform := 
url.Values{ + // uuid is only needed for older crunch-run versions + // (current version uses X-Arvados-* header below) "uuid": {opts.UUID}, "detach_keys": {opts.DetachKeys}, "login_username": {opts.LoginUsername}, @@ -211,6 +395,7 @@ func (conn *Conn) ContainerSSH(ctx context.Context, opts arvados.ContainerSSHOpt bufw.WriteString("POST " + u.String() + " HTTP/1.1\r\n") bufw.WriteString("Host: " + u.Host + "\r\n") bufw.WriteString("Upgrade: ssh\r\n") + bufw.WriteString("X-Arvados-Container-Gateway-Uuid: " + opts.UUID + "\r\n") bufw.WriteString("X-Arvados-Authorization: " + requestAuth + "\r\n") bufw.WriteString("Content-Type: application/x-www-form-urlencoded\r\n") fmt.Fprintf(bufw, "Content-Length: %d\r\n", len(postdata)) @@ -308,3 +493,137 @@ func (conn *Conn) ContainerGatewayTunnel(ctx context.Context, opts arvados.Conta } return } + +type gatewayDialer func() (conn net.Conn, requestAuth, respondAuth string, err error) + +// findGateway figures out how to connect to ctr's gateway. +// +// If the gateway can be contacted directly or through a tunnel on +// this instance, the first return value is a non-nil dialer. +// +// If the gateway is only accessible through a tunnel through a +// different controller process, the second return value is a non-nil +// *rpc.Conn for that controller. +func (conn *Conn) findGateway(ctx context.Context, ctr arvados.Container, noForward bool) (gatewayDialer, *rpc.Conn, error) { + conn.gwTunnelsLock.Lock() + tunnel := conn.gwTunnels[ctr.UUID] + conn.gwTunnelsLock.Unlock() + + myURL, _ := service.URLFromContext(ctx) + + if host, _, splitErr := net.SplitHostPort(ctr.GatewayAddress); splitErr == nil && host != "" && host != "127.0.0.1" { + // If crunch-run provided a GatewayAddress like + // "ipaddr:port", that means "ipaddr" is one of the + // external interfaces where the gateway is + // listening. In that case, it's the most + // reliable/direct option, so we use it even if a + // tunnel might also be available. 
+ return func() (net.Conn, string, string, error) { + rawconn, err := (&net.Dialer{}).DialContext(ctx, "tcp", ctr.GatewayAddress) + if err != nil { + return nil, "", "", httpserver.ErrorWithStatus(err, http.StatusServiceUnavailable) + } + return conn.dialGatewayTLS(ctx, ctr, rawconn) + }, nil, nil + } + if tunnel != nil && !(forceProxyForTest && !noForward) { + // If we can't connect directly, and the gateway has + // established a yamux tunnel with us, connect through + // the tunnel. + // + // ...except: forceProxyForTest means we are emulating + // a situation where the gateway has established a + // yamux tunnel with controller B, and the + // ContainerSSH request arrives at controller A. If + // noForward==false then we are acting as A, so + // we pretend not to have a tunnel, and fall through + // to the "tunurl" case below. If noForward==true + // then the client is A and we are acting as B, so we + // connect to our tunnel. + return func() (net.Conn, string, string, error) { + rawconn, err := tunnel.Open() + if err != nil { + return nil, "", "", httpserver.ErrorWithStatus(err, http.StatusServiceUnavailable) + } + return conn.dialGatewayTLS(ctx, ctr, rawconn) + }, nil, nil + } + if tunurl := strings.TrimPrefix(ctr.GatewayAddress, "tunnel "); tunurl != ctr.GatewayAddress && + tunurl != "" && + tunurl != myURL.String() && + !noForward { + // If crunch-run provided a GatewayAddress like + // "tunnel https://10.0.0.10:1010/", that means the + // gateway has established a yamux tunnel with the + // controller process at the indicated InternalURL + // (which isn't us, otherwise we would have had + // "tunnel != nil" above). We need to proxy through to + // the other controller process in order to use the + // tunnel. 
+ for u := range conn.cluster.Services.Controller.InternalURLs { + if u.String() == tunurl { + ctxlog.FromContext(ctx).Debugf("connecting to container gateway through other controller at %s", u) + u := url.URL(u) + return nil, rpc.NewConn(conn.cluster.ClusterID, &u, conn.cluster.TLS.Insecure, rpc.PassthroughTokenProvider), nil + } + } + ctxlog.FromContext(ctx).Warnf("container gateway provided a tunnel endpoint %s that is not one of Services.Controller.InternalURLs", tunurl) + return nil, nil, httpserver.ErrorWithStatus(errors.New("container gateway is running but tunnel endpoint is invalid"), http.StatusServiceUnavailable) + } + if ctr.GatewayAddress == "" { + return nil, nil, httpserver.ErrorWithStatus(errors.New("container is running but gateway is not available"), http.StatusServiceUnavailable) + } else { + return nil, nil, httpserver.ErrorWithStatus(errors.New("container is running but tunnel is down"), http.StatusServiceUnavailable) + } +} + +// dialGatewayTLS negotiates a TLS connection to a container gateway +// over the given raw connection. +func (conn *Conn) dialGatewayTLS(ctx context.Context, ctr arvados.Container, rawconn net.Conn) (*tls.Conn, string, string, error) { + // crunch-run uses a self-signed / unverifiable TLS + // certificate, so we use the following scheme to ensure we're + // not talking to an attacker-in-the-middle. + // + // 1. Compute ctrKey = HMAC-SHA256(sysRootToken,ctrUUID) -- + // this will be the same ctrKey that a-d-c supplied to + // crunch-run in the GatewayAuthSecret env var. + // + // 2. Compute requestAuth = HMAC-SHA256(ctrKey,serverCert) and + // send it to crunch-run as the X-Arvados-Authorization + // header, proving that we know ctrKey. (Note a MITM cannot + // replay the proof to a real crunch-run server, because the + // real crunch-run server would have a different cert.) + // + // 3. 
Compute respondAuth = HMAC-SHA256(ctrKey,requestAuth) + // and ensure the server returns it in the + // X-Arvados-Authorization-Response header, proving that the + // server knows ctrKey. + var requestAuth, respondAuth string + tlsconn := tls.Client(rawconn, &tls.Config{ + InsecureSkipVerify: true, + VerifyPeerCertificate: func(rawCerts [][]byte, verifiedChains [][]*x509.Certificate) error { + if len(rawCerts) == 0 { + return errors.New("no certificate received, cannot compute authorization header") + } + h := hmac.New(sha256.New, []byte(conn.cluster.SystemRootToken)) + fmt.Fprint(h, ctr.UUID) + authKey := fmt.Sprintf("%x", h.Sum(nil)) + h = hmac.New(sha256.New, []byte(authKey)) + h.Write(rawCerts[0]) + requestAuth = fmt.Sprintf("%x", h.Sum(nil)) + h.Reset() + h.Write([]byte(requestAuth)) + respondAuth = fmt.Sprintf("%x", h.Sum(nil)) + return nil + }, + }) + err := tlsconn.HandshakeContext(ctx) + if err != nil { + return nil, "", "", httpserver.ErrorWithStatus(fmt.Errorf("TLS handshake failed: %w", err), http.StatusBadGateway) + } + if respondAuth == "" { + tlsconn.Close() + return nil, "", "", httpserver.ErrorWithStatus(errors.New("BUG: no respondAuth"), http.StatusInternalServerError) + } + return tlsconn, requestAuth, respondAuth, nil +}