Merge branch '19975-oom-resubmit' refs #19975
[arvados.git] / lib / controller / localdb / container_gateway.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package localdb
6
7 import (
8         "bufio"
9         "bytes"
10         "context"
11         "crypto/hmac"
12         "crypto/sha256"
13         "crypto/subtle"
14         "crypto/tls"
15         "crypto/x509"
16         "errors"
17         "fmt"
18         "io"
19         "io/ioutil"
20         "net"
21         "net/http"
22         "net/url"
23         "strings"
24
25         "git.arvados.org/arvados.git/lib/controller/rpc"
26         "git.arvados.org/arvados.git/lib/service"
27         "git.arvados.org/arvados.git/sdk/go/arvados"
28         "git.arvados.org/arvados.git/sdk/go/auth"
29         "git.arvados.org/arvados.git/sdk/go/ctxlog"
30         "git.arvados.org/arvados.git/sdk/go/httpserver"
31         "github.com/hashicorp/yamux"
32 )
33
34 var (
35         forceProxyForTest       = false
36         forceInternalURLForTest *arvados.URL
37 )
38
39 // ContainerSSH returns a connection to the SSH server in the
40 // appropriate crunch-run process on the worker node where the
41 // specified container is running.
42 //
43 // If the returned error is nil, the caller is responsible for closing
44 // sshconn.Conn.
45 func (conn *Conn) ContainerSSH(ctx context.Context, opts arvados.ContainerSSHOptions) (sshconn arvados.ConnectionResponse, err error) {
46         user, err := conn.railsProxy.UserGetCurrent(ctx, arvados.GetOptions{})
47         if err != nil {
48                 return sshconn, err
49         }
50         ctr, err := conn.railsProxy.ContainerGet(ctx, arvados.GetOptions{UUID: opts.UUID})
51         if err != nil {
52                 return sshconn, err
53         }
54         ctxRoot := auth.NewContext(ctx, &auth.Credentials{Tokens: []string{conn.cluster.SystemRootToken}})
55         if !user.IsAdmin || !conn.cluster.Containers.ShellAccess.Admin {
56                 if !conn.cluster.Containers.ShellAccess.User {
57                         return sshconn, httpserver.ErrorWithStatus(errors.New("shell access is disabled in config"), http.StatusServiceUnavailable)
58                 }
59                 crs, err := conn.railsProxy.ContainerRequestList(ctxRoot, arvados.ListOptions{Limit: -1, Filters: []arvados.Filter{{"container_uuid", "=", opts.UUID}}})
60                 if err != nil {
61                         return sshconn, err
62                 }
63                 for _, cr := range crs.Items {
64                         if cr.ModifiedByUserUUID != user.UUID {
65                                 return sshconn, httpserver.ErrorWithStatus(errors.New("permission denied: container is associated with requests submitted by other users"), http.StatusForbidden)
66                         }
67                 }
68                 if crs.ItemsAvailable != len(crs.Items) {
69                         return sshconn, httpserver.ErrorWithStatus(errors.New("incomplete response while checking permission"), http.StatusInternalServerError)
70                 }
71         }
72
73         conn.gwTunnelsLock.Lock()
74         tunnel := conn.gwTunnels[opts.UUID]
75         conn.gwTunnelsLock.Unlock()
76
77         if ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked {
78                 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("container is not running yet (state is %q)", ctr.State), http.StatusServiceUnavailable)
79         } else if ctr.State != arvados.ContainerStateRunning {
80                 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("container has ended (state is %q)", ctr.State), http.StatusGone)
81         }
82
83         // targetHost is the value we'll use in the Host header in our
84         // "Upgrade: ssh" http request. It's just a placeholder
85         // "localhost", unless we decide to connect directly, in which
86         // case we'll set it to the gateway's external ip:host. (The
87         // gateway doesn't even look at it, but we might as well.)
88         targetHost := "localhost"
89         myURL, _ := service.URLFromContext(ctx)
90
91         var rawconn net.Conn
92         if host, _, splitErr := net.SplitHostPort(ctr.GatewayAddress); splitErr == nil && host != "" && host != "127.0.0.1" {
93                 // If crunch-run provided a GatewayAddress like
94                 // "ipaddr:port", that means "ipaddr" is one of the
95                 // external interfaces where the gateway is
96                 // listening. In that case, it's the most
97                 // reliable/direct option, so we use it even if a
98                 // tunnel might also be available.
99                 targetHost = ctr.GatewayAddress
100                 rawconn, err = net.Dial("tcp", ctr.GatewayAddress)
101                 if err != nil {
102                         return sshconn, httpserver.ErrorWithStatus(err, http.StatusServiceUnavailable)
103                 }
104         } else if tunnel != nil && !(forceProxyForTest && !opts.NoForward) {
105                 // If we can't connect directly, and the gateway has
106                 // established a yamux tunnel with us, connect through
107                 // the tunnel.
108                 //
109                 // ...except: forceProxyForTest means we are emulating
110                 // a situation where the gateway has established a
111                 // yamux tunnel with controller B, and the
112                 // ContainerSSH request arrives at controller A. If
113                 // opts.NoForward==false then we are acting as A, so
114                 // we pretend not to have a tunnel, and fall through
115                 // to the "tunurl" case below. If opts.NoForward==true
116                 // then the client is A and we are acting as B, so we
117                 // connect to our tunnel.
118                 rawconn, err = tunnel.Open()
119                 if err != nil {
120                         return sshconn, httpserver.ErrorWithStatus(err, http.StatusServiceUnavailable)
121                 }
122         } else if ctr.GatewayAddress == "" {
123                 return sshconn, httpserver.ErrorWithStatus(errors.New("container is running but gateway is not available"), http.StatusServiceUnavailable)
124         } else if tunurl := strings.TrimPrefix(ctr.GatewayAddress, "tunnel "); tunurl != ctr.GatewayAddress &&
125                 tunurl != "" &&
126                 tunurl != myURL.String() &&
127                 !opts.NoForward {
128                 // If crunch-run provided a GatewayAddress like
129                 // "tunnel https://10.0.0.10:1010/", that means the
130                 // gateway has established a yamux tunnel with the
131                 // controller process at the indicated InternalURL
132                 // (which isn't us, otherwise we would have had
133                 // "tunnel != nil" above). We need to proxy through to
134                 // the other controller process in order to use the
135                 // tunnel.
136                 for u := range conn.cluster.Services.Controller.InternalURLs {
137                         if u.String() == tunurl {
138                                 ctxlog.FromContext(ctx).Debugf("proxying ContainerSSH request to other controller at %s", u)
139                                 u := url.URL(u)
140                                 arpc := rpc.NewConn(conn.cluster.ClusterID, &u, conn.cluster.TLS.Insecure, rpc.PassthroughTokenProvider)
141                                 opts.NoForward = true
142                                 return arpc.ContainerSSH(ctx, opts)
143                         }
144                 }
145                 ctxlog.FromContext(ctx).Warnf("container gateway provided a tunnel endpoint %s that is not one of Services.Controller.InternalURLs", tunurl)
146                 return sshconn, httpserver.ErrorWithStatus(errors.New("container gateway is running but tunnel endpoint is invalid"), http.StatusServiceUnavailable)
147         } else {
148                 return sshconn, httpserver.ErrorWithStatus(errors.New("container gateway is running but tunnel is down"), http.StatusServiceUnavailable)
149         }
150
151         // crunch-run uses a self-signed / unverifiable TLS
152         // certificate, so we use the following scheme to ensure we're
153         // not talking to a MITM.
154         //
155         // 1. Compute ctrKey = HMAC-SHA256(sysRootToken,ctrUUID) --
156         // this will be the same ctrKey that a-d-c supplied to
157         // crunch-run in the GatewayAuthSecret env var.
158         //
159         // 2. Compute requestAuth = HMAC-SHA256(ctrKey,serverCert) and
160         // send it to crunch-run as the X-Arvados-Authorization
161         // header, proving that we know ctrKey. (Note a MITM cannot
162         // replay the proof to a real crunch-run server, because the
163         // real crunch-run server would have a different cert.)
164         //
165         // 3. Compute respondAuth = HMAC-SHA256(ctrKey,requestAuth)
166         // and ensure the server returns it in the
167         // X-Arvados-Authorization-Response header, proving that the
168         // server knows ctrKey.
169         var requestAuth, respondAuth string
170         tlsconn := tls.Client(rawconn, &tls.Config{
171                 InsecureSkipVerify: true,
172                 VerifyPeerCertificate: func(rawCerts [][]byte, verifiedChains [][]*x509.Certificate) error {
173                         if len(rawCerts) == 0 {
174                                 return errors.New("no certificate received, cannot compute authorization header")
175                         }
176                         h := hmac.New(sha256.New, []byte(conn.cluster.SystemRootToken))
177                         fmt.Fprint(h, opts.UUID)
178                         authKey := fmt.Sprintf("%x", h.Sum(nil))
179                         h = hmac.New(sha256.New, []byte(authKey))
180                         h.Write(rawCerts[0])
181                         requestAuth = fmt.Sprintf("%x", h.Sum(nil))
182                         h.Reset()
183                         h.Write([]byte(requestAuth))
184                         respondAuth = fmt.Sprintf("%x", h.Sum(nil))
185                         return nil
186                 },
187         })
188         err = tlsconn.HandshakeContext(ctx)
189         if err != nil {
190                 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("TLS handshake failed: %w", err), http.StatusBadGateway)
191         }
192         if respondAuth == "" {
193                 tlsconn.Close()
194                 return sshconn, httpserver.ErrorWithStatus(errors.New("BUG: no respondAuth"), http.StatusInternalServerError)
195         }
196         bufr := bufio.NewReader(tlsconn)
197         bufw := bufio.NewWriter(tlsconn)
198
199         u := url.URL{
200                 Scheme: "http",
201                 Host:   targetHost,
202                 Path:   "/ssh",
203         }
204         postform := url.Values{
205                 "uuid":           {opts.UUID},
206                 "detach_keys":    {opts.DetachKeys},
207                 "login_username": {opts.LoginUsername},
208                 "no_forward":     {fmt.Sprintf("%v", opts.NoForward)},
209         }
210         postdata := postform.Encode()
211         bufw.WriteString("POST " + u.String() + " HTTP/1.1\r\n")
212         bufw.WriteString("Host: " + u.Host + "\r\n")
213         bufw.WriteString("Upgrade: ssh\r\n")
214         bufw.WriteString("X-Arvados-Authorization: " + requestAuth + "\r\n")
215         bufw.WriteString("Content-Type: application/x-www-form-urlencoded\r\n")
216         fmt.Fprintf(bufw, "Content-Length: %d\r\n", len(postdata))
217         bufw.WriteString("\r\n")
218         bufw.WriteString(postdata)
219         bufw.Flush()
220         resp, err := http.ReadResponse(bufr, &http.Request{Method: "POST"})
221         if err != nil {
222                 tlsconn.Close()
223                 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("error reading http response from gateway: %w", err), http.StatusBadGateway)
224         }
225         defer resp.Body.Close()
226         if resp.StatusCode != http.StatusSwitchingProtocols {
227                 body, _ := ioutil.ReadAll(io.LimitReader(resp.Body, 1000))
228                 tlsconn.Close()
229                 return sshconn, httpserver.ErrorWithStatus(fmt.Errorf("unexpected status %s %q", resp.Status, body), http.StatusBadGateway)
230         }
231         if strings.ToLower(resp.Header.Get("Upgrade")) != "ssh" ||
232                 strings.ToLower(resp.Header.Get("Connection")) != "upgrade" {
233                 tlsconn.Close()
234                 return sshconn, httpserver.ErrorWithStatus(errors.New("bad upgrade"), http.StatusBadGateway)
235         }
236         if resp.Header.Get("X-Arvados-Authorization-Response") != respondAuth {
237                 tlsconn.Close()
238                 return sshconn, httpserver.ErrorWithStatus(errors.New("bad X-Arvados-Authorization-Response header"), http.StatusBadGateway)
239         }
240
241         if !ctr.InteractiveSessionStarted {
242                 _, err = conn.railsProxy.ContainerUpdate(ctxRoot, arvados.UpdateOptions{
243                         UUID: opts.UUID,
244                         Attrs: map[string]interface{}{
245                                 "interactive_session_started": true,
246                         },
247                 })
248                 if err != nil {
249                         tlsconn.Close()
250                         return sshconn, httpserver.ErrorWithStatus(err, http.StatusInternalServerError)
251                 }
252         }
253
254         sshconn.Conn = tlsconn
255         sshconn.Bufrw = &bufio.ReadWriter{Reader: bufr, Writer: bufw}
256         sshconn.Logger = ctxlog.FromContext(ctx)
257         sshconn.Header = http.Header{"Upgrade": {"ssh"}}
258         return sshconn, nil
259 }
260
261 // ContainerGatewayTunnel sets up a tunnel enabling us (controller) to
262 // connect to the caller's (crunch-run's) gateway server.
263 func (conn *Conn) ContainerGatewayTunnel(ctx context.Context, opts arvados.ContainerGatewayTunnelOptions) (resp arvados.ConnectionResponse, err error) {
264         h := hmac.New(sha256.New, []byte(conn.cluster.SystemRootToken))
265         fmt.Fprint(h, opts.UUID)
266         authSecret := fmt.Sprintf("%x", h.Sum(nil))
267         if subtle.ConstantTimeCompare([]byte(authSecret), []byte(opts.AuthSecret)) != 1 {
268                 ctxlog.FromContext(ctx).Info("received incorrect auth_secret")
269                 return resp, httpserver.ErrorWithStatus(errors.New("authentication error"), http.StatusUnauthorized)
270         }
271
272         muxconn, clientconn := net.Pipe()
273         tunnel, err := yamux.Server(muxconn, nil)
274         if err != nil {
275                 clientconn.Close()
276                 return resp, httpserver.ErrorWithStatus(err, http.StatusInternalServerError)
277         }
278
279         conn.gwTunnelsLock.Lock()
280         if conn.gwTunnels == nil {
281                 conn.gwTunnels = map[string]*yamux.Session{opts.UUID: tunnel}
282         } else {
283                 conn.gwTunnels[opts.UUID] = tunnel
284         }
285         conn.gwTunnelsLock.Unlock()
286
287         go func() {
288                 <-tunnel.CloseChan()
289                 conn.gwTunnelsLock.Lock()
290                 if conn.gwTunnels[opts.UUID] == tunnel {
291                         delete(conn.gwTunnels, opts.UUID)
292                 }
293                 conn.gwTunnelsLock.Unlock()
294         }()
295
296         // Assuming we're acting as the backend of an http server,
297         // lib/controller/router will call resp's ServeHTTP handler,
298         // which upgrades the incoming http connection to a raw socket
299         // and connects it to our yamux.Server through our net.Pipe().
300         resp.Conn = clientconn
301         resp.Bufrw = &bufio.ReadWriter{Reader: bufio.NewReader(&bytes.Buffer{}), Writer: bufio.NewWriter(&bytes.Buffer{})}
302         resp.Logger = ctxlog.FromContext(ctx)
303         resp.Header = http.Header{"Upgrade": {"tunnel"}}
304         if u, ok := service.URLFromContext(ctx); ok {
305                 resp.Header.Set("X-Arvados-Internal-Url", u.String())
306         } else if forceInternalURLForTest != nil {
307                 resp.Header.Set("X-Arvados-Internal-Url", forceInternalURLForTest.String())
308         }
309         return
310 }