19886: crunch-run records initial log with PDH
[arvados.git] / lib / crunchrun / container_gateway.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package crunchrun
6
7 import (
8         "crypto/hmac"
9         "crypto/rand"
10         "crypto/rsa"
11         "crypto/sha256"
12         "crypto/tls"
13         "fmt"
14         "io"
15         "net"
16         "net/http"
17         "net/url"
18         "os"
19         "os/exec"
20         "sync"
21         "syscall"
22         "time"
23
24         "git.arvados.org/arvados.git/lib/controller/rpc"
25         "git.arvados.org/arvados.git/lib/selfsigned"
26         "git.arvados.org/arvados.git/sdk/go/arvados"
27         "git.arvados.org/arvados.git/sdk/go/auth"
28         "git.arvados.org/arvados.git/sdk/go/ctxlog"
29         "git.arvados.org/arvados.git/sdk/go/httpserver"
30         "github.com/creack/pty"
31         "github.com/google/shlex"
32         "github.com/hashicorp/yamux"
33         "golang.org/x/crypto/ssh"
34         "golang.org/x/net/context"
35 )
36
// GatewayTarget is the container backend a Gateway talks to: it can
// report the container's network address and build commands that run
// inside the container.
type GatewayTarget interface {
	// IPAddress returns the IP address inside the container.
	IPAddress() (string, error)

	// InjectCommand returns a command that, when run, will
	// execute cmd inside the container.
	InjectCommand(ctx context.Context, detachKeys, username string, usingTTY bool, cmd []string) (*exec.Cmd, error)
}
44
// GatewayTargetStub is a GatewayTarget implementation with no real
// container behind it: injected commands run directly as local
// processes, and the reported container address is the loopback
// address.
type GatewayTargetStub struct{}

// IPAddress always returns the IPv4 loopback address and a nil error.
func (GatewayTargetStub) IPAddress() (string, error) { return "127.0.0.1", nil }

// InjectCommand returns a command that runs cmd[0] with arguments
// cmd[1:], bound to ctx. The detachKeys, username, and usingTTY
// arguments are ignored.
//
// An empty cmd yields an error rather than a panic.
func (GatewayTargetStub) InjectCommand(ctx context.Context, detachKeys, username string, usingTTY bool, cmd []string) (*exec.Cmd, error) {
	if len(cmd) == 0 {
		return nil, fmt.Errorf("cannot inject an empty command")
	}
	return exec.CommandContext(ctx, cmd[0], cmd[1:]...), nil
}
51
52 type Gateway struct {
53         ContainerUUID string
54         // Caller should set Address to "", or "host:0" or "host:port"
55         // where host is a known external IP address; port is a
56         // desired port number to listen on; and ":0" chooses an
57         // available dynamic port.
58         //
59         // If Address is "", Start() listens only on the loopback
60         // interface (and changes Address to "127.0.0.1:port").
61         // Otherwise it listens on all interfaces.
62         //
63         // If Address is "host:0", Start() updates Address to
64         // "host:port".
65         Address    string
66         AuthSecret string
67         Target     GatewayTarget
68         Log        interface {
69                 Printf(fmt string, args ...interface{})
70         }
71         // If non-nil, set up a ContainerGatewayTunnel, so that the
72         // controller can connect to us even if our external IP
73         // address is unknown or not routable from controller.
74         ArvadosClient *arvados.Client
75
76         // When a tunnel is connected or reconnected, this func (if
77         // not nil) will be called with the InternalURL of the
78         // controller process at the other end of the tunnel.
79         UpdateTunnelURL func(url string)
80
81         sshConfig   ssh.ServerConfig
82         requestAuth string
83         respondAuth string
84 }
85
86 // Start starts an http server that allows authenticated clients to open an
87 // interactive "docker exec" session and (in future) connect to tcp ports
88 // inside the docker container.
89 func (gw *Gateway) Start() error {
90         gw.sshConfig = ssh.ServerConfig{
91                 NoClientAuth: true,
92                 PasswordCallback: func(c ssh.ConnMetadata, pass []byte) (*ssh.Permissions, error) {
93                         if c.User() == "_" {
94                                 return nil, nil
95                         }
96                         return nil, fmt.Errorf("cannot specify user %q via ssh client", c.User())
97                 },
98                 PublicKeyCallback: func(c ssh.ConnMetadata, pubKey ssh.PublicKey) (*ssh.Permissions, error) {
99                         if c.User() == "_" {
100                                 return &ssh.Permissions{
101                                         Extensions: map[string]string{
102                                                 "pubkey-fp": ssh.FingerprintSHA256(pubKey),
103                                         },
104                                 }, nil
105                         }
106                         return nil, fmt.Errorf("cannot specify user %q via ssh client", c.User())
107                 },
108         }
109         pvt, err := rsa.GenerateKey(rand.Reader, 2048)
110         if err != nil {
111                 return err
112         }
113         err = pvt.Validate()
114         if err != nil {
115                 return err
116         }
117         signer, err := ssh.NewSignerFromKey(pvt)
118         if err != nil {
119                 return err
120         }
121         gw.sshConfig.AddHostKey(signer)
122
123         // Address (typically provided by arvados-dispatch-cloud) is
124         // HOST:PORT where HOST is our IP address or hostname as seen
125         // from arvados-controller, and PORT is either the desired
126         // port where we should run our gateway server, or "0" if we
127         // should choose an available port.
128         extAddr := gw.Address
129         // Generally we can't know which local interface corresponds
130         // to an externally reachable IP address, so if we expect to
131         // be reachable by external hosts, we listen on all
132         // interfaces.
133         listenHost := ""
134         if extAddr == "" {
135                 // If the dispatcher doesn't tell us our external IP
136                 // address, controller will only be able to connect
137                 // through the tunnel (see runTunnel), so our gateway
138                 // server only needs to listen on the loopback
139                 // interface.
140                 extAddr = "127.0.0.1:0"
141                 listenHost = "127.0.0.1"
142         }
143         extHost, extPort, err := net.SplitHostPort(extAddr)
144         if err != nil {
145                 return err
146         }
147         cert, err := selfsigned.CertGenerator{}.Generate()
148         if err != nil {
149                 return err
150         }
151         h := hmac.New(sha256.New, []byte(gw.AuthSecret))
152         h.Write(cert.Certificate[0])
153         gw.requestAuth = fmt.Sprintf("%x", h.Sum(nil))
154         h.Reset()
155         h.Write([]byte(gw.requestAuth))
156         gw.respondAuth = fmt.Sprintf("%x", h.Sum(nil))
157
158         srv := &httpserver.Server{
159                 Server: http.Server{
160                         Handler: http.HandlerFunc(gw.handleSSH),
161                         TLSConfig: &tls.Config{
162                                 Certificates: []tls.Certificate{cert},
163                         },
164                 },
165                 Addr: net.JoinHostPort(listenHost, extPort),
166         }
167         err = srv.Start()
168         if err != nil {
169                 return err
170         }
171         go func() {
172                 err := srv.Wait()
173                 gw.Log.Printf("gateway server stopped: %s", err)
174         }()
175         // Get the port number we are listening on (extPort might be
176         // "0" or a port name, in which case this will be different).
177         _, listenPort, err := net.SplitHostPort(srv.Addr)
178         if err != nil {
179                 return err
180         }
181         // When changing state to Running, the caller will want to set
182         // gateway_address to a "HOST:PORT" that, if controller
183         // connects to it, will reach this gateway server.
184         //
185         // The most likely thing to work is: HOST is our external
186         // hostname/IP as provided by the caller
187         // (arvados-dispatch-cloud) or 127.0.0.1 to indicate
188         // non-tunnel connections aren't available; and PORT is the
189         // port number we are listening on.
190         gw.Address = net.JoinHostPort(extHost, listenPort)
191         gw.Log.Printf("gateway server listening at %s", gw.Address)
192         if gw.ArvadosClient != nil {
193                 go gw.maintainTunnel(gw.Address)
194         }
195         return nil
196 }
197
198 func (gw *Gateway) maintainTunnel(addr string) {
199         for ; ; time.Sleep(5 * time.Second) {
200                 err := gw.runTunnel(addr)
201                 gw.Log.Printf("runTunnel: %s", err)
202         }
203 }
204
205 // runTunnel connects to controller and sets up a tunnel through
206 // which controller can connect to the gateway server at the given
207 // addr.
208 func (gw *Gateway) runTunnel(addr string) error {
209         ctx := auth.NewContext(context.Background(), auth.NewCredentials(gw.ArvadosClient.AuthToken))
210         arpc := rpc.NewConn("", &url.URL{Scheme: "https", Host: gw.ArvadosClient.APIHost}, gw.ArvadosClient.Insecure, rpc.PassthroughTokenProvider)
211         tun, err := arpc.ContainerGatewayTunnel(ctx, arvados.ContainerGatewayTunnelOptions{
212                 UUID:       gw.ContainerUUID,
213                 AuthSecret: gw.AuthSecret,
214         })
215         if err != nil {
216                 return fmt.Errorf("error creating gateway tunnel: %s", err)
217         }
218         mux, err := yamux.Client(tun.Conn, nil)
219         if err != nil {
220                 return fmt.Errorf("error setting up mux client end: %s", err)
221         }
222         if url := tun.Header.Get("X-Arvados-Internal-Url"); url != "" && gw.UpdateTunnelURL != nil {
223                 gw.UpdateTunnelURL(url)
224         }
225         for {
226                 muxconn, err := mux.AcceptStream()
227                 if err != nil {
228                         return err
229                 }
230                 gw.Log.Printf("tunnel connection %d started", muxconn.StreamID())
231                 go func() {
232                         defer muxconn.Close()
233                         gwconn, err := net.Dial("tcp", addr)
234                         if err != nil {
235                                 gw.Log.Printf("tunnel connection %d: error connecting to %s: %s", muxconn.StreamID(), addr, err)
236                                 return
237                         }
238                         defer gwconn.Close()
239                         var wg sync.WaitGroup
240                         wg.Add(2)
241                         go func() {
242                                 defer wg.Done()
243                                 _, err := io.Copy(gwconn, muxconn)
244                                 if err != nil {
245                                         gw.Log.Printf("tunnel connection %d: mux end: %s", muxconn.StreamID(), err)
246                                 }
247                                 gwconn.Close()
248                         }()
249                         go func() {
250                                 defer wg.Done()
251                                 _, err := io.Copy(muxconn, gwconn)
252                                 if err != nil {
253                                         gw.Log.Printf("tunnel connection %d: gateway end: %s", muxconn.StreamID(), err)
254                                 }
255                                 muxconn.Close()
256                         }()
257                         wg.Wait()
258                         gw.Log.Printf("tunnel connection %d finished", muxconn.StreamID())
259                 }()
260         }
261 }
262
263 // handleSSH connects to an SSH server that allows the caller to run
264 // interactive commands as root (or any other desired user) inside the
265 // container. The tunnel itself can only be created by an
266 // authenticated caller, so the SSH server itself is wide open (any
267 // password or key will be accepted).
268 //
269 // Requests must have path "/ssh" and the following headers:
270 //
271 // Connection: upgrade
272 // Upgrade: ssh
273 // X-Arvados-Target-Uuid: uuid of container
274 // X-Arvados-Authorization: must match
275 // hmac(AuthSecret,certfingerprint) (this prevents other containers
276 // and shell nodes from connecting directly)
277 //
278 // Optional headers:
279 //
280 // X-Arvados-Detach-Keys: argument to "docker exec --detach-keys",
281 // e.g., "ctrl-p,ctrl-q"
282 // X-Arvados-Login-Username: argument to "docker exec --user": account
283 // used to run command(s) inside the container.
284 func (gw *Gateway) handleSSH(w http.ResponseWriter, req *http.Request) {
285         // In future we'll handle browser traffic too, but for now the
286         // only traffic we expect is an SSH tunnel from
287         // (*lib/controller/localdb.Conn)ContainerSSH()
288         if req.Method != "POST" || req.Header.Get("Upgrade") != "ssh" {
289                 http.Error(w, "path not found", http.StatusNotFound)
290                 return
291         }
292         req.ParseForm()
293         if want := req.Form.Get("uuid"); want != gw.ContainerUUID {
294                 http.Error(w, fmt.Sprintf("misdirected request: meant for %q but received by crunch-run %q", want, gw.ContainerUUID), http.StatusBadGateway)
295                 return
296         }
297         if req.Header.Get("X-Arvados-Authorization") != gw.requestAuth {
298                 http.Error(w, "bad X-Arvados-Authorization header", http.StatusUnauthorized)
299                 return
300         }
301         detachKeys := req.Form.Get("detach_keys")
302         username := req.Form.Get("login_username")
303         if username == "" {
304                 username = "root"
305         }
306         hj, ok := w.(http.Hijacker)
307         if !ok {
308                 http.Error(w, "ResponseWriter does not support connection upgrade", http.StatusInternalServerError)
309                 return
310         }
311         netconn, _, err := hj.Hijack()
312         if !ok {
313                 http.Error(w, err.Error(), http.StatusInternalServerError)
314                 return
315         }
316         defer netconn.Close()
317         w.Header().Set("Connection", "upgrade")
318         w.Header().Set("Upgrade", "ssh")
319         w.Header().Set("X-Arvados-Authorization-Response", gw.respondAuth)
320         netconn.Write([]byte("HTTP/1.1 101 Switching Protocols\r\n"))
321         w.Header().Write(netconn)
322         netconn.Write([]byte("\r\n"))
323
324         ctx := req.Context()
325
326         conn, newchans, reqs, err := ssh.NewServerConn(netconn, &gw.sshConfig)
327         if err == io.EOF {
328                 return
329         } else if err != nil {
330                 gw.Log.Printf("ssh.NewServerConn: %s", err)
331                 return
332         }
333         defer conn.Close()
334         go ssh.DiscardRequests(reqs)
335         for newch := range newchans {
336                 switch newch.ChannelType() {
337                 case "direct-tcpip":
338                         go gw.handleDirectTCPIP(ctx, newch)
339                 case "session":
340                         go gw.handleSession(ctx, newch, detachKeys, username)
341                 default:
342                         go newch.Reject(ssh.UnknownChannelType, fmt.Sprintf("unsupported channel type %q", newch.ChannelType()))
343                 }
344         }
345 }
346
347 func (gw *Gateway) handleDirectTCPIP(ctx context.Context, newch ssh.NewChannel) {
348         ch, reqs, err := newch.Accept()
349         if err != nil {
350                 gw.Log.Printf("accept direct-tcpip channel: %s", err)
351                 return
352         }
353         defer ch.Close()
354         go ssh.DiscardRequests(reqs)
355
356         // RFC 4254 7.2 (copy of channelOpenDirectMsg in
357         // golang.org/x/crypto/ssh)
358         var msg struct {
359                 Raddr string
360                 Rport uint32
361                 Laddr string
362                 Lport uint32
363         }
364         err = ssh.Unmarshal(newch.ExtraData(), &msg)
365         if err != nil {
366                 fmt.Fprintf(ch.Stderr(), "unmarshal direct-tcpip extradata: %s\n", err)
367                 return
368         }
369         switch msg.Raddr {
370         case "localhost", "0.0.0.0", "127.0.0.1", "::1", "::":
371         default:
372                 fmt.Fprintf(ch.Stderr(), "cannot forward to ports on %q, only localhost\n", msg.Raddr)
373                 return
374         }
375
376         dstaddr, err := gw.Target.IPAddress()
377         if err != nil {
378                 fmt.Fprintf(ch.Stderr(), "container has no IP address: %s\n", err)
379                 return
380         } else if dstaddr == "" {
381                 fmt.Fprintf(ch.Stderr(), "container has no IP address\n")
382                 return
383         }
384
385         dst := net.JoinHostPort(dstaddr, fmt.Sprintf("%d", msg.Rport))
386         tcpconn, err := net.Dial("tcp", dst)
387         if err != nil {
388                 fmt.Fprintf(ch.Stderr(), "%s: %s\n", dst, err)
389                 return
390         }
391         go func() {
392                 n, _ := io.Copy(ch, tcpconn)
393                 ctxlog.FromContext(ctx).Debugf("tcpip: sent %d bytes\n", n)
394                 ch.CloseWrite()
395         }()
396         n, _ := io.Copy(tcpconn, ch)
397         ctxlog.FromContext(ctx).Debugf("tcpip: received %d bytes\n", n)
398 }
399
400 func (gw *Gateway) handleSession(ctx context.Context, newch ssh.NewChannel, detachKeys, username string) {
401         ch, reqs, err := newch.Accept()
402         if err != nil {
403                 gw.Log.Printf("error accepting session channel: %s", err)
404                 return
405         }
406         defer ch.Close()
407
408         var pty0, tty0 *os.File
409         // Where to send errors/messages for the client to see
410         logw := io.Writer(ch.Stderr())
411         // How to end lines when sending errors/messages to the client
412         // (changes to \r\n when using a pty)
413         eol := "\n"
414         // Env vars to add to child process
415         termEnv := []string(nil)
416
417         started := 0
418         wantClose := make(chan struct{})
419         for {
420                 var req *ssh.Request
421                 select {
422                 case r, ok := <-reqs:
423                         if !ok {
424                                 return
425                         }
426                         req = r
427                 case <-wantClose:
428                         return
429                 }
430                 ok := false
431                 switch req.Type {
432                 case "shell", "exec":
433                         if started++; started != 1 {
434                                 // RFC 4254 6.5: "Only one of these
435                                 // requests can succeed per channel."
436                                 break
437                         }
438                         ok = true
439                         var payload struct {
440                                 Command string
441                         }
442                         ssh.Unmarshal(req.Payload, &payload)
443                         execargs, err := shlex.Split(payload.Command)
444                         if err != nil {
445                                 fmt.Fprintf(logw, "error parsing supplied command: %s"+eol, err)
446                                 return
447                         }
448                         if len(execargs) == 0 {
449                                 execargs = []string{"/bin/bash", "-login"}
450                         }
451                         go func() {
452                                 var resp struct {
453                                         Status uint32
454                                 }
455                                 defer func() {
456                                         ch.SendRequest("exit-status", false, ssh.Marshal(&resp))
457                                         close(wantClose)
458                                 }()
459
460                                 cmd, err := gw.Target.InjectCommand(ctx, detachKeys, username, tty0 != nil, execargs)
461                                 if err != nil {
462                                         fmt.Fprintln(ch.Stderr(), err)
463                                         ch.CloseWrite()
464                                         resp.Status = 1
465                                         return
466                                 }
467                                 if tty0 != nil {
468                                         cmd.Stdin = tty0
469                                         cmd.Stdout = tty0
470                                         cmd.Stderr = tty0
471                                         go io.Copy(ch, pty0)
472                                         go io.Copy(pty0, ch)
473                                         // Send our own debug messages to tty as well.
474                                         logw = tty0
475                                 } else {
476                                         // StdinPipe may seem
477                                         // superfluous here, but it's
478                                         // not: it causes cmd.Run() to
479                                         // return when the subprocess
480                                         // exits. Without it, Run()
481                                         // waits for stdin to close,
482                                         // which causes "ssh ... echo
483                                         // ok" (with the client's
484                                         // stdin connected to a
485                                         // terminal or something) to
486                                         // hang.
487                                         stdin, err := cmd.StdinPipe()
488                                         if err != nil {
489                                                 fmt.Fprintln(ch.Stderr(), err)
490                                                 ch.CloseWrite()
491                                                 resp.Status = 1
492                                                 return
493                                         }
494                                         go func() {
495                                                 io.Copy(stdin, ch)
496                                                 stdin.Close()
497                                         }()
498                                         cmd.Stdout = ch
499                                         cmd.Stderr = ch.Stderr()
500                                 }
501                                 cmd.SysProcAttr = &syscall.SysProcAttr{
502                                         Setctty: tty0 != nil,
503                                         Setsid:  true,
504                                 }
505                                 cmd.Env = append(os.Environ(), termEnv...)
506                                 err = cmd.Run()
507                                 if exiterr, ok := err.(*exec.ExitError); ok {
508                                         if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
509                                                 resp.Status = uint32(status.ExitStatus())
510                                         }
511                                 } else if err != nil {
512                                         // Propagate errors like `exec: "docker": executable file not found in $PATH`
513                                         fmt.Fprintln(ch.Stderr(), err)
514                                 }
515                                 errClose := ch.CloseWrite()
516                                 if resp.Status == 0 && (err != nil || errClose != nil) {
517                                         resp.Status = 1
518                                 }
519                         }()
520                 case "pty-req":
521                         eol = "\r\n"
522                         p, t, err := pty.Open()
523                         if err != nil {
524                                 fmt.Fprintf(ch.Stderr(), "pty failed: %s"+eol, err)
525                                 break
526                         }
527                         defer p.Close()
528                         defer t.Close()
529                         pty0, tty0 = p, t
530                         ok = true
531                         var payload struct {
532                                 Term string
533                                 Cols uint32
534                                 Rows uint32
535                                 X    uint32
536                                 Y    uint32
537                         }
538                         ssh.Unmarshal(req.Payload, &payload)
539                         termEnv = []string{"TERM=" + payload.Term, "USE_TTY=1"}
540                         err = pty.Setsize(pty0, &pty.Winsize{Rows: uint16(payload.Rows), Cols: uint16(payload.Cols), X: uint16(payload.X), Y: uint16(payload.Y)})
541                         if err != nil {
542                                 fmt.Fprintf(logw, "pty-req: setsize failed: %s"+eol, err)
543                         }
544                 case "window-change":
545                         var payload struct {
546                                 Cols uint32
547                                 Rows uint32
548                                 X    uint32
549                                 Y    uint32
550                         }
551                         ssh.Unmarshal(req.Payload, &payload)
552                         err := pty.Setsize(pty0, &pty.Winsize{Rows: uint16(payload.Rows), Cols: uint16(payload.Cols), X: uint16(payload.X), Y: uint16(payload.Y)})
553                         if err != nil {
554                                 fmt.Fprintf(logw, "window-change: setsize failed: %s"+eol, err)
555                                 break
556                         }
557                         ok = true
558                 case "env":
559                         // TODO: implement "env"
560                         // requests by setting env
561                         // vars in the docker-exec
562                         // command (not docker-exec's
563                         // own environment, which
564                         // would be a gaping security
565                         // hole).
566                 default:
567                         // fmt.Fprintf(logw, "declined request %q on ssh channel"+eol, req.Type)
568                 }
569                 if req.WantReply {
570                         req.Reply(ok, nil)
571                 }
572         }
573 }