Merge branch '19889-live-log-webdav'
[arvados.git] / lib / crunchrun / container_gateway.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package crunchrun
6
7 import (
8         "context"
9         "crypto/hmac"
10         "crypto/rand"
11         "crypto/rsa"
12         "crypto/sha256"
13         "crypto/tls"
14         "fmt"
15         "io"
16         "net"
17         "net/http"
18         "net/url"
19         "os"
20         "os/exec"
21         "strings"
22         "sync"
23         "syscall"
24         "time"
25
26         "git.arvados.org/arvados.git/lib/controller/rpc"
27         "git.arvados.org/arvados.git/lib/selfsigned"
28         "git.arvados.org/arvados.git/lib/webdavfs"
29         "git.arvados.org/arvados.git/sdk/go/arvados"
30         "git.arvados.org/arvados.git/sdk/go/auth"
31         "git.arvados.org/arvados.git/sdk/go/ctxlog"
32         "git.arvados.org/arvados.git/sdk/go/httpserver"
33         "github.com/creack/pty"
34         "github.com/google/shlex"
35         "github.com/hashicorp/yamux"
36         "golang.org/x/crypto/ssh"
37         "golang.org/x/net/webdav"
38 )
39
// GatewayTarget is the container-side backend a Gateway uses to run
// commands and to find the address for forwarded TCP connections.
type GatewayTarget interface {
	// IPAddress returns the IP address inside the container.
	IPAddress() (string, error)

	// InjectCommand returns a command that, when run, executes
	// cmd inside the container.
	InjectCommand(ctx context.Context, detachKeys, username string, usingTTY bool, cmd []string) (*exec.Cmd, error)
}
47
// GatewayTargetStub is a GatewayTarget that does not involve a
// container: commands are run directly, and the reported IP address
// is the loopback address.
type GatewayTargetStub struct{}

// IPAddress always returns "127.0.0.1".
func (GatewayTargetStub) IPAddress() (string, error) { return "127.0.0.1", nil }

// InjectCommand returns a command that runs cmd as-is. The
// detachKeys, username, and usingTTY arguments are ignored.
//
// An empty cmd is reported as an error (rather than panicking on
// cmd[0]).
func (GatewayTargetStub) InjectCommand(ctx context.Context, detachKeys, username string, usingTTY bool, cmd []string) (*exec.Cmd, error) {
	if len(cmd) == 0 {
		return nil, fmt.Errorf("cannot inject an empty command")
	}
	return exec.CommandContext(ctx, cmd[0], cmd[1:]...), nil
}
54
55 type Gateway struct {
56         ContainerUUID string
57         // Caller should set Address to "", or "host:0" or "host:port"
58         // where host is a known external IP address; port is a
59         // desired port number to listen on; and ":0" chooses an
60         // available dynamic port.
61         //
62         // If Address is "", Start() listens only on the loopback
63         // interface (and changes Address to "127.0.0.1:port").
64         // Otherwise it listens on all interfaces.
65         //
66         // If Address is "host:0", Start() updates Address to
67         // "host:port".
68         Address    string
69         AuthSecret string
70         Target     GatewayTarget
71         Log        interface {
72                 Printf(fmt string, args ...interface{})
73         }
74         // If non-nil, set up a ContainerGatewayTunnel, so that the
75         // controller can connect to us even if our external IP
76         // address is unknown or not routable from controller.
77         ArvadosClient *arvados.Client
78
79         // When a tunnel is connected or reconnected, this func (if
80         // not nil) will be called with the InternalURL of the
81         // controller process at the other end of the tunnel.
82         UpdateTunnelURL func(url string)
83
84         // Source for serving WebDAV requests at
85         // /arvados/v1/containers/{uuid}/log/
86         LogCollection arvados.CollectionFileSystem
87
88         sshConfig   ssh.ServerConfig
89         requestAuth string
90         respondAuth string
91         logPath     string
92 }
93
94 // Start starts an http server that allows authenticated clients to open an
95 // interactive "docker exec" session and (in future) connect to tcp ports
96 // inside the docker container.
97 func (gw *Gateway) Start() error {
98         gw.sshConfig = ssh.ServerConfig{
99                 NoClientAuth: true,
100                 PasswordCallback: func(c ssh.ConnMetadata, pass []byte) (*ssh.Permissions, error) {
101                         if c.User() == "_" {
102                                 return nil, nil
103                         }
104                         return nil, fmt.Errorf("cannot specify user %q via ssh client", c.User())
105                 },
106                 PublicKeyCallback: func(c ssh.ConnMetadata, pubKey ssh.PublicKey) (*ssh.Permissions, error) {
107                         if c.User() == "_" {
108                                 return &ssh.Permissions{
109                                         Extensions: map[string]string{
110                                                 "pubkey-fp": ssh.FingerprintSHA256(pubKey),
111                                         },
112                                 }, nil
113                         }
114                         return nil, fmt.Errorf("cannot specify user %q via ssh client", c.User())
115                 },
116         }
117         pvt, err := rsa.GenerateKey(rand.Reader, 2048)
118         if err != nil {
119                 return err
120         }
121         err = pvt.Validate()
122         if err != nil {
123                 return err
124         }
125         signer, err := ssh.NewSignerFromKey(pvt)
126         if err != nil {
127                 return err
128         }
129         gw.sshConfig.AddHostKey(signer)
130
131         // Address (typically provided by arvados-dispatch-cloud) is
132         // HOST:PORT where HOST is our IP address or hostname as seen
133         // from arvados-controller, and PORT is either the desired
134         // port where we should run our gateway server, or "0" if we
135         // should choose an available port.
136         extAddr := gw.Address
137         // Generally we can't know which local interface corresponds
138         // to an externally reachable IP address, so if we expect to
139         // be reachable by external hosts, we listen on all
140         // interfaces.
141         listenHost := ""
142         if extAddr == "" {
143                 // If the dispatcher doesn't tell us our external IP
144                 // address, controller will only be able to connect
145                 // through the tunnel (see runTunnel), so our gateway
146                 // server only needs to listen on the loopback
147                 // interface.
148                 extAddr = "127.0.0.1:0"
149                 listenHost = "127.0.0.1"
150         }
151         extHost, extPort, err := net.SplitHostPort(extAddr)
152         if err != nil {
153                 return err
154         }
155         cert, err := selfsigned.CertGenerator{}.Generate()
156         if err != nil {
157                 return err
158         }
159         h := hmac.New(sha256.New, []byte(gw.AuthSecret))
160         h.Write(cert.Certificate[0])
161         gw.requestAuth = fmt.Sprintf("%x", h.Sum(nil))
162         h.Reset()
163         h.Write([]byte(gw.requestAuth))
164         gw.respondAuth = fmt.Sprintf("%x", h.Sum(nil))
165
166         gw.logPath = "/arvados/v1/containers/" + gw.ContainerUUID + "/log"
167
168         srv := &httpserver.Server{
169                 Server: http.Server{
170                         Handler: gw,
171                         TLSConfig: &tls.Config{
172                                 Certificates: []tls.Certificate{cert},
173                         },
174                 },
175                 Addr: net.JoinHostPort(listenHost, extPort),
176         }
177         err = srv.Start()
178         if err != nil {
179                 return err
180         }
181         go func() {
182                 err := srv.Wait()
183                 gw.Log.Printf("gateway server stopped: %s", err)
184         }()
185         // Get the port number we are listening on (extPort might be
186         // "0" or a port name, in which case this will be different).
187         _, listenPort, err := net.SplitHostPort(srv.Addr)
188         if err != nil {
189                 return err
190         }
191         // When changing state to Running, the caller will want to set
192         // gateway_address to a "HOST:PORT" that, if controller
193         // connects to it, will reach this gateway server.
194         //
195         // The most likely thing to work is: HOST is our external
196         // hostname/IP as provided by the caller
197         // (arvados-dispatch-cloud) or 127.0.0.1 to indicate
198         // non-tunnel connections aren't available; and PORT is the
199         // port number we are listening on.
200         gw.Address = net.JoinHostPort(extHost, listenPort)
201         gw.Log.Printf("gateway server listening at %s", gw.Address)
202         if gw.ArvadosClient != nil {
203                 go gw.maintainTunnel(gw.Address)
204         }
205         return nil
206 }
207
208 func (gw *Gateway) maintainTunnel(addr string) {
209         for ; ; time.Sleep(5 * time.Second) {
210                 err := gw.runTunnel(addr)
211                 gw.Log.Printf("runTunnel: %s", err)
212         }
213 }
214
215 // runTunnel connects to controller and sets up a tunnel through
216 // which controller can connect to the gateway server at the given
217 // addr.
218 func (gw *Gateway) runTunnel(addr string) error {
219         ctx := auth.NewContext(context.Background(), auth.NewCredentials(gw.ArvadosClient.AuthToken))
220         arpc := rpc.NewConn("", &url.URL{Scheme: "https", Host: gw.ArvadosClient.APIHost}, gw.ArvadosClient.Insecure, rpc.PassthroughTokenProvider)
221         tun, err := arpc.ContainerGatewayTunnel(ctx, arvados.ContainerGatewayTunnelOptions{
222                 UUID:       gw.ContainerUUID,
223                 AuthSecret: gw.AuthSecret,
224         })
225         if err != nil {
226                 return fmt.Errorf("error creating gateway tunnel: %s", err)
227         }
228         mux, err := yamux.Client(tun.Conn, nil)
229         if err != nil {
230                 return fmt.Errorf("error setting up mux client end: %s", err)
231         }
232         if url := tun.Header.Get("X-Arvados-Internal-Url"); url != "" && gw.UpdateTunnelURL != nil {
233                 gw.UpdateTunnelURL(url)
234         }
235         for {
236                 muxconn, err := mux.AcceptStream()
237                 if err != nil {
238                         return err
239                 }
240                 gw.Log.Printf("tunnel connection %d started", muxconn.StreamID())
241                 go func() {
242                         defer muxconn.Close()
243                         gwconn, err := net.Dial("tcp", addr)
244                         if err != nil {
245                                 gw.Log.Printf("tunnel connection %d: error connecting to %s: %s", muxconn.StreamID(), addr, err)
246                                 return
247                         }
248                         defer gwconn.Close()
249                         var wg sync.WaitGroup
250                         wg.Add(2)
251                         go func() {
252                                 defer wg.Done()
253                                 _, err := io.Copy(gwconn, muxconn)
254                                 if err != nil {
255                                         gw.Log.Printf("tunnel connection %d: mux end: %s", muxconn.StreamID(), err)
256                                 }
257                                 gwconn.Close()
258                         }()
259                         go func() {
260                                 defer wg.Done()
261                                 _, err := io.Copy(muxconn, gwconn)
262                                 if err != nil {
263                                         gw.Log.Printf("tunnel connection %d: gateway end: %s", muxconn.StreamID(), err)
264                                 }
265                                 muxconn.Close()
266                         }()
267                         wg.Wait()
268                         gw.Log.Printf("tunnel connection %d finished", muxconn.StreamID())
269                 }()
270         }
271 }
272
// webdavMethod is the set of HTTP methods accepted on the WebDAV log
// endpoint. Any other method gets a "method not allowed" response.
var webdavMethod = map[string]bool{
	http.MethodGet:     true,
	http.MethodOptions: true,
	"PROPFIND":         true,
}
278
279 func (gw *Gateway) ServeHTTP(w http.ResponseWriter, req *http.Request) {
280         reqUUID := req.Header.Get("X-Arvados-Container-Gateway-Uuid")
281         if reqUUID == "" {
282                 // older controller versions only send UUID as query param
283                 req.ParseForm()
284                 reqUUID = req.Form.Get("uuid")
285         }
286         if reqUUID != gw.ContainerUUID {
287                 http.Error(w, fmt.Sprintf("misdirected request: meant for %q but received by crunch-run %q", reqUUID, gw.ContainerUUID), http.StatusBadGateway)
288                 return
289         }
290         if req.Header.Get("X-Arvados-Authorization") != gw.requestAuth {
291                 http.Error(w, "bad X-Arvados-Authorization header", http.StatusUnauthorized)
292                 return
293         }
294         w.Header().Set("X-Arvados-Authorization-Response", gw.respondAuth)
295         switch {
296         case req.Method == "POST" && req.Header.Get("Upgrade") == "ssh":
297                 gw.handleSSH(w, req)
298         case req.URL.Path == gw.logPath || strings.HasPrefix(req.URL.Path, gw.logPath):
299                 if !webdavMethod[req.Method] {
300                         http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
301                         return
302                 }
303                 gw.handleLogsWebDAV(w, req)
304         default:
305                 http.Error(w, "path not found", http.StatusNotFound)
306         }
307 }
308
309 func (gw *Gateway) handleLogsWebDAV(w http.ResponseWriter, r *http.Request) {
310         if gw.LogCollection == nil {
311                 http.Error(w, "Not found", http.StatusNotFound)
312                 return
313         }
314         wh := webdav.Handler{
315                 Prefix: gw.logPath,
316                 FileSystem: &webdavfs.FS{
317                         FileSystem:    gw.LogCollection,
318                         Prefix:        "",
319                         Writing:       false,
320                         AlwaysReadEOF: r.Method == "PROPFIND",
321                 },
322                 LockSystem: webdavfs.NoLockSystem,
323                 Logger:     gw.webdavLogger,
324         }
325         wh.ServeHTTP(w, r)
326 }
327
328 func (gw *Gateway) webdavLogger(r *http.Request, err error) {
329         if err != nil && !os.IsNotExist(err) {
330                 ctxlog.FromContext(r.Context()).WithError(err).Info("error reported by webdav handler")
331         } else {
332                 ctxlog.FromContext(r.Context()).WithError(err).Debug("webdav request log")
333         }
334 }
335
336 // handleSSH connects to an SSH server that allows the caller to run
337 // interactive commands as root (or any other desired user) inside the
338 // container. The tunnel itself can only be created by an
339 // authenticated caller, so the SSH server itself is wide open (any
340 // password or key will be accepted).
341 //
342 // Requests must have path "/ssh" and the following headers:
343 //
344 // Connection: upgrade
345 // Upgrade: ssh
346 // X-Arvados-Target-Uuid: uuid of container
347 // X-Arvados-Authorization: must match
348 // hmac(AuthSecret,certfingerprint) (this prevents other containers
349 // and shell nodes from connecting directly)
350 //
351 // Optional headers:
352 //
353 // X-Arvados-Detach-Keys: argument to "docker exec --detach-keys",
354 // e.g., "ctrl-p,ctrl-q"
355 // X-Arvados-Login-Username: argument to "docker exec --user": account
356 // used to run command(s) inside the container.
357 func (gw *Gateway) handleSSH(w http.ResponseWriter, req *http.Request) {
358         req.ParseForm()
359         detachKeys := req.Form.Get("detach_keys")
360         username := req.Form.Get("login_username")
361         if username == "" {
362                 username = "root"
363         }
364         hj, ok := w.(http.Hijacker)
365         if !ok {
366                 http.Error(w, "ResponseWriter does not support connection upgrade", http.StatusInternalServerError)
367                 return
368         }
369         netconn, _, err := hj.Hijack()
370         if !ok {
371                 http.Error(w, err.Error(), http.StatusInternalServerError)
372                 return
373         }
374         defer netconn.Close()
375         w.Header().Set("Connection", "upgrade")
376         w.Header().Set("Upgrade", "ssh")
377         netconn.Write([]byte("HTTP/1.1 101 Switching Protocols\r\n"))
378         w.Header().Write(netconn)
379         netconn.Write([]byte("\r\n"))
380
381         ctx := req.Context()
382
383         conn, newchans, reqs, err := ssh.NewServerConn(netconn, &gw.sshConfig)
384         if err == io.EOF {
385                 return
386         } else if err != nil {
387                 gw.Log.Printf("ssh.NewServerConn: %s", err)
388                 return
389         }
390         defer conn.Close()
391         go ssh.DiscardRequests(reqs)
392         for newch := range newchans {
393                 switch newch.ChannelType() {
394                 case "direct-tcpip":
395                         go gw.handleDirectTCPIP(ctx, newch)
396                 case "session":
397                         go gw.handleSession(ctx, newch, detachKeys, username)
398                 default:
399                         go newch.Reject(ssh.UnknownChannelType, fmt.Sprintf("unsupported channel type %q", newch.ChannelType()))
400                 }
401         }
402 }
403
404 func (gw *Gateway) handleDirectTCPIP(ctx context.Context, newch ssh.NewChannel) {
405         ch, reqs, err := newch.Accept()
406         if err != nil {
407                 gw.Log.Printf("accept direct-tcpip channel: %s", err)
408                 return
409         }
410         defer ch.Close()
411         go ssh.DiscardRequests(reqs)
412
413         // RFC 4254 7.2 (copy of channelOpenDirectMsg in
414         // golang.org/x/crypto/ssh)
415         var msg struct {
416                 Raddr string
417                 Rport uint32
418                 Laddr string
419                 Lport uint32
420         }
421         err = ssh.Unmarshal(newch.ExtraData(), &msg)
422         if err != nil {
423                 fmt.Fprintf(ch.Stderr(), "unmarshal direct-tcpip extradata: %s\n", err)
424                 return
425         }
426         switch msg.Raddr {
427         case "localhost", "0.0.0.0", "127.0.0.1", "::1", "::":
428         default:
429                 fmt.Fprintf(ch.Stderr(), "cannot forward to ports on %q, only localhost\n", msg.Raddr)
430                 return
431         }
432
433         dstaddr, err := gw.Target.IPAddress()
434         if err != nil {
435                 fmt.Fprintf(ch.Stderr(), "container has no IP address: %s\n", err)
436                 return
437         } else if dstaddr == "" {
438                 fmt.Fprintf(ch.Stderr(), "container has no IP address\n")
439                 return
440         }
441
442         dst := net.JoinHostPort(dstaddr, fmt.Sprintf("%d", msg.Rport))
443         tcpconn, err := net.Dial("tcp", dst)
444         if err != nil {
445                 fmt.Fprintf(ch.Stderr(), "%s: %s\n", dst, err)
446                 return
447         }
448         go func() {
449                 n, _ := io.Copy(ch, tcpconn)
450                 ctxlog.FromContext(ctx).Debugf("tcpip: sent %d bytes\n", n)
451                 ch.CloseWrite()
452         }()
453         n, _ := io.Copy(tcpconn, ch)
454         ctxlog.FromContext(ctx).Debugf("tcpip: received %d bytes\n", n)
455 }
456
457 func (gw *Gateway) handleSession(ctx context.Context, newch ssh.NewChannel, detachKeys, username string) {
458         ch, reqs, err := newch.Accept()
459         if err != nil {
460                 gw.Log.Printf("error accepting session channel: %s", err)
461                 return
462         }
463         defer ch.Close()
464
465         var pty0, tty0 *os.File
466         // Where to send errors/messages for the client to see
467         logw := io.Writer(ch.Stderr())
468         // How to end lines when sending errors/messages to the client
469         // (changes to \r\n when using a pty)
470         eol := "\n"
471         // Env vars to add to child process
472         termEnv := []string(nil)
473
474         started := 0
475         wantClose := make(chan struct{})
476         for {
477                 var req *ssh.Request
478                 select {
479                 case r, ok := <-reqs:
480                         if !ok {
481                                 return
482                         }
483                         req = r
484                 case <-wantClose:
485                         return
486                 }
487                 ok := false
488                 switch req.Type {
489                 case "shell", "exec":
490                         if started++; started != 1 {
491                                 // RFC 4254 6.5: "Only one of these
492                                 // requests can succeed per channel."
493                                 break
494                         }
495                         ok = true
496                         var payload struct {
497                                 Command string
498                         }
499                         ssh.Unmarshal(req.Payload, &payload)
500                         execargs, err := shlex.Split(payload.Command)
501                         if err != nil {
502                                 fmt.Fprintf(logw, "error parsing supplied command: %s"+eol, err)
503                                 return
504                         }
505                         if len(execargs) == 0 {
506                                 execargs = []string{"/bin/bash", "-login"}
507                         }
508                         go func() {
509                                 var resp struct {
510                                         Status uint32
511                                 }
512                                 defer func() {
513                                         ch.SendRequest("exit-status", false, ssh.Marshal(&resp))
514                                         close(wantClose)
515                                 }()
516
517                                 cmd, err := gw.Target.InjectCommand(ctx, detachKeys, username, tty0 != nil, execargs)
518                                 if err != nil {
519                                         fmt.Fprintln(ch.Stderr(), err)
520                                         ch.CloseWrite()
521                                         resp.Status = 1
522                                         return
523                                 }
524                                 if tty0 != nil {
525                                         cmd.Stdin = tty0
526                                         cmd.Stdout = tty0
527                                         cmd.Stderr = tty0
528                                         go io.Copy(ch, pty0)
529                                         go io.Copy(pty0, ch)
530                                         // Send our own debug messages to tty as well.
531                                         logw = tty0
532                                 } else {
533                                         // StdinPipe may seem
534                                         // superfluous here, but it's
535                                         // not: it causes cmd.Run() to
536                                         // return when the subprocess
537                                         // exits. Without it, Run()
538                                         // waits for stdin to close,
539                                         // which causes "ssh ... echo
540                                         // ok" (with the client's
541                                         // stdin connected to a
542                                         // terminal or something) to
543                                         // hang.
544                                         stdin, err := cmd.StdinPipe()
545                                         if err != nil {
546                                                 fmt.Fprintln(ch.Stderr(), err)
547                                                 ch.CloseWrite()
548                                                 resp.Status = 1
549                                                 return
550                                         }
551                                         go func() {
552                                                 io.Copy(stdin, ch)
553                                                 stdin.Close()
554                                         }()
555                                         cmd.Stdout = ch
556                                         cmd.Stderr = ch.Stderr()
557                                 }
558                                 cmd.SysProcAttr = &syscall.SysProcAttr{
559                                         Setctty: tty0 != nil,
560                                         Setsid:  true,
561                                 }
562                                 cmd.Env = append(os.Environ(), termEnv...)
563                                 err = cmd.Run()
564                                 if exiterr, ok := err.(*exec.ExitError); ok {
565                                         if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
566                                                 resp.Status = uint32(status.ExitStatus())
567                                         }
568                                 } else if err != nil {
569                                         // Propagate errors like `exec: "docker": executable file not found in $PATH`
570                                         fmt.Fprintln(ch.Stderr(), err)
571                                 }
572                                 errClose := ch.CloseWrite()
573                                 if resp.Status == 0 && (err != nil || errClose != nil) {
574                                         resp.Status = 1
575                                 }
576                         }()
577                 case "pty-req":
578                         eol = "\r\n"
579                         p, t, err := pty.Open()
580                         if err != nil {
581                                 fmt.Fprintf(ch.Stderr(), "pty failed: %s"+eol, err)
582                                 break
583                         }
584                         defer p.Close()
585                         defer t.Close()
586                         pty0, tty0 = p, t
587                         ok = true
588                         var payload struct {
589                                 Term string
590                                 Cols uint32
591                                 Rows uint32
592                                 X    uint32
593                                 Y    uint32
594                         }
595                         ssh.Unmarshal(req.Payload, &payload)
596                         termEnv = []string{"TERM=" + payload.Term, "USE_TTY=1"}
597                         err = pty.Setsize(pty0, &pty.Winsize{Rows: uint16(payload.Rows), Cols: uint16(payload.Cols), X: uint16(payload.X), Y: uint16(payload.Y)})
598                         if err != nil {
599                                 fmt.Fprintf(logw, "pty-req: setsize failed: %s"+eol, err)
600                         }
601                 case "window-change":
602                         var payload struct {
603                                 Cols uint32
604                                 Rows uint32
605                                 X    uint32
606                                 Y    uint32
607                         }
608                         ssh.Unmarshal(req.Payload, &payload)
609                         err := pty.Setsize(pty0, &pty.Winsize{Rows: uint16(payload.Rows), Cols: uint16(payload.Cols), X: uint16(payload.X), Y: uint16(payload.Y)})
610                         if err != nil {
611                                 fmt.Fprintf(logw, "window-change: setsize failed: %s"+eol, err)
612                                 break
613                         }
614                         ok = true
615                 case "env":
616                         // TODO: implement "env"
617                         // requests by setting env
618                         // vars in the docker-exec
619                         // command (not docker-exec's
620                         // own environment, which
621                         // would be a gaping security
622                         // hole).
623                 default:
624                         // fmt.Fprintf(logw, "declined request %q on ssh channel"+eol, req.Type)
625                 }
626                 if req.WantReply {
627                         req.Reply(ok, nil)
628                 }
629         }
630 }