19889: Serve live logs via webdav.
[arvados.git] / lib / crunchrun / container_gateway.go
1 // Copyright (C) The Arvados Authors. All rights reserved.
2 //
3 // SPDX-License-Identifier: AGPL-3.0
4
5 package crunchrun
6
7 import (
8         "crypto/hmac"
9         "crypto/rand"
10         "crypto/rsa"
11         "crypto/sha256"
12         "crypto/tls"
13         "fmt"
14         "io"
15         "net"
16         "net/http"
17         "net/url"
18         "os"
19         "os/exec"
20         "strings"
21         "sync"
22         "syscall"
23         "time"
24
25         "git.arvados.org/arvados.git/lib/controller/rpc"
26         "git.arvados.org/arvados.git/lib/selfsigned"
27         "git.arvados.org/arvados.git/lib/webdavfs"
28         "git.arvados.org/arvados.git/sdk/go/arvados"
29         "git.arvados.org/arvados.git/sdk/go/auth"
30         "git.arvados.org/arvados.git/sdk/go/ctxlog"
31         "git.arvados.org/arvados.git/sdk/go/httpserver"
32         "github.com/creack/pty"
33         "github.com/google/shlex"
34         "github.com/hashicorp/yamux"
35         "golang.org/x/crypto/ssh"
36         "golang.org/x/net/context"
37         "golang.org/x/net/webdav"
38 )
39
// GatewayTarget is the gateway server's view of the container: it
// can report the container's IP address and build a command that
// runs inside the container.
type GatewayTarget interface {
	// IPAddress returns the IP address inside the container.
	IPAddress() (string, error)

	// InjectCommand returns a command that, when run, will execute
	// cmd inside the container.
	InjectCommand(ctx context.Context, detachKeys, username string, usingTTY bool, cmd []string) (*exec.Cmd, error)
}
47
// GatewayTargetStub is a GatewayTarget that runs commands directly
// on the local host instead of inside a container.
type GatewayTargetStub struct{}

// IPAddress always reports the loopback address.
func (GatewayTargetStub) IPAddress() (string, error) {
	return "127.0.0.1", nil
}

// InjectCommand returns a command that runs cmd as-is, ignoring
// detachKeys, username and usingTTY. cmd must not be empty.
func (GatewayTargetStub) InjectCommand(ctx context.Context, detachKeys, username string, usingTTY bool, cmd []string) (*exec.Cmd, error) {
	prog, args := cmd[0], cmd[1:]
	return exec.CommandContext(ctx, prog, args...), nil
}
54
// Gateway is an http.Handler (see ServeHTTP) that gives authenticated
// callers ssh access to a container and read-only WebDAV access to
// its log collection. Set the exported fields, then call Start().
type Gateway struct {
	// UUID of the container being served. Requests addressed to a
	// different UUID are rejected by ServeHTTP.
	ContainerUUID string
	// Caller should set Address to "", or "host:0" or "host:port"
	// where host is a known external IP address; port is a
	// desired port number to listen on; and ":0" chooses an
	// available dynamic port.
	//
	// If Address is "", Start() listens only on the loopback
	// interface (and changes Address to "127.0.0.1:port").
	// Otherwise it listens on all interfaces.
	//
	// If Address is "host:0", Start() updates Address to
	// "host:port".
	Address string
	// Secret used by Start() as the HMAC key when deriving the
	// expected request/response authorization values, and sent
	// to the controller when setting up a tunnel.
	AuthSecret string
	// Target runs commands inside the container and reports the
	// container's IP address.
	Target GatewayTarget
	// Destination for the gateway's own log messages.
	Log interface {
		Printf(fmt string, args ...interface{})
	}
	// If non-nil, set up a ContainerGatewayTunnel, so that the
	// controller can connect to us even if our external IP
	// address is unknown or not routable from controller.
	ArvadosClient *arvados.Client

	// When a tunnel is connected or reconnected, this func (if
	// not nil) will be called with the InternalURL of the
	// controller process at the other end of the tunnel.
	UpdateTunnelURL func(url string)

	// Source for serving WebDAV requests at
	// /arvados/v1/containers/{uuid}/log/. If nil, log requests
	// get a 404 response.
	LogCollection arvados.CollectionFileSystem

	// ssh server configuration, initialized by Start().
	sshConfig ssh.ServerConfig
	// Expected value of the X-Arvados-Authorization request
	// header: hex-encoded HMAC-SHA256 (keyed with AuthSecret) of
	// our TLS certificate.
	requestAuth string
	// Value sent in the X-Arvados-Authorization-Response header:
	// hex-encoded HMAC-SHA256 (keyed with AuthSecret) of
	// requestAuth.
	respondAuth string
	// URL path at which logs are served,
	// "/arvados/v1/containers/{uuid}/log".
	logPath string
}
92
93 // Start starts an http server that allows authenticated clients to open an
94 // interactive "docker exec" session and (in future) connect to tcp ports
95 // inside the docker container.
96 func (gw *Gateway) Start() error {
97         gw.sshConfig = ssh.ServerConfig{
98                 NoClientAuth: true,
99                 PasswordCallback: func(c ssh.ConnMetadata, pass []byte) (*ssh.Permissions, error) {
100                         if c.User() == "_" {
101                                 return nil, nil
102                         }
103                         return nil, fmt.Errorf("cannot specify user %q via ssh client", c.User())
104                 },
105                 PublicKeyCallback: func(c ssh.ConnMetadata, pubKey ssh.PublicKey) (*ssh.Permissions, error) {
106                         if c.User() == "_" {
107                                 return &ssh.Permissions{
108                                         Extensions: map[string]string{
109                                                 "pubkey-fp": ssh.FingerprintSHA256(pubKey),
110                                         },
111                                 }, nil
112                         }
113                         return nil, fmt.Errorf("cannot specify user %q via ssh client", c.User())
114                 },
115         }
116         pvt, err := rsa.GenerateKey(rand.Reader, 2048)
117         if err != nil {
118                 return err
119         }
120         err = pvt.Validate()
121         if err != nil {
122                 return err
123         }
124         signer, err := ssh.NewSignerFromKey(pvt)
125         if err != nil {
126                 return err
127         }
128         gw.sshConfig.AddHostKey(signer)
129
130         // Address (typically provided by arvados-dispatch-cloud) is
131         // HOST:PORT where HOST is our IP address or hostname as seen
132         // from arvados-controller, and PORT is either the desired
133         // port where we should run our gateway server, or "0" if we
134         // should choose an available port.
135         extAddr := gw.Address
136         // Generally we can't know which local interface corresponds
137         // to an externally reachable IP address, so if we expect to
138         // be reachable by external hosts, we listen on all
139         // interfaces.
140         listenHost := ""
141         if extAddr == "" {
142                 // If the dispatcher doesn't tell us our external IP
143                 // address, controller will only be able to connect
144                 // through the tunnel (see runTunnel), so our gateway
145                 // server only needs to listen on the loopback
146                 // interface.
147                 extAddr = "127.0.0.1:0"
148                 listenHost = "127.0.0.1"
149         }
150         extHost, extPort, err := net.SplitHostPort(extAddr)
151         if err != nil {
152                 return err
153         }
154         cert, err := selfsigned.CertGenerator{}.Generate()
155         if err != nil {
156                 return err
157         }
158         h := hmac.New(sha256.New, []byte(gw.AuthSecret))
159         h.Write(cert.Certificate[0])
160         gw.requestAuth = fmt.Sprintf("%x", h.Sum(nil))
161         h.Reset()
162         h.Write([]byte(gw.requestAuth))
163         gw.respondAuth = fmt.Sprintf("%x", h.Sum(nil))
164
165         gw.logPath = "/arvados/v1/containers/" + gw.ContainerUUID + "/log"
166
167         srv := &httpserver.Server{
168                 Server: http.Server{
169                         Handler: gw,
170                         TLSConfig: &tls.Config{
171                                 Certificates: []tls.Certificate{cert},
172                         },
173                 },
174                 Addr: net.JoinHostPort(listenHost, extPort),
175         }
176         err = srv.Start()
177         if err != nil {
178                 return err
179         }
180         go func() {
181                 err := srv.Wait()
182                 gw.Log.Printf("gateway server stopped: %s", err)
183         }()
184         // Get the port number we are listening on (extPort might be
185         // "0" or a port name, in which case this will be different).
186         _, listenPort, err := net.SplitHostPort(srv.Addr)
187         if err != nil {
188                 return err
189         }
190         // When changing state to Running, the caller will want to set
191         // gateway_address to a "HOST:PORT" that, if controller
192         // connects to it, will reach this gateway server.
193         //
194         // The most likely thing to work is: HOST is our external
195         // hostname/IP as provided by the caller
196         // (arvados-dispatch-cloud) or 127.0.0.1 to indicate
197         // non-tunnel connections aren't available; and PORT is the
198         // port number we are listening on.
199         gw.Address = net.JoinHostPort(extHost, listenPort)
200         gw.Log.Printf("gateway server listening at %s", gw.Address)
201         if gw.ArvadosClient != nil {
202                 go gw.maintainTunnel(gw.Address)
203         }
204         return nil
205 }
206
207 func (gw *Gateway) maintainTunnel(addr string) {
208         for ; ; time.Sleep(5 * time.Second) {
209                 err := gw.runTunnel(addr)
210                 gw.Log.Printf("runTunnel: %s", err)
211         }
212 }
213
214 // runTunnel connects to controller and sets up a tunnel through
215 // which controller can connect to the gateway server at the given
216 // addr.
217 func (gw *Gateway) runTunnel(addr string) error {
218         ctx := auth.NewContext(context.Background(), auth.NewCredentials(gw.ArvadosClient.AuthToken))
219         arpc := rpc.NewConn("", &url.URL{Scheme: "https", Host: gw.ArvadosClient.APIHost}, gw.ArvadosClient.Insecure, rpc.PassthroughTokenProvider)
220         tun, err := arpc.ContainerGatewayTunnel(ctx, arvados.ContainerGatewayTunnelOptions{
221                 UUID:       gw.ContainerUUID,
222                 AuthSecret: gw.AuthSecret,
223         })
224         if err != nil {
225                 return fmt.Errorf("error creating gateway tunnel: %s", err)
226         }
227         mux, err := yamux.Client(tun.Conn, nil)
228         if err != nil {
229                 return fmt.Errorf("error setting up mux client end: %s", err)
230         }
231         if url := tun.Header.Get("X-Arvados-Internal-Url"); url != "" && gw.UpdateTunnelURL != nil {
232                 gw.UpdateTunnelURL(url)
233         }
234         for {
235                 muxconn, err := mux.AcceptStream()
236                 if err != nil {
237                         return err
238                 }
239                 gw.Log.Printf("tunnel connection %d started", muxconn.StreamID())
240                 go func() {
241                         defer muxconn.Close()
242                         gwconn, err := net.Dial("tcp", addr)
243                         if err != nil {
244                                 gw.Log.Printf("tunnel connection %d: error connecting to %s: %s", muxconn.StreamID(), addr, err)
245                                 return
246                         }
247                         defer gwconn.Close()
248                         var wg sync.WaitGroup
249                         wg.Add(2)
250                         go func() {
251                                 defer wg.Done()
252                                 _, err := io.Copy(gwconn, muxconn)
253                                 if err != nil {
254                                         gw.Log.Printf("tunnel connection %d: mux end: %s", muxconn.StreamID(), err)
255                                 }
256                                 gwconn.Close()
257                         }()
258                         go func() {
259                                 defer wg.Done()
260                                 _, err := io.Copy(muxconn, gwconn)
261                                 if err != nil {
262                                         gw.Log.Printf("tunnel connection %d: gateway end: %s", muxconn.StreamID(), err)
263                                 }
264                                 muxconn.Close()
265                         }()
266                         wg.Wait()
267                         gw.Log.Printf("tunnel connection %d finished", muxconn.StreamID())
268                 }()
269         }
270 }
271
// webdavMethod is the set of HTTP methods accepted on the log path
// (see ServeHTTP). Only read-only methods are listed.
var webdavMethod = func() map[string]bool {
	allowed := make(map[string]bool)
	for _, method := range []string{"GET", "OPTIONS", "PROPFIND"} {
		allowed[method] = true
	}
	return allowed
}()
277
278 func (gw *Gateway) ServeHTTP(w http.ResponseWriter, req *http.Request) {
279         reqUUID := req.Header.Get("X-Arvados-Container-Gateway-Uuid")
280         if reqUUID == "" {
281                 // older controller versions only send UUID as query param
282                 req.ParseForm()
283                 reqUUID = req.Form.Get("uuid")
284         }
285         if reqUUID != gw.ContainerUUID {
286                 http.Error(w, fmt.Sprintf("misdirected request: meant for %q but received by crunch-run %q", reqUUID, gw.ContainerUUID), http.StatusBadGateway)
287                 return
288         }
289         if req.Header.Get("X-Arvados-Authorization") != gw.requestAuth {
290                 http.Error(w, "bad X-Arvados-Authorization header", http.StatusUnauthorized)
291                 return
292         }
293         w.Header().Set("X-Arvados-Authorization-Response", gw.respondAuth)
294         switch {
295         case req.Method == "POST" && req.Header.Get("Upgrade") == "ssh":
296                 gw.handleSSH(w, req)
297         case req.URL.Path == gw.logPath || strings.HasPrefix(req.URL.Path, gw.logPath):
298                 if !webdavMethod[req.Method] {
299                         http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
300                         return
301                 }
302                 gw.handleLogsWebDAV(w, req)
303         default:
304                 http.Error(w, "path not found", http.StatusNotFound)
305         }
306 }
307
308 func (gw *Gateway) handleLogsWebDAV(w http.ResponseWriter, r *http.Request) {
309         if gw.LogCollection == nil {
310                 http.Error(w, "Not found", http.StatusNotFound)
311                 return
312         }
313         wh := webdav.Handler{
314                 Prefix: gw.logPath,
315                 FileSystem: &webdavfs.FS{
316                         FileSystem:    gw.LogCollection,
317                         Prefix:        "",
318                         Writing:       false,
319                         AlwaysReadEOF: r.Method == "PROPFIND",
320                 },
321                 LockSystem: webdavfs.NoLockSystem,
322                 Logger:     gw.webdavLogger,
323         }
324         wh.ServeHTTP(w, r)
325 }
326
327 func (gw *Gateway) webdavLogger(r *http.Request, err error) {
328         if err != nil {
329                 ctxlog.FromContext(r.Context()).WithError(err).Error("error reported by webdav handler")
330         }
331 }
332
333 // handleSSH connects to an SSH server that allows the caller to run
334 // interactive commands as root (or any other desired user) inside the
335 // container. The tunnel itself can only be created by an
336 // authenticated caller, so the SSH server itself is wide open (any
337 // password or key will be accepted).
338 //
339 // Requests must have path "/ssh" and the following headers:
340 //
341 // Connection: upgrade
342 // Upgrade: ssh
343 // X-Arvados-Target-Uuid: uuid of container
344 // X-Arvados-Authorization: must match
345 // hmac(AuthSecret,certfingerprint) (this prevents other containers
346 // and shell nodes from connecting directly)
347 //
348 // Optional headers:
349 //
350 // X-Arvados-Detach-Keys: argument to "docker exec --detach-keys",
351 // e.g., "ctrl-p,ctrl-q"
352 // X-Arvados-Login-Username: argument to "docker exec --user": account
353 // used to run command(s) inside the container.
354 func (gw *Gateway) handleSSH(w http.ResponseWriter, req *http.Request) {
355         req.ParseForm()
356         detachKeys := req.Form.Get("detach_keys")
357         username := req.Form.Get("login_username")
358         if username == "" {
359                 username = "root"
360         }
361         hj, ok := w.(http.Hijacker)
362         if !ok {
363                 http.Error(w, "ResponseWriter does not support connection upgrade", http.StatusInternalServerError)
364                 return
365         }
366         netconn, _, err := hj.Hijack()
367         if !ok {
368                 http.Error(w, err.Error(), http.StatusInternalServerError)
369                 return
370         }
371         defer netconn.Close()
372         w.Header().Set("Connection", "upgrade")
373         w.Header().Set("Upgrade", "ssh")
374         netconn.Write([]byte("HTTP/1.1 101 Switching Protocols\r\n"))
375         w.Header().Write(netconn)
376         netconn.Write([]byte("\r\n"))
377
378         ctx := req.Context()
379
380         conn, newchans, reqs, err := ssh.NewServerConn(netconn, &gw.sshConfig)
381         if err == io.EOF {
382                 return
383         } else if err != nil {
384                 gw.Log.Printf("ssh.NewServerConn: %s", err)
385                 return
386         }
387         defer conn.Close()
388         go ssh.DiscardRequests(reqs)
389         for newch := range newchans {
390                 switch newch.ChannelType() {
391                 case "direct-tcpip":
392                         go gw.handleDirectTCPIP(ctx, newch)
393                 case "session":
394                         go gw.handleSession(ctx, newch, detachKeys, username)
395                 default:
396                         go newch.Reject(ssh.UnknownChannelType, fmt.Sprintf("unsupported channel type %q", newch.ChannelType()))
397                 }
398         }
399 }
400
401 func (gw *Gateway) handleDirectTCPIP(ctx context.Context, newch ssh.NewChannel) {
402         ch, reqs, err := newch.Accept()
403         if err != nil {
404                 gw.Log.Printf("accept direct-tcpip channel: %s", err)
405                 return
406         }
407         defer ch.Close()
408         go ssh.DiscardRequests(reqs)
409
410         // RFC 4254 7.2 (copy of channelOpenDirectMsg in
411         // golang.org/x/crypto/ssh)
412         var msg struct {
413                 Raddr string
414                 Rport uint32
415                 Laddr string
416                 Lport uint32
417         }
418         err = ssh.Unmarshal(newch.ExtraData(), &msg)
419         if err != nil {
420                 fmt.Fprintf(ch.Stderr(), "unmarshal direct-tcpip extradata: %s\n", err)
421                 return
422         }
423         switch msg.Raddr {
424         case "localhost", "0.0.0.0", "127.0.0.1", "::1", "::":
425         default:
426                 fmt.Fprintf(ch.Stderr(), "cannot forward to ports on %q, only localhost\n", msg.Raddr)
427                 return
428         }
429
430         dstaddr, err := gw.Target.IPAddress()
431         if err != nil {
432                 fmt.Fprintf(ch.Stderr(), "container has no IP address: %s\n", err)
433                 return
434         } else if dstaddr == "" {
435                 fmt.Fprintf(ch.Stderr(), "container has no IP address\n")
436                 return
437         }
438
439         dst := net.JoinHostPort(dstaddr, fmt.Sprintf("%d", msg.Rport))
440         tcpconn, err := net.Dial("tcp", dst)
441         if err != nil {
442                 fmt.Fprintf(ch.Stderr(), "%s: %s\n", dst, err)
443                 return
444         }
445         go func() {
446                 n, _ := io.Copy(ch, tcpconn)
447                 ctxlog.FromContext(ctx).Debugf("tcpip: sent %d bytes\n", n)
448                 ch.CloseWrite()
449         }()
450         n, _ := io.Copy(tcpconn, ch)
451         ctxlog.FromContext(ctx).Debugf("tcpip: received %d bytes\n", n)
452 }
453
454 func (gw *Gateway) handleSession(ctx context.Context, newch ssh.NewChannel, detachKeys, username string) {
455         ch, reqs, err := newch.Accept()
456         if err != nil {
457                 gw.Log.Printf("error accepting session channel: %s", err)
458                 return
459         }
460         defer ch.Close()
461
462         var pty0, tty0 *os.File
463         // Where to send errors/messages for the client to see
464         logw := io.Writer(ch.Stderr())
465         // How to end lines when sending errors/messages to the client
466         // (changes to \r\n when using a pty)
467         eol := "\n"
468         // Env vars to add to child process
469         termEnv := []string(nil)
470
471         started := 0
472         wantClose := make(chan struct{})
473         for {
474                 var req *ssh.Request
475                 select {
476                 case r, ok := <-reqs:
477                         if !ok {
478                                 return
479                         }
480                         req = r
481                 case <-wantClose:
482                         return
483                 }
484                 ok := false
485                 switch req.Type {
486                 case "shell", "exec":
487                         if started++; started != 1 {
488                                 // RFC 4254 6.5: "Only one of these
489                                 // requests can succeed per channel."
490                                 break
491                         }
492                         ok = true
493                         var payload struct {
494                                 Command string
495                         }
496                         ssh.Unmarshal(req.Payload, &payload)
497                         execargs, err := shlex.Split(payload.Command)
498                         if err != nil {
499                                 fmt.Fprintf(logw, "error parsing supplied command: %s"+eol, err)
500                                 return
501                         }
502                         if len(execargs) == 0 {
503                                 execargs = []string{"/bin/bash", "-login"}
504                         }
505                         go func() {
506                                 var resp struct {
507                                         Status uint32
508                                 }
509                                 defer func() {
510                                         ch.SendRequest("exit-status", false, ssh.Marshal(&resp))
511                                         close(wantClose)
512                                 }()
513
514                                 cmd, err := gw.Target.InjectCommand(ctx, detachKeys, username, tty0 != nil, execargs)
515                                 if err != nil {
516                                         fmt.Fprintln(ch.Stderr(), err)
517                                         ch.CloseWrite()
518                                         resp.Status = 1
519                                         return
520                                 }
521                                 if tty0 != nil {
522                                         cmd.Stdin = tty0
523                                         cmd.Stdout = tty0
524                                         cmd.Stderr = tty0
525                                         go io.Copy(ch, pty0)
526                                         go io.Copy(pty0, ch)
527                                         // Send our own debug messages to tty as well.
528                                         logw = tty0
529                                 } else {
530                                         // StdinPipe may seem
531                                         // superfluous here, but it's
532                                         // not: it causes cmd.Run() to
533                                         // return when the subprocess
534                                         // exits. Without it, Run()
535                                         // waits for stdin to close,
536                                         // which causes "ssh ... echo
537                                         // ok" (with the client's
538                                         // stdin connected to a
539                                         // terminal or something) to
540                                         // hang.
541                                         stdin, err := cmd.StdinPipe()
542                                         if err != nil {
543                                                 fmt.Fprintln(ch.Stderr(), err)
544                                                 ch.CloseWrite()
545                                                 resp.Status = 1
546                                                 return
547                                         }
548                                         go func() {
549                                                 io.Copy(stdin, ch)
550                                                 stdin.Close()
551                                         }()
552                                         cmd.Stdout = ch
553                                         cmd.Stderr = ch.Stderr()
554                                 }
555                                 cmd.SysProcAttr = &syscall.SysProcAttr{
556                                         Setctty: tty0 != nil,
557                                         Setsid:  true,
558                                 }
559                                 cmd.Env = append(os.Environ(), termEnv...)
560                                 err = cmd.Run()
561                                 if exiterr, ok := err.(*exec.ExitError); ok {
562                                         if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
563                                                 resp.Status = uint32(status.ExitStatus())
564                                         }
565                                 } else if err != nil {
566                                         // Propagate errors like `exec: "docker": executable file not found in $PATH`
567                                         fmt.Fprintln(ch.Stderr(), err)
568                                 }
569                                 errClose := ch.CloseWrite()
570                                 if resp.Status == 0 && (err != nil || errClose != nil) {
571                                         resp.Status = 1
572                                 }
573                         }()
574                 case "pty-req":
575                         eol = "\r\n"
576                         p, t, err := pty.Open()
577                         if err != nil {
578                                 fmt.Fprintf(ch.Stderr(), "pty failed: %s"+eol, err)
579                                 break
580                         }
581                         defer p.Close()
582                         defer t.Close()
583                         pty0, tty0 = p, t
584                         ok = true
585                         var payload struct {
586                                 Term string
587                                 Cols uint32
588                                 Rows uint32
589                                 X    uint32
590                                 Y    uint32
591                         }
592                         ssh.Unmarshal(req.Payload, &payload)
593                         termEnv = []string{"TERM=" + payload.Term, "USE_TTY=1"}
594                         err = pty.Setsize(pty0, &pty.Winsize{Rows: uint16(payload.Rows), Cols: uint16(payload.Cols), X: uint16(payload.X), Y: uint16(payload.Y)})
595                         if err != nil {
596                                 fmt.Fprintf(logw, "pty-req: setsize failed: %s"+eol, err)
597                         }
598                 case "window-change":
599                         var payload struct {
600                                 Cols uint32
601                                 Rows uint32
602                                 X    uint32
603                                 Y    uint32
604                         }
605                         ssh.Unmarshal(req.Payload, &payload)
606                         err := pty.Setsize(pty0, &pty.Winsize{Rows: uint16(payload.Rows), Cols: uint16(payload.Cols), X: uint16(payload.X), Y: uint16(payload.Y)})
607                         if err != nil {
608                                 fmt.Fprintf(logw, "window-change: setsize failed: %s"+eol, err)
609                                 break
610                         }
611                         ok = true
612                 case "env":
613                         // TODO: implement "env"
614                         // requests by setting env
615                         // vars in the docker-exec
616                         // command (not docker-exec's
617                         // own environment, which
618                         // would be a gaping security
619                         // hole).
620                 default:
621                         // fmt.Fprintf(logw, "declined request %q on ssh channel"+eol, req.Type)
622                 }
623                 if req.WantReply {
624                         req.Reply(ok, nil)
625                 }
626         }
627 }