19166: Close ssh session when exec/shell command exits.
[arvados.git] / lib / crunchrun / container_gateway.go
index 2ec24bac788f5a301a823621724eff82438a5fd7..1002de7335495e8d6c42e9afa151f0346a4c7267 100644 (file)
@@ -14,33 +14,69 @@ import (
        "io"
        "net"
        "net/http"
+       "net/url"
        "os"
        "os/exec"
        "sync"
-       "sync/atomic"
        "syscall"
        "time"
 
+       "git.arvados.org/arvados.git/lib/controller/rpc"
        "git.arvados.org/arvados.git/lib/selfsigned"
+       "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/auth"
        "git.arvados.org/arvados.git/sdk/go/ctxlog"
        "git.arvados.org/arvados.git/sdk/go/httpserver"
        "github.com/creack/pty"
-       dockerclient "github.com/docker/docker/client"
        "github.com/google/shlex"
+       "github.com/hashicorp/yamux"
        "golang.org/x/crypto/ssh"
        "golang.org/x/net/context"
 )
 
+// GatewayTarget is what a Gateway needs from the container it
+// serves: a way to run a command inside the container, and the
+// container's IP address.
+type GatewayTarget interface {
+       // InjectCommand returns a command that, when run, will
+       // execute cmd inside the container.
+       //
+       // NOTE(review): detachKeys, username and usingTTY are
+       // presumably handed to the container runtime's "exec"
+       // facility; confirm against the implementations.
+       InjectCommand(ctx context.Context, detachKeys, username string, usingTTY bool, cmd []string) (*exec.Cmd, error)
+
+       // IPAddress returns the IP address inside the container.
+       IPAddress() (string, error)
+}
+
+// GatewayTargetStub is a GatewayTarget that does not involve a
+// container: commands run directly as local processes, and the
+// reported IP address is the loopback address.
+type GatewayTargetStub struct{}
+
+// IPAddress always reports the loopback address.
+func (GatewayTargetStub) IPAddress() (string, error) { return "127.0.0.1", nil }
+
+// InjectCommand returns a command that runs cmd as a local process
+// bound to ctx. The detachKeys, username and usingTTY arguments are
+// ignored.
+//
+// An empty cmd is reported as an error instead of causing an
+// index-out-of-range panic.
+func (GatewayTargetStub) InjectCommand(ctx context.Context, detachKeys, username string, usingTTY bool, cmd []string) (*exec.Cmd, error) {
+       if len(cmd) == 0 {
+               return nil, fmt.Errorf("cannot inject an empty command")
+       }
+       return exec.CommandContext(ctx, cmd[0], cmd[1:]...), nil
+}
+
 type Gateway struct {
-       DockerContainerID *string
-       ContainerUUID     string
-       Address           string // listen host:port; if port=0, Start() will change it to the selected port
-       AuthSecret        string
-       Log               interface {
+       ContainerUUID string
+       // Caller should set Address to "", or "host:0" or "host:port"
+       // where host is a known external IP address; port is a
+       // desired port number to listen on; and ":0" chooses an
+       // available dynamic port.
+       //
+       // If Address is "", Start() listens only on the loopback
+       // interface (and changes Address to "127.0.0.1:port").
+       // Otherwise it listens on all interfaces.
+       //
+       // If Address is "host:0", Start() updates Address to
+       // "host:port".
+       Address    string
+       AuthSecret string
+       Target     GatewayTarget
+       Log        interface {
                Printf(fmt string, args ...interface{})
        }
-       // return local ip address of running container, or "" if not available
-       ContainerIPAddress func() (string, error)
+       // If non-nil, set up a ContainerGatewayTunnel, so that the
+       // controller can connect to us even if our external IP
+       // address is unknown or not routable from controller.
+       ArvadosClient *arvados.Client
+
+       // When a tunnel is connected or reconnected, this func (if
+       // not nil) will be called with the InternalURL of the
+       // controller process at the other end of the tunnel.
+       UpdateTunnelURL func(url string)
 
        sshConfig   ssh.ServerConfig
        requestAuth string
@@ -89,7 +125,22 @@ func (gw *Gateway) Start() error {
        // from arvados-controller, and PORT is either the desired
        // port where we should run our gateway server, or "0" if we
        // should choose an available port.
-       host, port, err := net.SplitHostPort(gw.Address)
+       extAddr := gw.Address
+       // Generally we can't know which local interface corresponds
+       // to an externally reachable IP address, so if we expect to
+       // be reachable by external hosts, we listen on all
+       // interfaces.
+       listenHost := ""
+       if extAddr == "" {
+               // If the dispatcher doesn't tell us our external IP
+               // address, controller will only be able to connect
+               // through the tunnel (see runTunnel), so our gateway
+               // server only needs to listen on the loopback
+               // interface.
+               extAddr = "127.0.0.1:0"
+               listenHost = "127.0.0.1"
+       }
+       extHost, extPort, err := net.SplitHostPort(extAddr)
        if err != nil {
                return err
        }
@@ -111,26 +162,104 @@ func (gw *Gateway) Start() error {
                                Certificates: []tls.Certificate{cert},
                        },
                },
-               Addr: ":" + port,
+               Addr: net.JoinHostPort(listenHost, extPort),
        }
        err = srv.Start()
        if err != nil {
                return err
        }
-       // Get the port number we are listening on (the port might be
+       go func() {
+               err := srv.Wait()
+               gw.Log.Printf("gateway server stopped: %s", err)
+       }()
+       // Get the port number we are listening on (extPort might be
        // "0" or a port name, in which case this will be different).
-       _, port, err = net.SplitHostPort(srv.Addr)
+       _, listenPort, err := net.SplitHostPort(srv.Addr)
        if err != nil {
                return err
        }
-       // When changing state to Running, we will set
-       // gateway_address to "HOST:PORT" where HOST is our
-       // external hostname/IP as provided by arvados-dispatch-cloud,
-       // and PORT is the port number we ended up listening on.
-       gw.Address = net.JoinHostPort(host, port)
+       // When changing state to Running, the caller will want to set
+       // gateway_address to a "HOST:PORT" that, if controller
+       // connects to it, will reach this gateway server.
+       //
+       // The most likely thing to work is: HOST is our external
+       // hostname/IP as provided by the caller
+       // (arvados-dispatch-cloud) or 127.0.0.1 to indicate
+       // non-tunnel connections aren't available; and PORT is the
+       // port number we are listening on.
+       gw.Address = net.JoinHostPort(extHost, listenPort)
+       gw.Log.Printf("gateway server listening at %s", gw.Address)
+       if gw.ArvadosClient != nil {
+               go gw.maintainTunnel(gw.Address)
+       }
        return nil
 }
 
+// maintainTunnel keeps trying to run a controller tunnel to the
+// gateway server at addr. Each time runTunnel returns, the result
+// is logged and a new attempt is made after a 5 second pause. It
+// never returns, so callers run it in its own goroutine.
+func (gw *Gateway) maintainTunnel(addr string) {
+       const retryDelay = 5 * time.Second
+       for {
+               tunnelErr := gw.runTunnel(addr)
+               gw.Log.Printf("runTunnel: %s", tunnelErr)
+               time.Sleep(retryDelay)
+       }
+}
+
+// runTunnel connects to controller and sets up a tunnel through
+// which controller can connect to the gateway server at the given
+// addr.
+//
+// The tunnel is multiplexed: each stream opened by the controller
+// end is relayed, in its own goroutine, to a new TCP connection to
+// addr. If the tunnel response carries an X-Arvados-Internal-Url
+// header and gw.UpdateTunnelURL is set, the header value is passed
+// to it before any streams are accepted.
+//
+// runTunnel blocks until the tunnel cannot be set up or a stream
+// cannot be accepted, and returns that error. The mux session (and
+// with it the underlying tunnel connection) is closed on return,
+// so a caller that retries does not accumulate dead sessions.
+func (gw *Gateway) runTunnel(addr string) error {
+       ctx := auth.NewContext(context.Background(), auth.NewCredentials(gw.ArvadosClient.AuthToken))
+       arpc := rpc.NewConn("", &url.URL{Scheme: "https", Host: gw.ArvadosClient.APIHost}, gw.ArvadosClient.Insecure, rpc.PassthroughTokenProvider)
+       tun, err := arpc.ContainerGatewayTunnel(ctx, arvados.ContainerGatewayTunnelOptions{
+               UUID:       gw.ContainerUUID,
+               AuthSecret: gw.AuthSecret,
+       })
+       if err != nil {
+               return fmt.Errorf("error creating gateway tunnel: %w", err)
+       }
+       mux, err := yamux.Client(tun.Conn, nil)
+       if err != nil {
+               // No session took ownership of the connection, so it
+               // has to be closed here.
+               tun.Conn.Close()
+               return fmt.Errorf("error setting up mux client end: %w", err)
+       }
+       // Closing the session also closes the underlying tunnel
+       // connection and any streams that are still open.
+       defer mux.Close()
+       if internalURL := tun.Header.Get("X-Arvados-Internal-Url"); internalURL != "" && gw.UpdateTunnelURL != nil {
+               gw.UpdateTunnelURL(internalURL)
+       }
+       for {
+               muxconn, err := mux.AcceptStream()
+               if err != nil {
+                       return err
+               }
+               gw.Log.Printf("tunnel connection %d started", muxconn.StreamID())
+               go func() {
+                       defer muxconn.Close()
+                       gwconn, err := net.Dial("tcp", addr)
+                       if err != nil {
+                               gw.Log.Printf("tunnel connection %d: error connecting to %s: %s", muxconn.StreamID(), addr, err)
+                               return
+                       }
+                       defer gwconn.Close()
+                       // Relay in both directions. When either
+                       // direction ends, closing the opposite end
+                       // unblocks the other copy so both goroutines
+                       // finish.
+                       var wg sync.WaitGroup
+                       wg.Add(2)
+                       go func() {
+                               defer wg.Done()
+                               _, err := io.Copy(gwconn, muxconn)
+                               if err != nil {
+                                       gw.Log.Printf("tunnel connection %d: mux end: %s", muxconn.StreamID(), err)
+                               }
+                               gwconn.Close()
+                       }()
+                       go func() {
+                               defer wg.Done()
+                               _, err := io.Copy(muxconn, gwconn)
+                               if err != nil {
+                                       gw.Log.Printf("tunnel connection %d: gateway end: %s", muxconn.StreamID(), err)
+                               }
+                               muxconn.Close()
+                       }()
+                       wg.Wait()
+                       gw.Log.Printf("tunnel connection %d finished", muxconn.StreamID())
+               }()
+       }
+}
+
 // handleSSH connects to an SSH server that allows the caller to run
 // interactive commands as root (or any other desired user) inside the
 // container. The tunnel itself can only be created by an
@@ -156,11 +285,12 @@ func (gw *Gateway) handleSSH(w http.ResponseWriter, req *http.Request) {
        // In future we'll handle browser traffic too, but for now the
        // only traffic we expect is an SSH tunnel from
        // (*lib/controller/localdb.Conn)ContainerSSH()
-       if req.Method != "GET" || req.Header.Get("Upgrade") != "ssh" {
+       if req.Method != "POST" || req.Header.Get("Upgrade") != "ssh" {
                http.Error(w, "path not found", http.StatusNotFound)
                return
        }
-       if want := req.Header.Get("X-Arvados-Target-Uuid"); want != gw.ContainerUUID {
+       req.ParseForm()
+       if want := req.Form.Get("uuid"); want != gw.ContainerUUID {
                http.Error(w, fmt.Sprintf("misdirected request: meant for %q but received by crunch-run %q", want, gw.ContainerUUID), http.StatusBadGateway)
                return
        }
@@ -168,8 +298,8 @@ func (gw *Gateway) handleSSH(w http.ResponseWriter, req *http.Request) {
                http.Error(w, "bad X-Arvados-Authorization header", http.StatusUnauthorized)
                return
        }
-       detachKeys := req.Header.Get("X-Arvados-Detach-Keys")
-       username := req.Header.Get("X-Arvados-Login-Username")
+       detachKeys := req.Form.Get("detach_keys")
+       username := req.Form.Get("login_username")
        if username == "" {
                username = "root"
        }
@@ -194,7 +324,9 @@ func (gw *Gateway) handleSSH(w http.ResponseWriter, req *http.Request) {
        ctx := req.Context()
 
        conn, newchans, reqs, err := ssh.NewServerConn(netconn, &gw.sshConfig)
-       if err != nil {
+       if err == io.EOF {
+               return
+       } else if err != nil {
                gw.Log.Printf("ssh.NewServerConn: %s", err)
                return
        }
@@ -241,15 +373,11 @@ func (gw *Gateway) handleDirectTCPIP(ctx context.Context, newch ssh.NewChannel)
                return
        }
 
-       var dstaddr string
-       if gw.ContainerIPAddress != nil {
-               dstaddr, err = gw.ContainerIPAddress()
-               if err != nil {
-                       fmt.Fprintf(ch.Stderr(), "container has no IP address: %s\n", err)
-                       return
-               }
-       }
-       if dstaddr == "" {
+       dstaddr, err := gw.Target.IPAddress()
+       if err != nil {
+               fmt.Fprintf(ch.Stderr(), "container has no IP address: %s\n", err)
+               return
+       } else if dstaddr == "" {
                fmt.Fprintf(ch.Stderr(), "container has no IP address\n")
                return
        }
@@ -272,9 +400,11 @@ func (gw *Gateway) handleDirectTCPIP(ctx context.Context, newch ssh.NewChannel)
 func (gw *Gateway) handleSession(ctx context.Context, newch ssh.NewChannel, detachKeys, username string) {
        ch, reqs, err := newch.Accept()
        if err != nil {
-               gw.Log.Printf("accept session channel: %s", err)
+               gw.Log.Printf("error accepting session channel: %s", err)
                return
        }
+       defer ch.Close()
+
        var pty0, tty0 *os.File
        // Where to send errors/messages for the client to see
        logw := io.Writer(ch.Stderr())
@@ -283,10 +413,28 @@ func (gw *Gateway) handleSession(ctx context.Context, newch ssh.NewChannel, deta
        eol := "\n"
        // Env vars to add to child process
        termEnv := []string(nil)
-       for req := range reqs {
+
+       started := 0
+       wantClose := make(chan struct{})
+       for {
+               var req *ssh.Request
+               select {
+               case r, ok := <-reqs:
+                       if !ok {
+                               return
+                       }
+                       req = r
+               case <-wantClose:
+                       return
+               }
                ok := false
                switch req.Type {
                case "shell", "exec":
+                       if started++; started != 1 {
+                               // RFC 4254 6.5: "Only one of these
+                               // requests can succeed per channel."
+                               break
+                       }
                        ok = true
                        var payload struct {
                                Command string
@@ -301,34 +449,61 @@ func (gw *Gateway) handleSession(ctx context.Context, newch ssh.NewChannel, deta
                                execargs = []string{"/bin/bash", "-login"}
                        }
                        go func() {
-                               cmd := exec.CommandContext(ctx, "docker", "exec", "-i", "--detach-keys="+detachKeys, "--user="+username)
-                               cmd.Stdin = ch
-                               cmd.Stdout = ch
-                               cmd.Stderr = ch.Stderr()
+                               var resp struct {
+                                       Status uint32
+                               }
+                               defer func() {
+                                       ch.SendRequest("exit-status", false, ssh.Marshal(&resp))
+                                       close(wantClose)
+                               }()
+
+                               cmd, err := gw.Target.InjectCommand(ctx, detachKeys, username, tty0 != nil, execargs)
+                               if err != nil {
+                                       fmt.Fprintln(ch.Stderr(), err)
+                                       ch.CloseWrite()
+                                       resp.Status = 1
+                                       return
+                               }
                                if tty0 != nil {
-                                       cmd.Args = append(cmd.Args, "-t")
                                        cmd.Stdin = tty0
                                        cmd.Stdout = tty0
                                        cmd.Stderr = tty0
-                                       var wg sync.WaitGroup
-                                       defer wg.Wait()
-                                       wg.Add(2)
-                                       go func() { io.Copy(ch, pty0); wg.Done() }()
-                                       go func() { io.Copy(pty0, ch); wg.Done() }()
+                                       go io.Copy(ch, pty0)
+                                       go io.Copy(pty0, ch)
                                        // Send our own debug messages to tty as well.
                                        logw = tty0
+                               } else {
+                                       // StdinPipe may seem
+                                       // superfluous here, but it's
+                                       // not: it causes cmd.Run() to
+                                       // return when the subprocess
+                                       // exits. Without it, Run()
+                                       // waits for stdin to close,
+                                       // which causes "ssh ... echo
+                                       // ok" (with the client's
+                                       // stdin connected to a
+                                       // terminal or something) to
+                                       // hang.
+                                       stdin, err := cmd.StdinPipe()
+                                       if err != nil {
+                                               fmt.Fprintln(ch.Stderr(), err)
+                                               ch.CloseWrite()
+                                               resp.Status = 1
+                                               return
+                                       }
+                                       go func() {
+                                               io.Copy(stdin, ch)
+                                               stdin.Close()
+                                       }()
+                                       cmd.Stdout = ch
+                                       cmd.Stderr = ch.Stderr()
                                }
-                               cmd.Args = append(cmd.Args, *gw.DockerContainerID)
-                               cmd.Args = append(cmd.Args, execargs...)
                                cmd.SysProcAttr = &syscall.SysProcAttr{
                                        Setctty: tty0 != nil,
                                        Setsid:  true,
                                }
                                cmd.Env = append(os.Environ(), termEnv...)
-                               err := cmd.Run()
-                               var resp struct {
-                                       Status uint32
-                               }
+                               err = cmd.Run()
                                if exiterr, ok := err.(*exec.ExitError); ok {
                                        if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
                                                resp.Status = uint32(status.ExitStatus())
@@ -341,8 +516,6 @@ func (gw *Gateway) handleSession(ctx context.Context, newch ssh.NewChannel, deta
                                if resp.Status == 0 && (err != nil || errClose != nil) {
                                        resp.Status = 1
                                }
-                               ch.SendRequest("exit-status", false, ssh.Marshal(&resp))
-                               ch.Close()
                        }()
                case "pty-req":
                        eol = "\r\n"
@@ -391,38 +564,10 @@ func (gw *Gateway) handleSession(ctx context.Context, newch ssh.NewChannel, deta
                        // would be a gaping security
                        // hole).
                default:
-                       // fmt.Fprintf(logw, "declining %q req"+eol, req.Type)
+                       fmt.Fprintf(logw, "declined request %q on ssh channel"+eol, req.Type)
                }
                if req.WantReply {
                        req.Reply(ok, nil)
                }
        }
 }
-
-func dockerContainerIPAddress(containerID *string) func() (string, error) {
-       var saved atomic.Value
-       return func() (string, error) {
-               if ip, ok := saved.Load().(*string); ok {
-                       return *ip, nil
-               }
-               docker, err := dockerclient.NewClient(dockerclient.DefaultDockerHost, "1.21", nil, nil)
-               if err != nil {
-                       return "", fmt.Errorf("cannot create docker client: %s", err)
-               }
-               ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(time.Minute))
-               defer cancel()
-               ctr, err := docker.ContainerInspect(ctx, *containerID)
-               if err != nil {
-                       return "", fmt.Errorf("cannot get docker container info: %s", err)
-               }
-               ip := ctr.NetworkSettings.IPAddress
-               if ip == "" {
-                       // TODO: try to enable networking if it wasn't
-                       // already enabled when the container was
-                       // created.
-                       return "", fmt.Errorf("container has no IP address")
-               }
-               saved.Store(&ip)
-               return ip, nil
-       }
-}