21123: Add container_requests/{uuid}/container_status endpoint.
[arvados.git] / lib / controller / localdb / container_request.go
index 49e21840ea206f69684738e2f9aec98f0f6c2fd3..0234ee8fa6e618fa9d095c938cb2721ae69bda90 100644 (file)
@@ -6,8 +6,15 @@ package localdb
 
 import (
        "context"
+       "encoding/json"
+       "fmt"
+       "net/http"
+       "net/url"
 
+       "git.arvados.org/arvados.git/lib/dispatchcloud/scheduler"
        "git.arvados.org/arvados.git/sdk/go/arvados"
+       "git.arvados.org/arvados.git/sdk/go/auth"
+       "git.arvados.org/arvados.git/sdk/go/httpserver"
 )
 
 // ContainerRequestCreate defers to railsProxy for everything except
@@ -54,3 +61,87 @@ func (conn *Conn) ContainerRequestDelete(ctx context.Context, opts arvados.Delet
        conn.logActivity(ctx)
        return conn.railsProxy.ContainerRequestDelete(ctx, opts)
 }
+
+// ContainerRequestContainerStatus reports the state and scheduling
+// status of the container currently assigned to the container
+// request identified by opts.UUID.
+//
+// The container request is looked up with the caller's credentials,
+// so the usual permission checks apply. If no container has been
+// assigned yet, only SchedulingStatus is filled in. If the container
+// is Queued or Locked, each configured dispatchcloud service is asked
+// for its view of the container, and the first successful answer is
+// used.
+//
+// If no dispatcher returns a usable answer and at least one of them
+// failed with something other than "not found", the returned error
+// carries HTTP status 502 (Bad Gateway) so the client knows to
+// retry.
+func (conn *Conn) ContainerRequestContainerStatus(ctx context.Context, opts arvados.GetOptions) (arvados.ContainerStatus, error) {
+       conn.logActivity(ctx)
+       var ret arvados.ContainerStatus
+       cr, err := conn.railsProxy.ContainerRequestGet(ctx, arvados.GetOptions{UUID: opts.UUID, Select: []string{"uuid", "container_uuid", "log_uuid"}})
+       if err != nil {
+               return ret, err
+       }
+       if cr.ContainerUUID == "" {
+               ret.SchedulingStatus = "no container assigned"
+               return ret, nil
+       }
+       // We use admin credentials to get the container record so we
+       // don't get an error when we're in a race with auto-retry and
+       // the container became user-unreadable since we fetched the
+       // CR above.
+       ctxRoot := auth.NewContext(ctx, &auth.Credentials{Tokens: []string{conn.cluster.SystemRootToken}})
+       ctr, err := conn.railsProxy.ContainerGet(ctxRoot, arvados.GetOptions{UUID: cr.ContainerUUID, Select: []string{"uuid", "state", "priority"}})
+       if err != nil {
+               return ret, err
+       }
+       ret.UUID = ctr.UUID
+       ret.State = ctr.State
+       if ctr.State != arvados.ContainerStateQueued && ctr.State != arvados.ContainerStateLocked {
+               // Scheduling status is not a thing once the container
+               // is in running state.
+               return ret, nil
+       }
+       // Escape the UUID so it cannot alter the structure of the
+       // query string, whatever characters it contains.
+       statusPath := "/arvados/v1/dispatch/container?container_uuid=" + url.QueryEscape(cr.ContainerUUID)
+       var lastErr error
+       for dispatchurl := range conn.cluster.Services.DispatchCloud.InternalURLs {
+               baseurl := url.URL(dispatchurl)
+               apiurl, err := baseurl.Parse(statusPath)
+               if err != nil {
+                       lastErr = err
+                       continue
+               }
+               req, err := http.NewRequestWithContext(ctx, http.MethodGet, apiurl.String(), nil)
+               if err != nil {
+                       lastErr = err
+                       continue
+               }
+               req.Header.Set("Authorization", "Bearer "+conn.cluster.ManagementToken)
+               resp, err := http.DefaultClient.Do(req)
+               if err != nil {
+                       lastErr = fmt.Errorf("error getting status from dispatcher: %w", err)
+                       continue
+               }
+               // The response body is closed explicitly on every
+               // path below (rather than with defer) so that
+               // connections are released as soon as each dispatcher
+               // has been handled, not when the function returns.
+               if resp.StatusCode == http.StatusNotFound {
+                       // This dispatcher doesn't know about the
+                       // container. That is not an error; try the
+                       // next one.
+                       resp.Body.Close()
+                       continue
+               } else if resp.StatusCode != http.StatusOK {
+                       resp.Body.Close()
+                       lastErr = fmt.Errorf("error getting status from dispatcher: %s", resp.Status)
+                       continue
+               }
+               var qent scheduler.QueueEnt
+               err = json.NewDecoder(resp.Body).Decode(&qent)
+               resp.Body.Close()
+               if err != nil {
+                       lastErr = err
+                       continue
+               }
+               ret.State = qent.Container.State // Prefer dispatcher's view of state if not equal to ctr.State
+               ret.SchedulingStatus = qent.SchedulingStatus
+               return ret, nil
+       }
+       if lastErr != nil {
+               // If we got a non-nil error from a dispatchcloud
+               // service, and the container state suggests
+               // dispatchcloud should know about it, then we return
+               // an error so the client knows to retry.
+               return ret, httpserver.ErrorWithStatus(lastErr, http.StatusBadGateway)
+       }
+       // All running dispatchcloud services confirm they don't have
+       // this container (the dispatcher hasn't yet noticed it
+       // appearing in the queue) or there are no dispatchcloud
+       // services configured. Either way, all we can say is that
+       // it's queued.
+       if ctr.State == arvados.ContainerStateQueued && ctr.Priority < 1 {
+               // If it hasn't been picked up by a dispatcher
+               // already, it won't be -- it's just on hold.
+               // Scheduling status does not apply.
+               return ret, nil
+       }
+       ret.SchedulingStatus = "waiting for dispatch"
+       return ret, nil
+}