From 8052381fb4e7aceb52497e8378b596178cf5af7c Mon Sep 17 00:00:00 2001 From: Tom Clegg Date: Fri, 18 Jan 2019 16:28:45 -0500 Subject: [PATCH] 14325: Cancel containers with unsatisfiable runtime constraints. Arvados-DCO-1.1-Signed-off-by: Tom Clegg --- lib/dispatchcloud/container/queue.go | 50 +++++++++++++++++++++-- lib/dispatchcloud/container/queue_test.go | 48 ++++++++++++++++++++-- sdk/go/arvados/container.go | 31 +++++++------- 3 files changed, 107 insertions(+), 22 deletions(-) diff --git a/lib/dispatchcloud/container/queue.go b/lib/dispatchcloud/container/queue.go index 965407e518..7a41d47c39 100644 --- a/lib/dispatchcloud/container/queue.go +++ b/lib/dispatchcloud/container/queue.go @@ -211,9 +211,41 @@ func (cq *Queue) Update() error { func (cq *Queue) addEnt(uuid string, ctr arvados.Container) { it, err := cq.chooseType(&ctr) - if err != nil { - // FIXME: throttle warnings, cancel after timeout - cq.logger.Warnf("cannot run %s", &ctr) + if err != nil && (ctr.State == arvados.ContainerStateQueued || ctr.State == arvados.ContainerStateLocked) { + errorString := err.Error() + cq.logger.WithField("ContainerUUID", ctr.UUID).Warn("cancel container with no suitable instance type") + go func() { + var err error + defer func() { + if err == nil { + return + } + // On failure, check current container + // state, and don't log the error if + // the failure came from losing a + // race. + var latest arvados.Container + cq.client.RequestAndDecode(&latest, "GET", "arvados/v1/containers/"+ctr.UUID, nil, map[string][]string{"select": {"state"}}) + if latest.State == arvados.ContainerStateCancelled { + return + } + cq.logger.WithField("ContainerUUID", ctr.UUID).WithError(err).Warn("error while trying to cancel unsatisfiable container") + }() + if ctr.State == arvados.ContainerStateQueued { + err = cq.Lock(ctr.UUID) + if err != nil { + return + } + } + err = cq.setRuntimeError(ctr.UUID, errorString) + if err != nil { + return + } + err = cq.Cancel(ctr.UUID) + if err != nil { + return + } + }() return } cq.current[uuid] = QueueEnt{Container: ctr, InstanceType: it} @@ -229,6 +261,18 @@ func (cq *Queue) Unlock(uuid string) error { return cq.apiUpdate(uuid, "unlock") } +// setRuntimeError sets runtime_status["error"] to the given value. +// Container should already have state==Locked or Running. +func (cq *Queue) setRuntimeError(uuid, errorString string) error { + return cq.client.RequestAndDecode(nil, "PUT", "arvados/v1/containers/"+uuid, nil, map[string]map[string]map[string]interface{}{ + "container": { + "runtime_status": { + "error": errorString, + }, + }, + }) +} + // Cancel cancels the given container. func (cq *Queue) Cancel(uuid string) error { err := cq.client.RequestAndDecode(nil, "PUT", "arvados/v1/containers/"+uuid, nil, map[string]map[string]interface{}{ diff --git a/lib/dispatchcloud/container/queue_test.go b/lib/dispatchcloud/container/queue_test.go index 9d2f830903..a84497424a 100644 --- a/lib/dispatchcloud/container/queue_test.go +++ b/lib/dispatchcloud/container/queue_test.go @@ -5,6 +5,7 @@ package container import ( + "errors" "sync" "testing" "time" @@ -24,9 +25,18 @@ var _ = check.Suite(&IntegrationSuite{}) type IntegrationSuite struct{} -func (*IntegrationSuite) TestControllerBackedQueue(c *check.C) { +func (suite *IntegrationSuite) TearDownTest(c *check.C) { + err := arvados.NewClientFromEnv().RequestAndDecode(nil, "POST", "database/reset", nil, nil) + c.Check(err, check.IsNil) +} + +func (suite *IntegrationSuite) TestGetLockUnlockCancel(c *check.C) { + typeChooser := func(ctr *arvados.Container) (arvados.InstanceType, error) { + return arvados.InstanceType{Name: "testType"}, nil + } + client := arvados.NewClientFromEnv() - cq := NewQueue(logrus.StandardLogger(), nil, testTypeChooser, client) + cq := NewQueue(logrus.StandardLogger(), nil, typeChooser, client) err := cq.Update() c.Check(err, check.IsNil) @@ -77,6 +87,36 @@ func (*IntegrationSuite) TestControllerBackedQueue(c *check.C) { c.Check(err, check.ErrorMatches, `.*State cannot change from Complete to Cancelled.*`) } -func testTypeChooser(ctr *arvados.Container) (arvados.InstanceType, error) { - return arvados.InstanceType{Name: "testType"}, nil +func (suite *IntegrationSuite) TestCancelIfNoInstanceType(c *check.C) { + errorTypeChooser := func(ctr *arvados.Container) (arvados.InstanceType, error) { + return arvados.InstanceType{}, errors.New("no suitable instance type") + } + + client := arvados.NewClientFromEnv() + cq := NewQueue(logrus.StandardLogger(), nil, errorTypeChooser, client) + + var ctr arvados.Container + err := client.RequestAndDecode(&ctr, "GET", "arvados/v1/containers/"+arvadostest.QueuedContainerUUID, nil, nil) + c.Check(err, check.IsNil) + c.Check(ctr.State, check.Equals, arvados.ContainerStateQueued) + + cq.Update() + + // Wait for the cancel operation to take effect. Container + // will have state=Cancelled or just disappear from the queue. + suite.waitfor(c, time.Second, func() bool { + err := client.RequestAndDecode(&ctr, "GET", "arvados/v1/containers/"+arvadostest.QueuedContainerUUID, nil, nil) + return err == nil && ctr.State == arvados.ContainerStateCancelled + }) + c.Check(ctr.RuntimeStatus["error"], check.Equals, `no suitable instance type`) +} + +func (suite *IntegrationSuite) waitfor(c *check.C, timeout time.Duration, fn func() bool) { + defer func() { + c.Check(fn(), check.Equals, true) + }() + deadline := time.Now().Add(timeout) + for !fn() && time.Now().Before(deadline) { + time.Sleep(timeout / 1000) + } } diff --git a/sdk/go/arvados/container.go b/sdk/go/arvados/container.go index 02a0d76dec..fb095481bb 100644 --- a/sdk/go/arvados/container.go +++ b/sdk/go/arvados/container.go @@ -8,21 +8,22 @@ import "time" // Container is an arvados#container resource. type Container struct { - UUID string `json:"uuid"` - CreatedAt time.Time `json:"created_at"` - Command []string `json:"command"` - ContainerImage string `json:"container_image"` - Cwd string `json:"cwd"` - Environment map[string]string `json:"environment"` - LockedByUUID string `json:"locked_by_uuid"` - Mounts map[string]Mount `json:"mounts"` - Output string `json:"output"` - OutputPath string `json:"output_path"` - Priority int64 `json:"priority"` - RuntimeConstraints RuntimeConstraints `json:"runtime_constraints"` - State ContainerState `json:"state"` - SchedulingParameters SchedulingParameters `json:"scheduling_parameters"` - ExitCode int `json:"exit_code"` + UUID string `json:"uuid"` + CreatedAt time.Time `json:"created_at"` + Command []string `json:"command"` + ContainerImage string `json:"container_image"` + Cwd string `json:"cwd"` + Environment map[string]string `json:"environment"` + LockedByUUID string `json:"locked_by_uuid"` + Mounts map[string]Mount `json:"mounts"` + Output string `json:"output"` + OutputPath string `json:"output_path"` + Priority int64 `json:"priority"` + RuntimeConstraints RuntimeConstraints `json:"runtime_constraints"` + State ContainerState `json:"state"` + SchedulingParameters SchedulingParameters `json:"scheduling_parameters"` + ExitCode int `json:"exit_code"` + RuntimeStatus map[string]interface{} `json:"runtime_status"` } // Container is an arvados#container resource. -- 2.39.5