X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/c3c538444c15e68e96780f157935f2baa4ba0bc5..855a0afbc604487ddaedaed0cc1a4ad6da34b602:/lib/dispatchcloud/readme.go diff --git a/lib/dispatchcloud/readme.go b/lib/dispatchcloud/readme.go index a4b005eb83..c8491fb1df 100644 --- a/lib/dispatchcloud/readme.go +++ b/lib/dispatchcloud/readme.go @@ -5,33 +5,35 @@ package dispatchcloud // A dispatcher comprises a container queue, a scheduler, a worker -// pool, a cloud provider, a stale-lock fixer, and a syncer. +// pool, a remote command executor, and a cloud driver. // 1. Choose a provider. // 2. Start a worker pool. // 3. Start a container queue. -// 4. Run a stale-lock fixer. -// 5. Start a scheduler. -// 6. Start a syncer. +// 4. Run the scheduler's stale-lock fixer. +// 5. Run the scheduler's mapper. +// 6. Run the scheduler's syncer. +// 7. Wait for updates to the container queue or worker pool. +// 8. Repeat from 5. // // -// A provider (cloud driver) creates new cloud VM instances and gets -// the latest list of instances. The returned instances implement -// proxies to the provider's metadata and control interfaces (get IP -// address, update tags, shutdown). +// A cloud driver creates new cloud VM instances and gets the latest +// list of instances. The returned instances are caches/proxies for +// the provider's metadata and control interfaces (get IP address, +// update tags, shutdown). // // -// A workerPool tracks workers' instance types and readiness states +// A worker pool tracks workers' instance types and readiness states // (available to do work now, booting, suffering a temporary network // outage, shutting down). It loads internal state from the cloud // provider's list of instances at startup, and syncs periodically // after that. // // -// A worker maintains a multiplexed SSH connection to a cloud -// instance, retrying/reconnecting as needed, so the workerPool can -// execute commands. 
It asks the provider's instance to verify its SSH -// public key once when first connecting, and again later if the key -// changes. +// An executor maintains a multiplexed SSH connection to a cloud +// instance, retrying/reconnecting as needed, so the worker pool can +// execute commands. It asks the cloud driver's instance to verify its +// SSH public key once when first connecting, and again later if the +// key changes. // // // A container queue tracks the known state (according to @@ -44,36 +46,25 @@ package dispatchcloud // lock/unlock/cancel operation.) // // -// A stale-lock fixer waits for any already-locked containers (i.e., -// locked by a prior server process) to appear on workers as the -// worker pool recovers its state. It unlocks/requeues any that still -// remain when all workers are recovered or shutdown, or its timer -// expires. +// The scheduler's stale-lock fixer waits for any already-locked +// containers (i.e., locked by a prior dispatcher process) to appear +// on workers as the worker pool recovers its state. It +// unlocks/requeues any that still remain when all workers are +// recovered or shut down, or its timer expires. // // -// A scheduler chooses which containers to assign to which idle -// workers, and decides what to do when there are not enough idle +// The scheduler's mapper chooses which containers to assign to which +// idle workers, and decides what to do when there are not enough idle // workers (including shutting down some idle nodes). // // -// A syncer updates state to Cancelled when a running container -// process dies without finalizing its entry in the controller -// database. It also calls the worker pool to kill containers that -// have priority=0 while locked or running. +// The scheduler's syncer updates state to Cancelled when a running +// container process dies without finalizing its entry in the +// controller database. 
It also calls the worker pool to kill +// containers that have priority=0 while locked or running. // // -// A provider proxy wraps a provider with rate-limiting logic. After -// the wrapped provider receives a cloud.RateLimitError, the proxy -// starts returning errors to callers immediately without calling -// through to the wrapped provider. -// -// -// TBD: Bootstrapping script via SSH, too? Future version. -// -// TBD: drain instance, keep instance alive -// TBD: metrics, diagnostics -// TBD: why dispatch token currently passed to worker? -// -// Metrics: queue size, time job has been in queued, #idle/busy/booting nodes -// Timing in each step, and end-to-end -// Metrics: boot/idle/alloc time and cost +// An instance set proxy wraps a driver's instance set with +// rate-limiting logic. After the wrapped instance set receives a +// cloud.RateLimitError, the proxy starts returning errors to callers +// immediately without calling through to the wrapped instance set.