15003: Add dispatch-cloud configs to default/template file.
author Tom Clegg <tclegg@veritasgenetics.com>
Thu, 23 May 2019 15:38:09 +0000 (11:38 -0400)
committer Tom Clegg <tclegg@veritasgenetics.com>
Thu, 23 May 2019 15:38:09 +0000 (11:38 -0400)
Arvados-DCO-1.1-Signed-off-by: Tom Clegg <tclegg@veritasgenetics.com>

lib/config/config.default.yml
lib/config/generated_config.go
lib/config/load_test.go
lib/controller/handler.go
lib/controller/handler_test.go
lib/service/cmd.go
sdk/go/arvados/config.go
sdk/go/arvados/duration.go
sdk/python/tests/run_test_server.py

index 363d7eb02957142f7792edf286e90e539d3eff2a..4d4937c470801910a6005bf955703c9e3dbcf550 100644 (file)
@@ -21,11 +21,7 @@ Clusters:
     Services:
       RailsAPI:
         InternalURLs: {}
-      GitHTTP:
-        InternalURLs: {}
-        ExternalURL: ""
-      Keepstore:
-        InternalURLs: {}
+        ExternalURL: "-"
       Controller:
         InternalURLs: {}
         ExternalURL: ""
@@ -34,6 +30,7 @@ Clusters:
         ExternalURL: ""
       Keepbalance:
         InternalURLs: {}
+        ExternalURL: "-"
       GitHTTP:
         InternalURLs: {}
         ExternalURL: ""
@@ -41,6 +38,7 @@ Clusters:
         ExternalURL: ""
       DispatchCloud:
         InternalURLs: {}
+        ExternalURL: "-"
       SSO:
         ExternalURL: ""
       Keepproxy:
@@ -54,6 +52,7 @@ Clusters:
         ExternalURL: ""
       Keepstore:
         InternalURLs: {}
+        ExternalURL: "-"
       Composer:
         ExternalURL: ""
       WebShell:
@@ -63,6 +62,13 @@ Clusters:
         ExternalURL: ""
       Workbench2:
         ExternalURL: ""
+      Nodemanager:
+        InternalURLs: {}
+        ExternalURL: "-"
+      Health:
+        InternalURLs: {}
+        ExternalURL: "-"
+
     PostgreSQL:
       # max concurrent connections per arvados server daemon
       ConnectionPool: 32
@@ -118,6 +124,9 @@ Clusters:
       # site secret. It should be at least 50 characters.
       RailsSessionSecretToken: ""
 
+      # Maximum wall clock time to spend handling an incoming request.
+      RequestTimeout: 5m
+
     Users:
       # Config parameters to automatically setup new users.  If enabled,
       # this users will be able to self-activate.  Enable this if you want
@@ -185,6 +194,14 @@ Clusters:
       UnloggedAttributes: []
 
     SystemLogs:
+
+      # Logging threshold: panic, fatal, error, warn, info, debug, or
+      # trace
+      LogLevel: info
+
+      # Logging format: json or text
+      Format: json
+
       # Maximum characters of (JSON-encoded) query parameters to include
       # in each request log entry. When params exceed this size, they will
       # be JSON-encoded, truncated to this size, and logged as
@@ -271,6 +288,8 @@ Clusters:
       Repositories: /var/lib/arvados/git/repositories
 
     TLS:
+      Certificate: ""
+      Key: ""
       Insecure: false
 
     Containers:
@@ -323,6 +342,16 @@ Clusters:
       # troubleshooting purposes.
       LogReuseDecisions: false
 
+      # PEM encoded SSH key (RSA, DSA, or ECDSA) used by the
+      # (experimental) cloud dispatcher for executing containers on
+      # worker VMs. For an RSA key, this begins with "-----BEGIN RSA PRIVATE KEY-----\n"
+      # and ends with "\n-----END RSA PRIVATE KEY-----\n".
+      DispatchPrivateKey: none
+
+      # Maximum time to wait for workers to come up before abandoning
+      # stale locks from a previous dispatch process.
+      StaleLockTimeout: 1m
+
       Logging:
         # When you run the db:delete_old_container_logs task, it will find
         # containers that have been finished for at least this many seconds,
@@ -445,6 +474,111 @@ Clusters:
         # original job reuse behavior, and is still the default).
         ReuseJobIfOutputsDiffer: false
 
+      CloudVMs:
+        # Enable the cloud scheduler (experimental).
+        Enable: false
+
+        # Name/number of port where workers' SSH services listen.
+        SSHPort: "22"
+
+        # Interval between queue polls.
+        PollInterval: 10s
+
+        # Shell command to execute on each worker to determine whether
+        # the worker is booted and ready to run containers. It should
+        # exit zero if the worker is ready.
+        BootProbeCommand: "docker ps"
+
+        # Minimum interval between consecutive probes to a single
+        # worker.
+        ProbeInterval: 10s
+
+        # Maximum probes per second, across all workers in a pool.
+        MaxProbesPerSecond: 10
+
+        # Time before repeating SIGTERM when killing a container.
+        TimeoutSignal: 5s
+
+        # Time to give up on SIGTERM and write off the worker.
+        TimeoutTERM: 2m
+
+        # Maximum create/destroy-instance operations per second (0 =
+        # unlimited).
+        MaxCloudOpsPerSecond: 0
+
+        # Interval between cloud provider syncs/updates ("list all
+        # instances").
+        SyncInterval: 1m
+
+        # Time to leave an idle worker running (in case new containers
+        # appear in the queue that it can run) before shutting it
+        # down.
+        TimeoutIdle: 1m
+
+        # Time to wait for a new worker to boot (i.e., pass
+        # BootProbeCommand) before giving up and shutting it down.
+        TimeoutBooting: 10m
+
+        # Maximum time a worker can stay alive with no successful
+        # probes before being automatically shut down.
+        TimeoutProbe: 10m
+
+        # Time after shutting down a worker to retry the
+        # shutdown/destroy operation.
+        TimeoutShutdown: 10s
+
+        # Worker VM image ID.
+        ImageID: ami-01234567890abcdef
+
+        # Cloud driver: "azure" (Microsoft Azure) or "ec2" (Amazon AWS).
+        Driver: ec2
+
+        # Cloud-specific driver parameters.
+        DriverParameters:
+
+          # (ec2) Credentials.
+          AccessKeyID: ""
+          SecretAccessKey: ""
+
+          # (ec2) Instance configuration.
+          SecurityGroupIDs:
+            - ""
+          SubnetID: ""
+          Region: ""
+          EBSVolumeType: gp2
+          AdminUsername: debian # NOTE(review): duplicate of the azure AdminUsername below — duplicate mapping keys are invalid YAML; most parsers let the later value win
+
+          # (azure) Credentials.
+          SubscriptionID: ""
+          ClientID: ""
+          ClientSecret: ""
+          TenantID: ""
+
+          # (azure) Instance configuration.
+          CloudEnvironment: AzurePublicCloud
+          ResourceGroup: ""
+          Location: centralus
+          Network: ""
+          Subnet: ""
+          StorageAccount: ""
+          BlobContainer: ""
+          DeleteDanglingResourcesAfter: 20s
+          AdminUsername: arvados
+
+    InstanceTypes:
+
+      # Use the instance type name as the key (in place of "SAMPLE" in
+      # this sample entry).
+      SAMPLE:
+        # Cloud provider's instance type. Defaults to the configured type name.
+        ProviderType: ""
+        VCPUs: 1
+        RAM: 128MiB
+        IncludedScratch: 16GB
+        AddedScratch: 0
+        Price: 0.1
+        Preemptible: false
+
     Mail:
       MailchimpAPIKey: ""
       MailchimpListID: ""
@@ -455,7 +589,10 @@ Clusters:
       EmailFrom: ""
     RemoteClusters:
       "*":
+        Host: ""
         Proxy: false
+        Scheme: https
+        Insecure: false
         ActivateUsers: false
       SAMPLE:
         Host: sample.arvadosapi.com
index e26f5f41a7dc8a5975fc7466dfed2b28aaf9529b..4f89166ac3991d3a12dfa45268b5fd06d7ba50c5 100644 (file)
@@ -27,11 +27,7 @@ Clusters:
     Services:
       RailsAPI:
         InternalURLs: {}
-      GitHTTP:
-        InternalURLs: {}
-        ExternalURL: ""
-      Keepstore:
-        InternalURLs: {}
+        ExternalURL: "-"
       Controller:
         InternalURLs: {}
         ExternalURL: ""
@@ -40,6 +36,7 @@ Clusters:
         ExternalURL: ""
       Keepbalance:
         InternalURLs: {}
+        ExternalURL: "-"
       GitHTTP:
         InternalURLs: {}
         ExternalURL: ""
@@ -47,6 +44,7 @@ Clusters:
         ExternalURL: ""
       DispatchCloud:
         InternalURLs: {}
+        ExternalURL: "-"
       SSO:
         ExternalURL: ""
       Keepproxy:
@@ -60,6 +58,7 @@ Clusters:
         ExternalURL: ""
       Keepstore:
         InternalURLs: {}
+        ExternalURL: "-"
       Composer:
         ExternalURL: ""
       WebShell:
@@ -69,6 +68,13 @@ Clusters:
         ExternalURL: ""
       Workbench2:
         ExternalURL: ""
+      Nodemanager:
+        InternalURLs: {}
+        ExternalURL: "-"
+      Health:
+        InternalURLs: {}
+        ExternalURL: "-"
+
     PostgreSQL:
       # max concurrent connections per arvados server daemon
       ConnectionPool: 32
@@ -124,6 +130,9 @@ Clusters:
       # site secret. It should be at least 50 characters.
       RailsSessionSecretToken: ""
 
+      # Maximum wall clock time to spend handling an incoming request.
+      RequestTimeout: 5m
+
     Users:
       # Config parameters to automatically setup new users.  If enabled,
       # this users will be able to self-activate.  Enable this if you want
@@ -191,6 +200,14 @@ Clusters:
       UnloggedAttributes: []
 
     SystemLogs:
+
+      # Logging threshold: panic, fatal, error, warn, info, debug, or
+      # trace
+      LogLevel: info
+
+      # Logging format: json or text
+      Format: json
+
       # Maximum characters of (JSON-encoded) query parameters to include
       # in each request log entry. When params exceed this size, they will
       # be JSON-encoded, truncated to this size, and logged as
@@ -277,6 +294,8 @@ Clusters:
       Repositories: /var/lib/arvados/git/repositories
 
     TLS:
+      Certificate: ""
+      Key: ""
       Insecure: false
 
     Containers:
@@ -329,6 +348,16 @@ Clusters:
       # troubleshooting purposes.
       LogReuseDecisions: false
 
+      # PEM encoded SSH key (RSA, DSA, or ECDSA) used by the
+      # (experimental) cloud dispatcher for executing containers on
+      # worker VMs. For an RSA key, this begins with "-----BEGIN RSA PRIVATE KEY-----\n"
+      # and ends with "\n-----END RSA PRIVATE KEY-----\n".
+      DispatchPrivateKey: none
+
+      # Maximum time to wait for workers to come up before abandoning
+      # stale locks from a previous dispatch process.
+      StaleLockTimeout: 1m
+
       Logging:
         # When you run the db:delete_old_container_logs task, it will find
         # containers that have been finished for at least this many seconds,
@@ -451,6 +480,111 @@ Clusters:
         # original job reuse behavior, and is still the default).
         ReuseJobIfOutputsDiffer: false
 
+      CloudVMs:
+        # Enable the cloud scheduler (experimental).
+        Enable: false
+
+        # Name/number of port where workers' SSH services listen.
+        SSHPort: "22"
+
+        # Interval between queue polls.
+        PollInterval: 10s
+
+        # Shell command to execute on each worker to determine whether
+        # the worker is booted and ready to run containers. It should
+        # exit zero if the worker is ready.
+        BootProbeCommand: "docker ps"
+
+        # Minimum interval between consecutive probes to a single
+        # worker.
+        ProbeInterval: 10s
+
+        # Maximum probes per second, across all workers in a pool.
+        MaxProbesPerSecond: 10
+
+        # Time before repeating SIGTERM when killing a container.
+        TimeoutSignal: 5s
+
+        # Time to give up on SIGTERM and write off the worker.
+        TimeoutTERM: 2m
+
+        # Maximum create/destroy-instance operations per second (0 =
+        # unlimited).
+        MaxCloudOpsPerSecond: 0
+
+        # Interval between cloud provider syncs/updates ("list all
+        # instances").
+        SyncInterval: 1m
+
+        # Time to leave an idle worker running (in case new containers
+        # appear in the queue that it can run) before shutting it
+        # down.
+        TimeoutIdle: 1m
+
+        # Time to wait for a new worker to boot (i.e., pass
+        # BootProbeCommand) before giving up and shutting it down.
+        TimeoutBooting: 10m
+
+        # Maximum time a worker can stay alive with no successful
+        # probes before being automatically shut down.
+        TimeoutProbe: 10m
+
+        # Time after shutting down a worker to retry the
+        # shutdown/destroy operation.
+        TimeoutShutdown: 10s
+
+        # Worker VM image ID.
+        ImageID: ami-01234567890abcdef
+
+        # Cloud driver: "azure" (Microsoft Azure) or "ec2" (Amazon AWS).
+        Driver: ec2
+
+        # Cloud-specific driver parameters.
+        DriverParameters:
+
+          # (ec2) Credentials.
+          AccessKeyID: ""
+          SecretAccessKey: ""
+
+          # (ec2) Instance configuration.
+          SecurityGroupIDs:
+            - ""
+          SubnetID: ""
+          Region: ""
+          EBSVolumeType: gp2
+          AdminUsername: debian # NOTE(review): duplicate of the azure AdminUsername below — duplicate mapping keys are invalid YAML; most parsers let the later value win
+
+          # (azure) Credentials.
+          SubscriptionID: ""
+          ClientID: ""
+          ClientSecret: ""
+          TenantID: ""
+
+          # (azure) Instance configuration.
+          CloudEnvironment: AzurePublicCloud
+          ResourceGroup: ""
+          Location: centralus
+          Network: ""
+          Subnet: ""
+          StorageAccount: ""
+          BlobContainer: ""
+          DeleteDanglingResourcesAfter: 20s
+          AdminUsername: arvados
+
+    InstanceTypes:
+
+      # Use the instance type name as the key (in place of "SAMPLE" in
+      # this sample entry).
+      SAMPLE:
+        # Cloud provider's instance type. Defaults to the configured type name.
+        ProviderType: ""
+        VCPUs: 1
+        RAM: 128MiB
+        IncludedScratch: 16GB
+        AddedScratch: 0
+        Price: 0.1
+        Preemptible: false
+
     Mail:
       MailchimpAPIKey: ""
       MailchimpListID: ""
@@ -461,7 +595,10 @@ Clusters:
       EmailFrom: ""
     RemoteClusters:
       "*":
+        Host: ""
         Proxy: false
+        Scheme: https
+        Insecure: false
         ActivateUsers: false
       SAMPLE:
         Host: sample.arvadosapi.com
index bbcc45a3f3714e3de346a82125e16bc0fb5054ac..ed1dd1bdfd82f941921fa291c6b95067fb9a24c2 100644 (file)
@@ -97,6 +97,20 @@ Clusters:
        c.Check(logs, check.HasLen, 2)
 }
 
+func (s *LoadSuite) TestNoWarningsForDumpedConfig(c *check.C) {
+       var logbuf bytes.Buffer
+       logger := logrus.New()
+       logger.Out = &logbuf
+       cfg, err := Load(bytes.NewBufferString(`{"Clusters":{"zzzzz":{}}}`), logger)
+       c.Assert(err, check.IsNil)
+       yaml, err := yaml.Marshal(cfg)
+       c.Assert(err, check.IsNil)
+       cfgDumped, err := Load(bytes.NewBuffer(yaml), logger)
+       c.Assert(err, check.IsNil)
+       c.Check(cfg, check.DeepEquals, cfgDumped)
+       c.Check(logbuf.String(), check.Equals, "")
+}
+
 func (s *LoadSuite) TestPostgreSQLKeyConflict(c *check.C) {
        _, err := Load(bytes.NewBufferString(`
 Clusters:
index 775d2903475d6ad83eb368b77191cf479065cb57..35734d780c9bf1819c9f7b0875d0903858f95125 100644 (file)
@@ -50,8 +50,8 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
                        req.URL.Path = strings.Replace(req.URL.Path, "//", "/", -1)
                }
        }
-       if h.Cluster.HTTPRequestTimeout > 0 {
-               ctx, cancel := context.WithDeadline(req.Context(), time.Now().Add(time.Duration(h.Cluster.HTTPRequestTimeout)))
+       if h.Cluster.API.RequestTimeout > 0 {
+               ctx, cancel := context.WithDeadline(req.Context(), time.Now().Add(time.Duration(h.Cluster.API.RequestTimeout)))
                req = req.WithContext(ctx)
                defer cancel()
        }
index 96110ea85859b05b362f849475a9d77c91919752..01544a2b0b392af4b231d9d03a9efe75a69fb27e 100644 (file)
@@ -72,7 +72,7 @@ func (s *HandlerSuite) TestProxyDiscoveryDoc(c *check.C) {
 }
 
 func (s *HandlerSuite) TestRequestTimeout(c *check.C) {
-       s.cluster.HTTPRequestTimeout = arvados.Duration(time.Nanosecond)
+       s.cluster.API.RequestTimeout = arvados.Duration(time.Nanosecond)
        req := httptest.NewRequest("GET", "/discovery/v1/apis/arvados/v1/rest", nil)
        resp := httptest.NewRecorder()
        s.handler.ServeHTTP(resp, req)
index 4b7341d7294d44a94f6422534dfc8780eab0c7db..024459ca0606747bf9298a0dc5ef18a16d9a9d12 100644 (file)
@@ -78,7 +78,7 @@ func (c *command) RunCommand(prog string, args []string, stdin io.Reader, stdout
        if err != nil {
                return 1
        }
-       log = ctxlog.New(stderr, cluster.Logging.Format, cluster.Logging.Level).WithFields(logrus.Fields{
+       log = ctxlog.New(stderr, cluster.SystemLogs.Format, cluster.SystemLogs.LogLevel).WithFields(logrus.Fields{
                "PID": os.Getpid(),
        })
        ctx := ctxlog.Context(c.ctx, log)
index 6b3150c6f0e15d5711f9d5d30fdfe62042f20739..b25164c3d159c310284efb1b0889cdc2db3c7445 100644 (file)
@@ -54,23 +54,22 @@ func (sc *Config) GetCluster(clusterID string) (*Cluster, error) {
 type API struct {
        MaxItemsPerResponse     int
        MaxRequestAmplification int
+       RequestTimeout          Duration
 }
 
 type Cluster struct {
-       ClusterID          string `json:"-"`
-       ManagementToken    string
-       SystemRootToken    string
-       Services           Services
-       NodeProfiles       map[string]NodeProfile
-       InstanceTypes      InstanceTypeMap
-       CloudVMs           CloudVMs
-       Dispatch           Dispatch
-       HTTPRequestTimeout Duration
-       RemoteClusters     map[string]RemoteCluster
-       PostgreSQL         PostgreSQL
-       API                API
-       Logging            Logging
-       TLS                TLS
+       ClusterID       string `json:"-"`
+       ManagementToken string
+       SystemRootToken string
+       Services        Services
+       NodeProfiles    map[string]NodeProfile
+       InstanceTypes   InstanceTypeMap
+       Containers      ContainersConfig
+       RemoteClusters  map[string]RemoteCluster
+       PostgreSQL      PostgreSQL
+       API             API
+       SystemLogs      SystemLogs
+       TLS             TLS
 }
 
 type Services struct {
@@ -89,7 +88,7 @@ type Services struct {
 }
 
 type Service struct {
-       InternalURLs map[URL]ServiceInstance
+       InternalURLs map[URL]ServiceInstance `json:",omitempty"`
        ExternalURL  URL
 }
 
@@ -112,9 +111,10 @@ func (su URL) MarshalText() ([]byte, error) {
 
 type ServiceInstance struct{}
 
-type Logging struct {
-       Level  string
-       Format string
+type SystemLogs struct {
+       LogLevel                string
+       Format                  string
+       MaxRequestLogParamsSize int
 }
 
 type PostgreSQL struct {
@@ -148,59 +148,29 @@ type InstanceType struct {
        Preemptible     bool
 }
 
-type Dispatch struct {
-       // PEM encoded SSH key (RSA, DSA, or ECDSA) able to log in to
-       // cloud VMs.
-       PrivateKey string
-
-       // Max time for workers to come up before abandoning stale
-       // locks from previous run
-       StaleLockTimeout Duration
-
-       // Interval between queue polls
-       PollInterval Duration
-
-       // Interval between probes to each worker
-       ProbeInterval Duration
-
-       // Maximum total worker probes per second
-       MaxProbesPerSecond int
-
-       // Time before repeating SIGTERM when killing a container
-       TimeoutSignal Duration
-
-       // Time to give up on SIGTERM and write off the worker
-       TimeoutTERM Duration
+type ContainersConfig struct {
+       CloudVMs           CloudVMsConfig
+       DispatchPrivateKey string
+       StaleLockTimeout   Duration
 }
 
-type CloudVMs struct {
-       // Shell command that exits zero IFF the VM is fully booted
-       // and ready to run containers, e.g., "mount | grep
-       // /encrypted-tmp"
-       BootProbeCommand string
-
-       // Listening port (name or number) of SSH servers on worker
-       // VMs
-       SSHPort string
+type CloudVMsConfig struct {
+       Enable bool
 
-       SyncInterval Duration
-
-       // Maximum idle time before automatic shutdown
-       TimeoutIdle Duration
-
-       // Maximum booting time before automatic shutdown
-       TimeoutBooting Duration
-
-       // Maximum time with no successful probes before automatic shutdown
-       TimeoutProbe Duration
-
-       // Time after shutdown to retry shutdown
-       TimeoutShutdown Duration
-
-       // Maximum create/destroy-instance operations per second
+       BootProbeCommand     string
+       ImageID              string
        MaxCloudOpsPerSecond int
-
-       ImageID string
+       MaxProbesPerSecond   int
+       PollInterval         Duration
+       ProbeInterval        Duration
+       SSHPort              string
+       SyncInterval         Duration
+       TimeoutBooting       Duration
+       TimeoutIdle          Duration
+       TimeoutProbe         Duration
+       TimeoutShutdown      Duration
+       TimeoutSignal        Duration
+       TimeoutTERM          Duration
 
        Driver           string
        DriverParameters json.RawMessage
index 25eed010f26c534ef8e36dfa119065731d1e2ac4..d3e11c7a5e673aebe84cbb98a807fa0ef1806b64 100644 (file)
@@ -23,7 +23,7 @@ func (d *Duration) UnmarshalJSON(data []byte) error {
 }
 
 // MarshalJSON implements json.Marshaler.
-func (d *Duration) MarshalJSON() ([]byte, error) {
+func (d Duration) MarshalJSON() ([]byte, error) {
        return json.Marshal(d.String())
 }
 
index 79767c2fa5b007f267ad99a72c5445a909862d05..c2ad7c892811a5586b523df056b0436609611299 100644 (file)
@@ -414,7 +414,8 @@ def run_controller():
 Clusters:
   zzzzz:
     ManagementToken: e687950a23c3a9bceec28c6223a06c79
-    HTTPRequestTimeout: 30s
+    API:
+      RequestTimeout: 30s
     PostgreSQL:
       ConnectionPool: 32
       Connection: