Merge branch '20602-controller-qos'

[arvados.git] / lib / config / config.default.yml
diff --git a/lib/config/config.default.yml b/lib/config/config.default.yml

index 71d180b0e7ad09bb0c399fae3d2c8ac1d0b37a99..33c1e497de3fc8bbdb50b4d8cc7386eda1fa3937 100644 (file)
--- a/lib/config/config.default.yml
+++ b/lib/config/config.default.yml
@@ -223,9 +223,23 @@ Clusters:
        # parameter higher than this value, this value is used instead.
        MaxItemsPerResponse: 1000
  
-      # Maximum number of concurrent requests to accept in a single
-      # service process, or 0 for no limit.
-      MaxConcurrentRequests: 0
+      # Maximum number of requests to process concurrently in a single
+      # service process, or 0 for no limit.
+      MaxConcurrentRequests: 64
+
+      # Maximum number of incoming requests to hold in a priority
+      # queue waiting for one of the MaxConcurrentRequests slots to be
+      # free. When the queue is longer than this, respond 503 to the
+      # lowest priority request.
+      #
+      # If MaxQueuedRequests is 0, respond 503 immediately to
+      # additional requests while at the MaxConcurrentRequests limit.
+      MaxQueuedRequests: 64
+
+      # Fraction of MaxConcurrentRequests that can be "log create"
+      # messages at any given time.  This is to prevent logging
+      # updates from crowding out more important requests.
+      LogCreateRequestFraction: 0.50
  
        # Maximum number of 64MiB memory buffers per Keepstore server process, or
        # 0 for no limit. When this limit is reached, up to
@@ -288,6 +302,9 @@ Clusters:
        # any user with "manage" permission can un-freeze.
        UnfreezeProjectRequiresAdmin: false
  
+      # (Experimental) Use row-level locking on update API calls.
+      LockBeforeUpdate: false
+
      Users:
        # Config parameters to automatically setup new users.  If enabled,
        # this users will be able to self-activate.  Enable this if you want
@@ -434,6 +451,15 @@ Clusters:
        # params_truncated.
        MaxRequestLogParamsSize: 2000
  
+      # In all services except RailsAPI, periodically check whether
+      # the incoming HTTP request queue is nearly full (see
+      # MaxConcurrentRequests) and, if so, write a snapshot of the
+      # request queue to {service}-requests.json in the specified
+      # directory.
+      #
+      # Leave blank to disable.
+      RequestQueueDumpDirectory: ""
+
      Collections:
  
        # Enable access controls for data stored in Keep. This should
@@ -526,7 +552,7 @@ Clusters:
        #
        # If SIGUSR1 is received during an idle period between operations,
        # the next operation will start immediately.
-      BalancePeriod: 10m
+      BalancePeriod: 6h
  
        # Limits the number of collections retrieved by keep-balance per
        # API transaction. If this is zero, page size is
@@ -535,11 +561,12 @@ Clusters:
        BalanceCollectionBatch: 0
  
        # The size of keep-balance's internal queue of
-      # collections. Higher values use more memory and improve throughput
-      # by allowing keep-balance to fetch the next page of collections
-      # while the current page is still being processed. If this is zero
-      # or omitted, pages are processed serially.
-      BalanceCollectionBuffers: 1000
+      # collections. Higher values may improve throughput by allowing
+      # keep-balance to fetch collections from the database while the
+      # current collections are still being processed, at the expense of
+      # using more memory.  If this is zero or omitted, pages are
+      # processed serially.
+      BalanceCollectionBuffers: 4
  
        # Maximum time for a rebalancing run. This ensures keep-balance
        # eventually gives up and retries if, for example, a network
@@ -803,6 +830,16 @@ Clusters:
          # Skip TLS certificate name verification.
          InsecureTLS: false
  
+        # Minimum TLS version to negotiate when connecting to server
+        # (ldaps://... or StartTLS). It may be necessary to set this
+        # to "1.1" for compatibility with older LDAP servers that fail
+        # with 'LDAP Result Code 200 "Network Error": TLS handshake
+        # failed (tls: server selected unsupported protocol version
+        # 301)'.
+        #
+        # If blank, use the recommended minimum version (1.2).
+        MinTLSVersion: ""
+
          # Strip the @domain part if a user supplies an email-style
          # username with this domain. If "*", strip any user-provided
          # domain. If "", never strip the domain part. Example:
@@ -891,6 +928,9 @@ Clusters:
        # probably want to include the other Workbench instances in the
        # federation in this list.
        #
+      # A wildcard like "https://*.example" will match client URLs
+      # like "https://a.example" and "https://a.b.c.example".
+      #
        # Example:
        #
        # TrustedClients:
@@ -973,14 +1013,19 @@ Clusters:
  
        # Default value for keep_cache_ram of a container's
        # runtime_constraints.  Note: this gets added to the RAM request
-      # used to allocate a VM or submit an HPC job
+      # used to allocate a VM or submit an HPC job.
+      #
+      # If this is zero, container requests that don't specify RAM or
+      # disk cache size will use a disk cache, sized to the
+      # container's RAM requirement (but with minimum 2 GiB and
+      # maximum 32 GiB).
+      #
+      # Note: If you change this value, containers that used the previous
+      # default value will only be reused by container requests that
+      # explicitly specify the previous value in their keep_cache_ram
+      # runtime constraint.
        DefaultKeepCacheRAM: 0
  
-      # Default value for keep_cache_disk of a container's
-      # runtime_constraints.  Note: this gets added to the disk
-      # request used to allocate a VM or submit an HPC job
-      DefaultKeepCacheDisk: 8589934592
-
        # Number of times a container can be unlocked before being
        # automatically cancelled.
        MaxDispatchAttempts: 5
@@ -992,13 +1037,6 @@ Clusters:
        # with the cancelled container.
        MaxRetryAttempts: 3
  
-      # The maximum number of compute nodes that can be in use simultaneously
-      # If this limit is reduced, any existing nodes with slot number >= new limit
-      # will not be counted against the new limit. In other words, the new limit
-      # won't be strictly enforced until those nodes with higher slot numbers
-      # go down.
-      MaxComputeVMs: 64
-
        # Schedule all child containers on preemptible instances (e.g. AWS
        # Spot Instances) even if not requested by the submitter.
        #
@@ -1022,6 +1060,10 @@ Clusters:
        # cloud dispatcher for executing containers on worker VMs.
        # Begins with "-----BEGIN RSA PRIVATE KEY-----\n"
        # and ends with "\n-----END RSA PRIVATE KEY-----\n".
+      #
+      # Use "file:///absolute/path/to/key" to load the key from a
+      # separate file instead of embedding it in the configuration
+      # file.
        DispatchPrivateKey: ""
  
        # Maximum time to wait for workers to come up before abandoning
@@ -1126,6 +1168,8 @@ Clusters:
  
          # Maximum bytes that may be logged by a single job.  Log bytes that are
          # silenced by throttling are not counted against this total.
+        # If you set this to zero, each container will only create a single
+        # log on the API server, noting for users that logging is throttled.
          LimitLogBytesPerJob: 67108864
  
          LogPartialLineThrottlePeriod: 5s
@@ -1314,6 +1358,30 @@ Clusters:
          # providers too, if desired.
          MaxConcurrentInstanceCreateOps: 1
  
+        # The maximum number of instances to run at a time, or 0 for
+        # unlimited.
+        #
+        # If more instances than this are already running and busy
+        # when the dispatcher starts up, the running containers will
+        # be allowed to finish before the excess instances are shut
+        # down.
+        MaxInstances: 64
+
+        # Maximum fraction of CloudVMs.MaxInstances allowed to run
+        # "supervisor" containers at any given time. A supervisor is a
+        # container whose purpose is mainly to submit and manage other
+        # containers, such as the arvados-cwl-runner workflow runner.
+        #
+        # If there is a hard limit on the number of concurrent
+        # containers that the cluster can run, it is important to
+        # avoid crowding out the containers doing useful work with
+        # containers that just create more work.
+        #
+        # For example, with the default MaxInstances of 64, it will
+        # schedule at most floor(64*0.30) = 19 concurrent workflows,
+        # ensuring 45 slots are available for work.
+        SupervisorFraction: 0.30
+
          # Interval between cloud provider syncs/updates ("list all
          # instances").
          SyncInterval: 1m
@@ -1344,6 +1412,12 @@ Clusters:
          # https://xxxxx.blob.core.windows.net/system/Microsoft.Compute/Images/images/xxxxx.vhd
          ImageID: ""
  
+        # Shell script to run on new instances using the cloud
+        # provider's UserData (EC2) or CustomData (Azure) feature.
+        #
+        # It is not necessary to include a #!/bin/sh line.
+        InstanceInitCommand: ""
+
          # An executable file (located on the dispatcher host) to be
          # copied to cloud instances at runtime and used as the
          # container runner/supervisor. The default value is the
@@ -1354,6 +1428,12 @@ Clusters:
          # version of crunch-run installed; see CrunchRunCommand above.
          DeployRunnerBinary: "/proc/self/exe"
  
+        # Install the Dispatcher's SSH public key (derived from
+        # DispatchPrivateKey) when creating new cloud
+        # instances. Change this to false if you are using a different
+        # mechanism to pre-install the public key on new instances.
+        DeployPublicKey: true
+
          # Tags to add on all resources (VMs, NICs, disks) created by
          # the container dispatcher. (Arvados's own tags --
          # InstanceType, IdleBehavior, and InstanceSecret -- will also
@@ -1395,6 +1475,20 @@ Clusters:
            # the cloud dispatcher. Leave blank when not needed.
            IAMInstanceProfile: ""
  
+          # (ec2) how often to look up spot instance pricing data
+          # (only while running spot instances) for the purpose of
+          # calculating container cost estimates. A value of 0
+          # disables spot price lookups entirely.
+          SpotPriceUpdateInterval: 24h
+
+          # (ec2) per-GiB-month cost of EBS volumes. Matches
+          # EBSVolumeType. Used to account for AddedScratch when
+          # calculating container cost estimates. Note that
+          # https://aws.amazon.com/ebs/pricing/ defines GB to mean
+          # GiB, so an advertised price $0.10/GB indicates a real
+          # price of $0.10/GiB and can be entered here as 0.10.
+          EBSPrice: 0.10
+
            # (azure) Credentials.
            SubscriptionID: ""
            ClientID: ""
@@ -1448,6 +1542,13 @@ Clusters:
          RAM: 128MiB
          IncludedScratch: 16GB
          AddedScratch: 0
+        # Hourly price ($), used to select node types for containers,
+        # and to calculate estimated container costs. For spot
+        # instances on EC2, this is also used as the maximum price
+        # when launching spot instances, while the estimated container
+        # cost is computed based on the current spot price according
+        # to AWS. On Azure, and for on-demand instances on EC2, the
+        # price given here is used to compute container cost estimates.
          Price: 0.1
          Preemptible: false
          # Include this section if the node type includes GPU (CUDA) support
@@ -1523,8 +1624,6 @@ Clusters:
            ReadTimeout: 10m
            RaceWindow: 24h
            PrefixLength: 0
-          # Use aws-s3-go (v2) instead of goamz
-          UseAWSS3v2Driver: true
  
            # For S3 driver, potentially unsafe tuning parameter,
            # intentionally excluded from main documentation.
@@ -1745,9 +1844,11 @@ Clusters:
        # This feature is disabled when set to zero.
        IdleTimeout: 0s
  
-      # URL to a file that is a fragment of text or HTML which should
-      # be rendered in Workbench as a banner.
-      BannerURL: ""
+      # UUID of a collection.  This collection should be shared with
+      # all users.  Workbench will look for a file "banner.html" in
+      # this collection and display its contents (should be
+      # HTML-formatted text) when users first log in to Workbench.
+      BannerUUID: ""
  
        # Workbench welcome screen, this is HTML text that will be
        # incorporated directly onto the page.