Merge branch '20754-docker-py-upgrade'
authorBrett Smith <brett.smith@curii.com>
Mon, 7 Aug 2023 21:18:56 +0000 (17:18 -0400)
committerBrett Smith <brett.smith@curii.com>
Mon, 7 Aug 2023 21:18:56 +0000 (17:18 -0400)
Closes #20754.

Arvados-DCO-1.1-Signed-off-by: Brett Smith <brett.smith@curii.com>

24 files changed:
doc/_includes/_download_installer.liquid
doc/_includes/_multi_host_install_custom_certificates.liquid
doc/install/salt-multi-host.html.textile.liquid
lib/config/config.default.yml
lib/dispatchcloud/dispatcher_test.go
lib/dispatchcloud/test/stub_driver.go
tools/crunchstat-summary/crunchstat_summary/__init__.py
tools/crunchstat-summary/crunchstat_summary/summarizer.py
tools/crunchstat-summary/tests/test_examples.py
tools/salt-install/common.sh [new file with mode: 0644]
tools/salt-install/config_examples/multi_host/aws/pillars/arvados.sls
tools/salt-install/config_examples/multi_host/aws/pillars/letsencrypt_balancer_configuration.sls [new file with mode: 0644]
tools/salt-install/config_examples/multi_host/aws/pillars/nginx_balancer_configuration.sls [new file with mode: 0644]
tools/salt-install/config_examples/multi_host/aws/pillars/nginx_controller_configuration.sls
tools/salt-install/config_examples/multi_host/aws/pillars/nginx_passenger.sls
tools/salt-install/config_examples/multi_host/aws/pillars/postgresql.sls
tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_server.sls
tools/salt-install/installer.sh
tools/salt-install/local.params.example.multiple_hosts
tools/salt-install/local.params.secrets.example
tools/salt-install/provision.sh
tools/salt-install/terraform/aws/services/locals.tf
tools/salt-install/terraform/aws/vpc/terraform.tfvars
tools/salt-install/terraform/aws/vpc/variables.tf

index 724ed1416ee8f81d62d3c03a8bc908f7f3fc4b3b..461debd4928a949a8cc014a8e9f7fa297a738f4b 100644 (file)
@@ -9,7 +9,7 @@ SPDX-License-Identifier: CC-BY-SA-3.0
 This is a package-based installation method; however, the installation script is currently distributed in source form via @git@. We recommend checking out the git tree on your local workstation, not directly on the target(s) where you want to install and run Arvados.
 
 <notextile>
-<pre><code>git clone https://github.com/arvados/arvados.git
+<pre><code class="userinput">git clone https://github.com/arvados/arvados.git
 cd arvados
 git checkout {{ branchname }}
 cd tools/salt-install
@@ -31,7 +31,7 @@ h3. Using Terraform (AWS specific)
 If you are going to use Terraform to set up the infrastructure on AWS, you first need to install the "Terraform CLI":https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli and the "AWS CLI":https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html tool.  Then you can initialize the installer.
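
As a quick, optional sanity check before initializing, you can confirm both tools are installed and on your @PATH@ (exact version output will vary):

<pre><code class="userinput">terraform version
aws --version
</code></pre>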
 
 <notextile>
-<pre><code>CLUSTER=xarv1
+<pre><code class="userinput">CLUSTER=xarv1
 ./installer.sh initialize ~/setup-arvados-${CLUSTER} {{local_params_src}} {{config_examples_src}} {{terraform_src}}
 cd ~/setup-arvados-${CLUSTER}
 </code></pre>
@@ -40,7 +40,7 @@ cd ~/setup-arvados-${CLUSTER}
 h3. Without Terraform
 
 <notextile>
-<pre><code>CLUSTER=xarv1
+<pre><code class="userinput">CLUSTER=xarv1
 ./installer.sh initialize ~/setup-arvados-${CLUSTER} {{local_params_src}} {{config_examples_src}}
 cd ~/setup-arvados-${CLUSTER}
 </code></pre>
index 1a51f2991933f6b0b641afa58bef0eecd4b09d1d..eac40218cc357db1d4ed90376f7580b3a75ce7b9 100644 (file)
@@ -42,7 +42,7 @@ Make sure that all the FQDNs that you will use for the public-facing application
 Note: because the installer currently looks for a different certificate file for each service, if you use a single certificate, we recommend creating a symlink for each certificate and key file to the primary certificate and key, e.g.
 
 <notextile>
-<pre><code>ln -s xarv1.crt ${CUSTOM_CERTS_DIR}/controller.crt
+<pre><code class="userinput">ln -s xarv1.crt ${CUSTOM_CERTS_DIR}/controller.crt
 ln -s xarv1.key ${CUSTOM_CERTS_DIR}/controller.key
 ln -s xarv1.crt ${CUSTOM_CERTS_DIR}/keepproxy.crt
 ln -s xarv1.key ${CUSTOM_CERTS_DIR}/keepproxy.key
index 27e7321644fa63fad4a463a11f5675c305b13c66..cad06754496b7621da8bdec6fc4b5af59db72d2c 100644 (file)
@@ -30,6 +30,8 @@ SPDX-License-Identifier: CC-BY-SA-3.0
 ## "Common problems and solutions":#common-problems
 # "Initial user and login":#initial_user
 # "Monitoring and Metrics":#monitoring
+# "Load balancing controllers":#load_balancing
+## "Rolling upgrades procedure":#rolling-upgrades
 # "After the installation":#post_install
 
 h2(#introduction). Introduction
@@ -114,14 +116,14 @@ h4. Set credentials
 
 You will need an AWS access key and secret key to create the infrastructure.
 
-<pre><code>$ export AWS_ACCESS_KEY_ID="anaccesskey"
-export AWS_SECRET_ACCESS_KEY="asecretkey"</code></pre>
+<pre><code class="userinput">export AWS_ACCESS_KEY_ID="anaccesskey"
+export AWS_SECRET_ACCESS_KEY="asecretkey"</code></pre>
 
 h4. Create the infrastructure
 
 Build the infrastructure by running @./installer.sh terraform@.  The last stage will output the information needed to set up the cluster's domain and continue with the installer. For example:
 
-<pre><code>$ ./installer.sh terraform
+<pre><code class="userinput">./installer.sh terraform
 ...
 Apply complete! Resources: 16 added, 0 changed, 0 destroyed.
 
@@ -190,7 +192,7 @@ The certificates will be requested from Let's Encrypt when you run the installer
 
 * @cluster_int_cidr@ will be used to set @CLUSTER_INT_CIDR@
 
-* You'll also need @compute_subnet_id@ and @arvados_sg_id@ to set @DriverParameters.SubnetID@ and @DriverParameters.SecurityGroupIDs@ in @local_config_dir/pillars/arvados.sls@ and when you "create a compute image":#create_a_compute_image.
+* You'll also need @compute_subnet_id@ and @arvados_sg_id@ to set @COMPUTE_SUBNET@ and @COMPUTE_SG@ in @local.params@ and when you "create a compute image":#create_a_compute_image.
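
If you need to recall these values later, they can be read back from the Terraform state. A minimal sketch, assuming you run it from the @terraform@ subdirectory that defines those outputs:

<pre><code class="userinput">terraform output compute_subnet_id
terraform output arvados_sg_id
</code></pre>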
 
 You can now proceed to "edit local.params* files":#localparams.
 
@@ -228,18 +230,17 @@ The installer will set up the Arvados services on your machines.  Here is the de
 ## postgresql server
 ## arvados api server
 ## arvados controller  (recommended hostname @controller.${DOMAIN}@)
-## arvados websocket   (recommendend hostname @ws.${DOMAIN}@)
-## arvados cloud dispatcher
-## arvados keepbalance
 # KEEPSTORE nodes (at least 1 if using S3 as a Keep backend, else 2)
 ## arvados keepstore   (recommended hostnames @keep0.${DOMAIN}@ and @keep1.${DOMAIN}@)
-# KEEPPROXY node
-## arvados keepproxy   (recommendend hostname @keep.${DOMAIN}@)
-## arvados keepweb     (recommendend hostname @download.${DOMAIN}@ and @*.collections.${DOMAIN}@)
 # WORKBENCH node
 ## arvados workbench   (recommended hostname @workbench.${DOMAIN}@)
 ## arvados workbench2  (recommended hostname @workbench2.${DOMAIN}@)
 ## arvados webshell    (recommended hostname @webshell.${DOMAIN}@)
+## arvados websocket   (recommended hostname @ws.${DOMAIN}@)
+## arvados cloud dispatcher
+## arvados keepbalance
+## arvados keepproxy   (recommended hostname @keep.${DOMAIN}@)
+## arvados keepweb     (recommended hostnames @download.${DOMAIN}@ and @*.collections.${DOMAIN}@)
 # SHELL node  (optional)
 ## arvados shell       (recommended hostname @shell.${DOMAIN}@)
 
@@ -278,7 +279,7 @@ _AWS Specific: Go to the AWS console and into the VPC service, there is a column
 h3. Parameters from @local.params.secrets@:
 
 # Set each @KEY@ / @TOKEN@ / @PASSWORD@ to a random string.  You can use @installer.sh generate-tokens@
-<pre><code>$ ./installer.sh generate-tokens
+<pre><code class="userinput">./installer.sh generate-tokens
 BLOB_SIGNING_KEY=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 MANAGEMENT_TOKEN=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 SYSTEM_ROOT_TOKEN=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
@@ -338,7 +339,7 @@ Arvados requires a database that is compatible with PostgreSQL 9.5 or later.  Fo
 
 # In @local.params@, remove 'database' from the list of roles assigned to the controller node:
 <pre><code>NODES=(
-  [controller.${DOMAIN}]=api,controller,websocket,dispatcher,keepbalance
+  [controller.${DOMAIN}]=controller,websocket,dispatcher,keepbalance
   ...
 )
 </code></pre>
@@ -370,15 +371,14 @@ Follow "the instructions to build a cloud compute node image":{{site.baseurl}}/i
 
 h3. Configure the compute image
 
-Once the image has been created, open @local_config_dir/pillars/arvados.sls@ and edit as follows (AWS specific settings described here, other cloud providers will have similar settings in their respective configuration section):
+Once the image has been created, open @local.params@ and edit as follows (AWS-specific settings are described here; other cloud providers will require equivalent changes; a filled-in example sketch follows this list):
 
-# In the @arvados.cluster.Containers.CloudVMs@ section:
-## Set @ImageID@ to the AMI produced by Packer
-## Set @DriverParameters.Region@ to the appropriate AWS region
-## Set @DriverParameters.AdminUsername@ to the admin user account on the image
-## Set the @DriverParameters.SecurityGroupIDs@ list to the VPC security group which you set up to allow SSH connections to these nodes
-## Set @DriverParameters.SubnetID@ to the value of SubnetId of your VPC
-# Update @arvados.cluster.InstanceTypes@ as necessary.  The example instance types are for AWS, other cloud providers will of course have different instance types with different names and specifications.
+# Set @COMPUTE_AMI@ to the AMI produced by Packer
+# Set @COMPUTE_AWS_REGION@ to the appropriate AWS region
+# Set @COMPUTE_USER@ to the admin user account on the image
+# Set the @COMPUTE_SG@ list to the VPC security group which you set up to allow SSH connections to these nodes
+# Set @COMPUTE_SUBNET@ to the value of SubnetId of your VPC
+# Update @arvados.cluster.InstanceTypes@ in @local_config_dir/pillars/arvados.sls@ as necessary.  The example instance types are for AWS; other cloud providers will have different instance types with different names and specifications.
 (AWS specific) If m5/c5 node types are not available, replace them with m4/c4. You'll need to double-check the values for Price and IncludedScratch/AddedScratch for each type that is changed.
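
For illustration, a minimal sketch of the resulting compute settings in @local.params@; all values shown here are hypothetical placeholders:

<pre><code>COMPUTE_AMI="ami-0123456789abcdef0"
COMPUTE_AWS_REGION="us-east-1"
COMPUTE_USER="admin"
COMPUTE_SG="sg-0123456789abcdef0"
COMPUTE_SUBNET="subnet-0123456789abcdef0"
</code></pre>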
 
 h2(#installation). Begin installation
@@ -387,9 +387,7 @@ At this point, you are ready to run the installer script in deploy mode that wil
 
 Run this in the @~/arvados-setup-xarv1@ directory:
 
-<pre>
-./installer.sh deploy
-</pre>
+<pre><code class="userinput">./installer.sh deploy</code></pre>
 
 This will install and configure Arvados on all the nodes.  It will take a while and produce a lot of logging.  If it runs into an error, it will stop.
 
@@ -403,9 +401,7 @@ If you are running the diagnostics from one of the Arvados machines inside the p
 
 You are an "external client" if you are running the diagnostics from your workstation outside of the private network.
 
-<pre>
-./installer.sh diagnostics (-internal-client|-external-client)
-</pre>
+<pre><code class="userinput">./installer.sh diagnostics (-internal-client|-external-client)</code></pre>
 
 h3(#debugging). Debugging issues
 
@@ -429,9 +425,7 @@ You can iterate on the config and maintain the cluster by making changes to @loc
 
 If you are debugging a configuration issue on a specific node, you can speed up the cycle a bit by deploying just one node:
 
-<pre>
-./installer.sh deploy keep0.xarv1.example.com
-</pre>
+<pre><code class="userinput">./installer.sh deploy keep0.xarv1.example.com</code></pre>
 
 However, once you have a final configuration, you should run a full deploy to ensure that the configuration has been synchronized on all the nodes.
 
@@ -452,7 +446,7 @@ If this happens, you need to
 1. correct the database information
 2. run @./installer.sh deploy xarv1.example.com@ to update the configuration on the API/controller node
 3. Log in to the API/controller server node, then run this command to re-run the post-install script, which will set up the database:
-<pre>dpkg-reconfigure arvados-api-server</pre>
+<pre><code class="userinput">dpkg-reconfigure arvados-api-server</code></pre>
 4. Re-run @./installer.sh deploy@ again to synchronize everything, and so that the install steps that need to contact the API server are run successfully.
 
 h4. Missing ENA support (AWS Specific)
@@ -463,9 +457,9 @@ h2(#initial_user). Initial user and login
 
 At this point you should be able to log into the Arvados cluster. The initial URL will be
 
-https://workbench.${DOMAIN}@
+@https://workbench.${DOMAIN}@
 
-If you did *not* "configure a different authentication provider":#authentication you will be using the "Test" provider, and the provision script creates an initial user for testing purposes. This user is configured as administrator of the newly created cluster.  It uses the values of @INITIAL_USER@ and @INITIAL_USER_PASSWORD@ the @local.params@ file.
+If you did *not* "configure a different authentication provider":#authentication you will be using the "Test" provider, and the provision script creates an initial user for testing purposes. This user is configured as administrator of the newly created cluster.  It uses the values of @INITIAL_USER@ and @INITIAL_USER_PASSWORD@ from the @local.params*@ file.
 
 If you *did* configure a different authentication provider, the first user to log in will automatically be given Arvados admin privileges.
 
@@ -473,9 +467,9 @@ h2(#monitoring). Monitoring and Metrics
 
 You can monitor the health and performance of the system using the admin dashboard:
 
-https://grafana.${DOMAIN}@
+@https://grafana.${DOMAIN}@
 
-To log in, use username "admin" and @${INITIAL_USER_PASSWORD}@ from @local.conf@.
+To log in, use username "admin" and @${INITIAL_USER_PASSWORD}@ from @local.params.secrets@.
 
 Once logged in, you will want to add the dashboards to the front page.
 
@@ -486,6 +480,98 @@ Once logged in, you will want to add the dashboards to the front page.
 # Visit each dashboard, at the top of the page click on the star next to the title to "Mark as favorite"
 # They should now be linked on the front page.
 
+h2(#load_balancing). Load balancing controllers (optional)
+
+To handle high loads and perform rolling upgrades, the controller service can be scaled out to multiple hosts, and the installer makes this setup fairly simple.
+
+First, take care of the infrastructure deployment: if you use our Terraform code, set up @terraform.tfvars@ in @terraform/vpc/@ so that, in addition to the node named @controller@ (the load-balancer), as many @controllerN@ nodes (backends) as needed are defined and added to the @internal_service_hosts@ list.
+
+We suggest that the backend nodes run only the controller service, so they can be created or destroyed as needed without disrupting other services.
+
+The following is an example @terraform/vpc/terraform.tfvars@ file that describes a cluster with a load-balancer, 2 backend nodes, a separate database node, a shell node, a keepstore node and a workbench node that will also host other miscellaneous services:
+
+<pre><code>region_name = "us-east-1"
+cluster_name = "xarv1"
+domain_name = "xarv1.example.com"
+# Include controller nodes in this list so instances are assigned to the
+# private subnet. Only the balancer node should be connecting to them.
+internal_service_hosts = [ "keep0", "shell", "database", "controller1", "controller2" ]
+
+# Assign private IPs for the controller nodes. These will be used to create
+# internal DNS resolutions that will get used by the balancer and database nodes.
+private_ip = {
+  controller = "10.1.1.11"
+  workbench = "10.1.1.15"
+  database = "10.1.2.12"
+  controller1 = "10.1.2.21"
+  controller2 = "10.1.2.22"
+  shell = "10.1.2.17"
+  keep0 = "10.1.2.13"
+}</code></pre>
+
+Once the infrastructure is deployed, define in @local.params@ which node gets the @balancer@ role and which nodes get the @controller@ role, as shown in this partial example:
+
+<pre><code>NODES=(
+  [controller.${DOMAIN}]=balancer
+  [controller1.${DOMAIN}]=controller
+  [controller2.${DOMAIN}]=controller
+  [database.${DOMAIN}]=database
+  ...
+)
+</code></pre>
+
+Note that we also set the @database@ role to its own node.
+
+h3(#rolling-upgrades). Rolling upgrades procedure
+
+Once you have more than one controller backend node, you can take them out of the backend pool one at a time to upgrade them to a newer version of Arvados (which might involve applying database migrations), by adding the node's name to the @DISABLED_CONTROLLER@ variable in @local.params@. For example:
+
+<pre><code>...
+DISABLED_CONTROLLER="controller1"
+...</code></pre>
+
+Then, apply the configuration change to just the load-balancer:
+
+<pre><code class="userinput">./installer.sh deploy controller.xarv1.example.com</code></pre>
+
+This allows you to make the necessary changes to the @controller1@ node without service disruption, as it will not receive any traffic until you remove it from the @DISABLED_CONTROLLER@ variable.
+
+The next step is to apply the @deploy@ command to @controller1@:
+
+<pre><code class="userinput">./installer.sh deploy controller1.xarv1.example.com</code></pre>
+
+After that, disable the other controller node by editing @local.params@:
+
+<pre><code>...
+DISABLED_CONTROLLER="controller2"
+...</code></pre>
+
+...and apply the changes on the balancer node:
+
+<pre><code class="userinput">./installer.sh deploy controller.xarv1.example.com</code></pre>
+
+Then, deploy the changes to the recently disabled @controller2@ node:
+
+<pre><code class="userinput">./installer.sh deploy controller2.xarv1.example.com</code></pre>
+
+This won't cause a service interruption because the load balancer is already routing all traffic to the other node, @controller1@.
+
+The last step is to re-enable both controller nodes by making the following change to @local.params@:
+
+<pre><code>...
+DISABLED_CONTROLLER=""
+...</code></pre>
+
+...and running:
+
+<pre><code class="userinput">./installer.sh deploy controller.xarv1.example.com</code></pre>
+
+This should leave all your @controller@ nodes correctly upgraded. You can then continue running the @deploy@ command on the rest of the nodes individually, or just run:
+
+<pre><code class="userinput">./installer.sh deploy</code></pre>
+
+Only the nodes with pending changes may need certain services restarted. In this example, the @workbench@ node will have the remaining Arvados services upgraded and restarted; however, these services are not as critical as the ones on the @controller@ nodes.
+
 h2(#post_install). After the installation
 
 As part of the operation of @installer.sh@, it automatically creates a @git@ repository with your configuration templates.  You should retain this repository but *be aware that it contains sensitive information* (passwords and tokens used by the Arvados services as well as cloud credentials if you used Terraform to create the infrastructure).
index 723e64ceabf6147a69833d5d53a68511aa1358eb..b78116255a14687208dace80d212dae452bbac80 100644 (file)
@@ -225,7 +225,7 @@ Clusters:
 
       # Maximum number of requests to process concurrently in a single
       # service process, or 0 for no limit.
-      MaxConcurrentRequests: 64
+      MaxConcurrentRequests: 4
 
       # Maximum number of incoming requests to hold in a priority
       # queue waiting for one of the MaxConcurrentRequests slots to be
@@ -234,7 +234,7 @@ Clusters:
       #
       # If MaxQueuedRequests is 0, respond 503 immediately to
       # additional requests while at the MaxConcurrentRequests limit.
-      MaxQueuedRequests: 64
+      MaxQueuedRequests: 128
 
       # Maximum time a "lock container" request is allowed to wait in
       # the incoming request queue before returning 503.
@@ -1074,7 +1074,7 @@ Clusters:
 
       # Number of times a container can be unlocked before being
       # automatically cancelled.
-      MaxDispatchAttempts: 5
+      MaxDispatchAttempts: 10
 
       # Default value for container_count_max for container requests.  This is the
       # number of times Arvados will create a new container to satisfy a container
index 17121ffeb60d65a6871cbc631310402d80136747..4583a596eebfe48a08fd862e6d840d8df401c047 100644 (file)
@@ -49,6 +49,7 @@ func (s *DispatcherSuite) SetUpTest(c *check.C) {
        s.stubDriver = &test.StubDriver{
                HostKey:                   hostpriv,
                AuthorizedKeys:            []ssh.PublicKey{dispatchpub},
+               ErrorRateCreate:           0.1,
                ErrorRateDestroy:          0.1,
                MinTimeBetweenCreateCalls: time.Millisecond,
        }
index e91878527c45f82f934b8968e1f208f308088d33..5ca83d263c1c481bd71c968299744e2cf9b2486d 100644 (file)
@@ -45,7 +45,8 @@ type StubDriver struct {
        Queue *Queue
 
        // Frequency of artificially introduced errors on calls to
-       // Destroy. 0=always succeed, 1=always fail.
+       // Create and Destroy. 0=always succeed, 1=always fail.
+       ErrorRateCreate  float64
        ErrorRateDestroy float64
 
        // If Create() or Instances() is called too frequently, return
@@ -120,6 +121,9 @@ func (sis *StubInstanceSet) Create(it arvados.InstanceType, image cloud.ImageID,
        if sis.allowCreateCall.After(time.Now()) {
                return nil, RateLimitError{sis.allowCreateCall}
        }
+       if math_rand.Float64() < sis.driver.ErrorRateCreate {
+               return nil, fmt.Errorf("StubInstanceSet: rand < ErrorRateCreate %f", sis.driver.ErrorRateCreate)
+       }
        sis.allowCreateCall = time.Now().Add(sis.driver.MinTimeBetweenCreateCalls)
        ak := sis.driver.AuthorizedKeys
        if authKey != nil {
index 9bdf3589ab6ef0589dcac19ef3f44194220f84ba..610766e198589078bfe4601f452c3088b2a73f50 100644 (file)
@@ -3,6 +3,9 @@
 # SPDX-License-Identifier: AGPL-3.0
 
 import logging
+import sys
+
 
 logger = logging.getLogger(__name__)
-logger.addHandler(logging.NullHandler())
+logger.addHandler(logging.StreamHandler(stream=sys.stderr))
+logger.setLevel(logging.WARNING)
index 463c552c4f1eb5caf0868337858197a747bc8fa8..a876257abc2a5ae00c8def17f4ae23e0219de446 100644 (file)
@@ -245,6 +245,27 @@ class Summarizer(object):
                     self.job_tot[category][stat] += val
         logger.debug('%s: done totals', self.label)
 
+        missing_category = {
+            'cpu': 'CPU',
+            'mem': 'memory',
+            'net:': 'network I/O',
+            'statfs': 'storage space',
+        }
+        for task_stat in self.task_stats.values():
+            for category in task_stat.keys():
+                for checkcat in missing_category:
+                    if checkcat.endswith(':'):
+                        if category.startswith(checkcat):
+                            missing_category.pop(checkcat)
+                            break
+                    else:
+                        if category == checkcat:
+                            missing_category.pop(checkcat)
+                            break
+        for catlabel in missing_category.values():
+            logger.warning('%s: %s stats are missing -- possible cluster configuration issue',
+                        self.label, catlabel)
+
     def long_label(self):
         label = self.label
         if hasattr(self, 'process') and self.process['uuid'] not in label:
index fb23eab39e9072f9b44ac5e3b766d25c524e5668..444cfe4ef83258543f5dd8905afbd6a0b9cf4829 100644 (file)
@@ -8,21 +8,32 @@ import crunchstat_summary.command
 import difflib
 import glob
 import gzip
-from io import open
+import io
+import logging
 import mock
 import os
 import sys
 import unittest
 
 from crunchstat_summary.command import UTF8Decode
+from crunchstat_summary import logger
 
 TESTS_DIR = os.path.dirname(os.path.abspath(__file__))
 
 
-class ReportDiff(unittest.TestCase):
+class TestCase(unittest.TestCase):
+    def setUp(self):
+        self.logbuf = io.StringIO()
+        self.loghandler = logging.StreamHandler(stream=self.logbuf)
+        logger.addHandler(self.loghandler)
+        logger.setLevel(logging.WARNING)
+
+    def tearDown(self):
+        logger.removeHandler(self.loghandler)
+
     def diff_known_report(self, logfile, cmd):
         expectfile = logfile+'.report'
-        with open(expectfile, encoding='utf-8') as f:
+        with io.open(expectfile, encoding='utf-8') as f:
             expect = f.readlines()
         self.diff_report(cmd, expect, expectfile=expectfile)
 
@@ -32,7 +43,7 @@ class ReportDiff(unittest.TestCase):
             expect, got, fromfile=expectfile, tofile="(generated)")))
 
 
-class SummarizeFile(ReportDiff):
+class SummarizeFile(TestCase):
     def test_example_files(self):
         for fnm in glob.glob(os.path.join(TESTS_DIR, '*.txt.gz')):
             logfile = os.path.join(TESTS_DIR, fnm)
@@ -43,7 +54,7 @@ class SummarizeFile(ReportDiff):
             self.diff_known_report(logfile, cmd)
 
 
-class HTMLFromFile(ReportDiff):
+class HTMLFromFile(TestCase):
     def test_example_files(self):
         # Note we don't test the output content at all yet; we're
         # mainly just verifying the --format=html option isn't ignored
@@ -54,20 +65,21 @@ class HTMLFromFile(ReportDiff):
                 ['--format=html', '--log-file', logfile])
             cmd = crunchstat_summary.command.Command(args)
             cmd.run()
-            if sys.version_info >= (3,2):
-                self.assertRegex(cmd.report(), r'(?is)<html>.*</html>\s*$')
-            else:
-                self.assertRegexpMatches(cmd.report(), r'(?is)<html>.*</html>\s*$')
+            self.assertRegex(cmd.report(), r'(?is)<html>.*</html>\s*$')
 
 
-class SummarizeEdgeCases(unittest.TestCase):
+class SummarizeEdgeCases(TestCase):
     def test_error_messages(self):
-        logfile = open(os.path.join(TESTS_DIR, 'crunchstat_error_messages.txt'), encoding='utf-8')
+        logfile = io.open(os.path.join(TESTS_DIR, 'crunchstat_error_messages.txt'), encoding='utf-8')
         s = crunchstat_summary.summarizer.Summarizer(logfile)
         s.run()
+        self.assertRegex(self.logbuf.getvalue(), r'CPU stats are missing -- possible cluster configuration issue')
+        self.assertRegex(self.logbuf.getvalue(), r'memory stats are missing -- possible cluster configuration issue')
+        self.assertRegex(self.logbuf.getvalue(), r'network I/O stats are missing -- possible cluster configuration issue')
+        self.assertRegex(self.logbuf.getvalue(), r'storage space stats are missing -- possible cluster configuration issue')
 
 
-class SummarizeContainerCommon(ReportDiff):
+class SummarizeContainerCommon(TestCase):
     fake_container = {
         'uuid': '9tee4-dz642-lymtndkpy39eibk',
         'created_at': '2017-08-18T14:27:25.371388141',
@@ -133,9 +145,11 @@ class SummarizeContainerRequest(SummarizeContainerCommon):
 
     def test_container_request(self):
         self.check_common()
+        self.assertNotRegex(self.logbuf.getvalue(), r'stats are missing')
+        self.assertNotRegex(self.logbuf.getvalue(), r'possible cluster configuration issue')
 
 
-class SummarizeJob(ReportDiff):
+class SummarizeJob(TestCase):
     fake_job_uuid = '4xphq-8i9sb-jq0ekny1xou3zoh'
     fake_log_id = 'fake-log-collection-id'
     fake_job = {
@@ -160,7 +174,7 @@ class SummarizeJob(ReportDiff):
         mock_cr().open.assert_called_with('fake-logfile.txt')
 
 
-class SummarizePipeline(ReportDiff):
+class SummarizePipeline(TestCase):
     fake_instance = {
         'uuid': 'zzzzz-d1hrv-i3e77t9z5y8j9cc',
         'owner_uuid': 'zzzzz-tpzed-xurymjxw79nv3jz',
@@ -216,7 +230,7 @@ class SummarizePipeline(ReportDiff):
         cmd = crunchstat_summary.command.Command(args)
         cmd.run()
 
-        with open(logfile+'.report', encoding='utf-8') as f:
+        with io.open(logfile+'.report', encoding='utf-8') as f:
             job_report = [line for line in f if not line.startswith('#!! ')]
         expect = (
             ['### Summary for foo (zzzzz-8i9sb-000000000000000)\n'] +
@@ -238,7 +252,7 @@ class SummarizePipeline(ReportDiff):
         mock_cr().open.assert_called_with('fake-logfile.txt')
 
 
-class SummarizeACRJob(ReportDiff):
+class SummarizeACRJob(TestCase):
     fake_job = {
         'uuid': 'zzzzz-8i9sb-i3e77t9z5y8j9cc',
         'owner_uuid': 'zzzzz-tpzed-xurymjxw79nv3jz',
@@ -291,7 +305,7 @@ class SummarizeACRJob(ReportDiff):
         cmd = crunchstat_summary.command.Command(args)
         cmd.run()
 
-        with open(logfile+'.report', encoding='utf-8') as f:
+        with io.open(logfile+'.report', encoding='utf-8') as f:
             job_report = [line for line in f if not line.startswith('#!! ')]
         expect = (
             ['### Summary for zzzzz-8i9sb-i3e77t9z5y8j9cc (partial) (zzzzz-8i9sb-i3e77t9z5y8j9cc)\n',
diff --git a/tools/salt-install/common.sh b/tools/salt-install/common.sh
new file mode 100644 (file)
index 0000000..d406f2f
--- /dev/null
@@ -0,0 +1,53 @@
+##########################################################
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: CC-BY-SA-3.0
+
+# This is generic logic used by provision.sh & installer.sh scripts
+
+if [[ -s ${CONFIG_FILE} && -s ${CONFIG_FILE}.secrets ]]; then
+  source ${CONFIG_FILE}.secrets
+  source ${CONFIG_FILE}
+else
+  echo >&2 "You don't seem to have a config file with initial values."
+  echo >&2 "Please create a '${CONFIG_FILE}' & '${CONFIG_FILE}.secrets' files as described in"
+  echo >&2 "  * https://doc.arvados.org/install/salt-single-host.html#single_host, or"
+  echo >&2 "  * https://doc.arvados.org/install/salt-multi-host.html#multi_host_multi_hostnames"
+  exit 1
+fi
+
+# Comma-separated list of nodes. This is used to dynamically adjust
+# salt pillars.
+NODELIST=""
+for node in "${!NODES[@]}"; do
+  if [ -z "$NODELIST" ]; then
+    NODELIST="$node"
+  else
+    NODELIST="$NODELIST,$node"
+  fi
+done
+
+# The mapping of roles to nodes. This is used to dynamically adjust
+# salt pillars.
+declare -A ROLE2NODES
+for node in "${!NODES[@]}"; do
+  roles="${NODES[$node]}"
+
+  # Split the comma-separated roles into an array
+  IFS=',' read -ra roles_array <<< "$roles"
+
+  for role in "${roles_array[@]}"; do
+    if [ -n "${ROLE2NODES[$role]:-}" ]; then
+      ROLE2NODES["$role"]="${ROLE2NODES[$role]},$node"
+    else
+      ROLE2NODES["$role"]=$node
+    fi
+  done
+done
+
+# Auto-detects load-balancing mode
+if [ -z "${ROLE2NODES['balancer']:-}" ]; then
+  ENABLE_BALANCER="no"
+else
+  ENABLE_BALANCER="yes"
+fi
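
# For illustration, a sketch of what the derived variables above end up holding,
# given a hypothetical two-node NODES definition (entry order may vary, since
# bash associative arrays are unordered):
#   NODES=(
#     [controller.xarv1.example.com]=database,controller
#     [workbench.xarv1.example.com]=monitoring,workbench,keepweb
#   )
# would yield:
#   NODELIST="controller.xarv1.example.com,workbench.xarv1.example.com"
#   ROLE2NODES[database]="controller.xarv1.example.com"
#   ROLE2NODES[controller]="controller.xarv1.example.com"
#   ROLE2NODES[workbench]="workbench.xarv1.example.com"   (and so on for the other roles)
#   ENABLE_BALANCER="no"   (no node has the 'balancer' role)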
index 98fcf5f6d93da173b97cbda753afce9b155689e5..58a7851c28a7ea2c9da1d6112816dcea85240bd0 100644 (file)
@@ -3,6 +3,9 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
+{%- set max_workers = ("__CONTROLLER_MAX_WORKERS__" or grains['num_cpus'])|int %}
+{%- set max_reqs = ("__CONTROLLER_MAX_QUEUED_REQUESTS__" or 128)|int %}
+
 # The variables commented out are the default values that the formula uses.
 # The uncommented values are REQUIRED values. If you don't set them, running
 # this formula will fail.
@@ -108,11 +111,9 @@ arvados:
             Password: __INITIAL_USER_PASSWORD__
 
     ### API
-    {%- set max_reqs = "__CONTROLLER_MAX_CONCURRENT_REQUESTS__" %}
-    {%- if max_reqs != "" and max_reqs is number %}
     API:
-      MaxConcurrentRequests: max_reqs
-    {%- endif %}
+      MaxConcurrentRequests: {{ max_workers * 2 }}
+      MaxQueuedRequests: {{ max_reqs }}
 
     ### CONTAINERS
     {%- set dispatcher_ssh_privkey = "__DISPATCHER_SSH_PRIVKEY__" %}
@@ -122,15 +123,15 @@ arvados:
         ResourceTags:
           Name: __CLUSTER__-compute-node
         BootProbeCommand: 'systemctl is-system-running'
-        ImageID: ami-FIXMEFIXMEFIXMEFI
+        ImageID: __COMPUTE_AMI__
         Driver: ec2
         DriverParameters:
-          Region: FIXME
+          Region: __COMPUTE_AWS_REGION__
           EBSVolumeType: gp3
-          AdminUsername: FIXME
+          AdminUsername: __COMPUTE_USER__
           ### This SG should allow SSH from the dispatcher to the compute nodes
-          SecurityGroupIDs: ['sg-FIXMEFIXMEFIXMEFI']
-          SubnetID: subnet-FIXMEFIXMEFIXMEFI
+          SecurityGroupIDs: ['__COMPUTE_SG__']
+          SubnetID: __COMPUTE_SUBNET__
           IAMInstanceProfile: __CLUSTER__-compute-node-00-iam-role
       DispatchPrivateKey: {{ dispatcher_ssh_privkey|yaml_dquote }}
 
@@ -145,7 +146,7 @@ arvados:
         DriverParameters:
           Bucket: __CLUSTER__-nyw5e-000000000000000-volume
           IAMRole: __CLUSTER__-keepstore-00-iam-role
-          Region: FIXME
+          Region: __KEEP_AWS_REGION__
 
     Users:
       NewUsersAreActive: true
@@ -160,10 +161,10 @@ arvados:
           'http://localhost:8003': {}
       DispatchCloud:
         InternalURLs:
-          'http://__CONTROLLER_INT_IP__:9006': {}
+          'http://__DISPATCHER_INT_IP__:9006': {}
       Keepbalance:
         InternalURLs:
-          'http://__CONTROLLER_INT_IP__:9005': {}
+          'http://__KEEPBALANCE_INT_IP__:9005': {}
       Keepproxy:
         ExternalURL: 'https://keep.__DOMAIN__:__KEEP_EXT_SSL_PORT__'
         InternalURLs:
diff --git a/tools/salt-install/config_examples/multi_host/aws/pillars/letsencrypt_balancer_configuration.sls b/tools/salt-install/config_examples/multi_host/aws/pillars/letsencrypt_balancer_configuration.sls
new file mode 100644 (file)
index 0000000..f2de52d
--- /dev/null
@@ -0,0 +1,10 @@
+---
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+### LETSENCRYPT
+letsencrypt:
+  domainsets:
+    __BALANCER_NODENAME__:
+      - __DOMAIN__
diff --git a/tools/salt-install/config_examples/multi_host/aws/pillars/nginx_balancer_configuration.sls b/tools/salt-install/config_examples/multi_host/aws/pillars/nginx_balancer_configuration.sls
new file mode 100644 (file)
index 0000000..b2b4db1
--- /dev/null
@@ -0,0 +1,129 @@
+---
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+{%- import_yaml "ssl_key_encrypted.sls" as ssl_key_encrypted_pillar %}
+{%- set domain = "__DOMAIN__" %}
+{%- set balancer_backends = "__CONTROLLER_NODES__".split(",") %}
+{%- set controller_nr = balancer_backends|length %}
+{%- set disabled_controller = "__DISABLED_CONTROLLER__" %}
+{%- if disabled_controller != "" %}
+  {%- set controller_nr = controller_nr - 1 %}
+{%- endif %}
+{%- set max_reqs = ("__CONTROLLER_MAX_QUEUED_REQUESTS__" or 128)|int %}
+
+### NGINX
+nginx:
+  ### SERVER
+  server:
+    config:
+      {%- if max_reqs != "" %}
+      worker_rlimit_nofile: {{ (max_reqs|int * 3 * controller_nr)|round|int }}
+      events:
+        worker_connections: {{ (max_reqs|int * 3 * controller_nr)|round|int }}
+      {%- else %}
+      worker_rlimit_nofile: 4096
+      events:
+        worker_connections: 1024
+      {%- endif %}
+      ### STREAMS
+      http:
+        'geo $external_client':
+          default: 1
+          '127.0.0.0/8': 0
+          '__CLUSTER_INT_CIDR__': 0
+        upstream controller_upstream:
+        {%- for backend in balancer_backends %}
+          {%- if disabled_controller == "" or not backend.startswith(disabled_controller) %}
+          'server {{ backend }}:80': ''
+          {%- else %}
+          'server {{ backend }}:80 down': ''
+          {% endif %}
+        {%- endfor %}
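        {#- For illustration (hypothetical hostnames): with __CONTROLLER_NODES__ set to
            "controller1.xarv1.example.com,controller2.xarv1.example.com" and
            __DISABLED_CONTROLLER__ set to "controller1", the loop above produces
            'server controller1.xarv1.example.com:80 down': '' and
            'server controller2.xarv1.example.com:80': '', i.e. the disabled backend
            stays listed in the upstream but is marked "down". #}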
+
+  ### SNIPPETS
+  snippets:
+    # Based on https://ssl-config.mozilla.org/#server=nginx&version=1.14.2&config=intermediate&openssl=1.1.1d&guideline=5.4
+    ssl_hardening_default.conf:
+      - ssl_session_timeout: 1d
+      - ssl_session_cache: 'shared:arvadosSSL:10m'
+      - ssl_session_tickets: 'off'
+
+      # intermediate configuration
+      - ssl_protocols: TLSv1.2 TLSv1.3
+      - ssl_ciphers: ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384
+      - ssl_prefer_server_ciphers: 'off'
+
+      # HSTS (ngx_http_headers_module is required) (63072000 seconds)
+      - add_header: 'Strict-Transport-Security "max-age=63072000" always'
+
+      # OCSP stapling
+      - ssl_stapling: 'on'
+      - ssl_stapling_verify: 'on'
+
+      # verify chain of trust of OCSP response using Root CA and Intermediate certs
+      # - ssl_trusted_certificate /path/to/root_CA_cert_plus_intermediates
+
+      # curl https://ssl-config.mozilla.org/ffdhe2048.txt > /path/to/dhparam
+      # - ssl_dhparam: /path/to/dhparam
+
+      # replace with the IP address of your resolver
+      # - resolver: 127.0.0.1
+
+  ### SITES
+  servers:
+    managed:
+      # Remove default webserver
+      default:
+        enabled: false
+      ### DEFAULT
+      arvados_balancer_default.conf:
+        enabled: true
+        overwrite: true
+        config:
+          - server:
+            - server_name: {{ domain }}
+            - listen:
+              - 80 default
+            - location /.well-known:
+              - root: /var/www
+            - location /:
+              - return: '301 https://$host$request_uri'
+
+      arvados_balancer_ssl.conf:
+        enabled: true
+        overwrite: true
+        requires:
+          __CERT_REQUIRES__
+        config:
+          - server:
+            - server_name: {{ domain }}
+            - listen:
+              - __CONTROLLER_EXT_SSL_PORT__ http2 ssl
+            - index: index.html index.htm
+            - location /:
+              - proxy_pass: 'http://controller_upstream'
+              - proxy_read_timeout: 300
+              - proxy_connect_timeout: 90
+              - proxy_redirect: 'off'
+              - proxy_set_header: X-Forwarded-Proto https
+              - proxy_set_header: 'Host $http_host'
+              - proxy_set_header: 'X-Real-IP $remote_addr'
+              - proxy_set_header: 'X-Forwarded-For $proxy_add_x_forwarded_for'
+              - proxy_set_header: 'X-External-Client $external_client'
+              - proxy_set_header: 'Upgrade $http_upgrade'
+              - proxy_set_header: 'Connection "upgrade"'
+              - proxy_max_temp_file_size: 0
+              - proxy_request_buffering: 'off'
+              - proxy_buffering: 'off'
+              - proxy_http_version: '1.1'
+            - include: snippets/ssl_hardening_default.conf
+            - ssl_certificate: __CERT_PEM__
+            - ssl_certificate_key: __CERT_KEY__
+            {%- if ssl_key_encrypted_pillar.ssl_key_encrypted.enabled %}
+            - ssl_password_file: {{ '/run/arvados/' | path_join(ssl_key_encrypted_pillar.ssl_key_encrypted.privkey_password_filename) }}
+            {%- endif %}
+            - access_log: /var/log/nginx/{{ domain }}.access.log combined
+            - error_log: /var/log/nginx/{{ domain }}.error.log
+            - client_max_body_size: 128m
index d0fd6a1312de90a0ec52aa6cd393362e2797172b..5bd67a6ce4b1b7bbeeef6dd7744f902cec85eff3 100644 (file)
@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: AGPL-3.0
 
 {%- import_yaml "ssl_key_encrypted.sls" as ssl_key_encrypted_pillar %}
+{%- set balanced_controller = ("__ENABLE_BALANCER__"|to_bool) %}
+{%- set server_name = grains['fqdn'] if balanced_controller else "__DOMAIN__" %}
 
 ### NGINX
 nginx:
@@ -28,14 +30,36 @@ nginx:
         overwrite: true
         config:
           - server:
-            - server_name: __DOMAIN__
+            - server_name: {{ server_name }}
             - listen:
               - 80 default
             - location /.well-known:
               - root: /var/www
+            {%- if balanced_controller %}
+            {%- set balancer_ip = salt['cmd.run']("getent hosts __BALANCER_NODENAME__ | awk '{print $1 ; exit}'", python_shell=True) %}
+            {%- set prometheus_ip = salt['cmd.run']("getent hosts __PROMETHEUS_NODENAME__ | awk '{print $1 ; exit}'", python_shell=True) %}
+            - index: index.html index.htm
+            - location /:
+              - allow: {{ balancer_ip }}
+              - allow: {{ prometheus_ip }}
+              - deny: all
+              - proxy_pass: 'http://controller_upstream'
+              - proxy_read_timeout: 300
+              - proxy_connect_timeout: 90
+              - proxy_redirect: 'off'
+              - proxy_max_temp_file_size: 0
+              - proxy_request_buffering: 'off'
+              - proxy_buffering: 'off'
+              - proxy_http_version: '1.1'
+            - access_log: /var/log/nginx/{{ server_name }}.access.log combined
+            - error_log: /var/log/nginx/{{ server_name }}.error.log
+            - client_max_body_size: 128m
+            {%- else %}
             - location /:
               - return: '301 https://$host$request_uri'
+            {%- endif %}
 
+      {%- if not balanced_controller %}
       arvados_controller_ssl.conf:
         enabled: true
         overwrite: true
@@ -43,7 +67,7 @@ nginx:
           __CERT_REQUIRES__
         config:
           - server:
-            - server_name: __DOMAIN__
+            - server_name: {{ server_name }}
             - listen:
               - __CONTROLLER_EXT_SSL_PORT__ http2 ssl
             - index: index.html index.htm
@@ -69,6 +93,7 @@ nginx:
             {%- if ssl_key_encrypted_pillar.ssl_key_encrypted.enabled %}
             - ssl_password_file: {{ '/run/arvados/' | path_join(ssl_key_encrypted_pillar.ssl_key_encrypted.privkey_password_filename) }}
             {%- endif %}
-            - access_log: /var/log/nginx/controller.__DOMAIN__.access.log combined
-            - error_log: /var/log/nginx/controller.__DOMAIN__.error.log
+            - access_log: /var/log/nginx/{{ server_name }}.access.log combined
+            - error_log: /var/log/nginx/{{ server_name }}.error.log
             - client_max_body_size: 128m
+      {%- endif %}
index b003172330e465173e1f1f200449d5f1bb6a171a..ce8f0ff407a2a5819d98f27c1b238291df5be578 100644 (file)
@@ -12,7 +12,8 @@
 {%- set passenger_ruby = '/usr/local/rvm/wrappers/default/ruby'
                            if grains.osfinger in ('CentOS Linux-7', 'Ubuntu-18.04', 'Debian-10') else
                          '/usr/bin/ruby' %}
-{%- set max_reqs = "__CONTROLLER_MAX_CONCURRENT_REQUESTS__" %}
+{%- set max_workers = ("__CONTROLLER_MAX_WORKERS__" or grains['num_cpus'])|int %}
+{%- set max_reqs = ("__CONTROLLER_MAX_QUEUED_REQUESTS__" or 128)|int %}
 
 ### NGINX
 nginx:
@@ -22,12 +23,15 @@ nginx:
   ### PASSENGER
   passenger:
     passenger_ruby: {{ passenger_ruby }}
-    passenger_max_pool_size: {{ "__CONTROLLER_NGINX_WORKERS__" or grains['num_cpus'] }}
-    {%- if max_reqs != "" %}
-    # Default is 100 -- Configuring this a bit higher than API.MaxConcurrentRequests
-    # to be able to handle /metrics requests even on heavy load situations.
-    passenger_max_request_queue_size: {{ (max_reqs|int * 1.1)|round|int }}
-    {%- endif %}
+    passenger_max_pool_size: {{ max_workers }}
+
+    # Make the passenger queue small (twice the concurrency, so
+    # there's at most one pending request for each busy worker)
+    # because controller reorders requests based on priority, and
+    # won't send more than API.MaxConcurrentRequests to passenger
+    # (which is max_workers * 2), so things that are moved to the head
+    # of the line get processed quickly.
+    passenger_max_request_queue_size: {{ max_workers * 2 + 1 }}
 
   ### SERVER
   server:
@@ -43,16 +47,15 @@ nginx:
       # include: 'modules-enabled/*.conf'
       load_module: {{ passenger_mod }}
       {% endif %}
-      worker_processes: {{ "__CONTROLLER_NGINX_WORKERS__" or grains['num_cpus'] }}
-      {%- if max_reqs != "" %}
-      worker_rlimit_nofile: {{ (max_reqs|int * 3)|round|int }}
-      events:
-        worker_connections: {{ (max_reqs|int * 3)|round|int }}
-      {%- else %}
-      worker_rlimit_nofile: 4096
+      worker_processes: {{ max_workers }}
+
+      # Each request can use up to 3 connections (1 with the client, 1 proxying
+      # to controller, then potentially 1 from controller back to
+      # passenger).  Each connection consumes a file descriptor; that's
+      # how we get these calculations.
+      worker_rlimit_nofile: {{ max_reqs * 3 + 1 }}
       events:
-        worker_connections: 1024
-      {%- endif %}
+        worker_connections: {{ max_reqs * 3 + 1 }}
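      {#- For example, with the default __CONTROLLER_MAX_QUEUED_REQUESTS__ of 128,
          worker_rlimit_nofile and worker_connections both evaluate to 128 * 3 + 1 = 385. #}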
 
   ### SNIPPETS
   snippets:
index 10cbb6c34ea73b40fd1d61269fd13317b2425d0c..70edfeb8d0675382b00126eb863669f20d990672 100644 (file)
@@ -3,6 +3,11 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
+{%- set domain = "__DOMAIN__" %}
+{%- set controller_nodes = "__CONTROLLER_NODES__".split(",") %}
+{%- set websocket_ip = "__WEBSOCKET_INT_IP__" %}
+{%- set keepbalance_ip = "__KEEPBALANCE_INT_IP__" %}
+
 ### POSTGRESQL
 postgres:
   pkgs_extra:
@@ -17,7 +22,12 @@ postgres:
     - ['host', 'all', 'all', '127.0.0.1/32', 'md5']
     - ['host', 'all', 'all', '::1/128', 'md5']
     - ['host', '__CLUSTER___arvados', '__CLUSTER___arvados', '127.0.0.1/32']
-    - ['host', '__CLUSTER___arvados', '__CLUSTER___arvados', '__CONTROLLER_INT_IP__/32']
+    - ['host', '__CLUSTER___arvados', '__CLUSTER___arvados', '{{ websocket_ip }}/32']
+    - ['host', '__CLUSTER___arvados', '__CLUSTER___arvados', '{{ keepbalance_ip }}/32']
+    {%- for controller_hostname in controller_nodes %}
+    {%- set controller_ip = salt['cmd.run']("getent hosts "+controller_hostname+" | awk '{print $1 ; exit}'", python_shell=True) %}
+    - ['host', '__CLUSTER___arvados', '__CLUSTER___arvados', '{{ controller_ip }}/32']
+    {%- endfor %}
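    {#- For illustration (hypothetical address): if controller1 resolves to 10.1.2.21,
        the loop above adds a pg_hba entry equivalent to
        ['host', '__CLUSTER___arvados', '__CLUSTER___arvados', '10.1.2.21/32'] #}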
   users:
     __CLUSTER___arvados:
       ensure: present
index bbf997b7be364700372d55b308ad527b7a7a9aae..6dc90c840b84f0fe534d608f341dc8f2373c40ab 100644 (file)
@@ -3,6 +3,9 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
+{%- set controller_nodes = "__CONTROLLER_NODES__".split(',') %}
+{%- set enable_balancer = ("__ENABLE_BALANCER__"|to_bool) %}
+
 ### PROMETHEUS
 prometheus:
   wanted:
@@ -42,12 +45,25 @@ prometheus:
                     cluster: __CLUSTER__
             - job_name: arvados_controller
               bearer_token: __MANAGEMENT_TOKEN__
+              {%- if enable_balancer %}
+              scheme: http
+              {%- else %}
               scheme: https
+              {%- endif %}
               static_configs:
+                {%- if enable_balancer %}
+                  {%- for controller in controller_nodes %}
+                - targets: ['{{ controller }}']
+                  labels:
+                    instance: {{ controller.split('.')[0] }}.__CLUSTER__
+                    cluster: __CLUSTER__
+                  {%- endfor %}
+                {%- else %}
                 - targets: ['__DOMAIN__:443']
                   labels:
                     instance: controller.__CLUSTER__
                     cluster: __CLUSTER__
+                {%- endif %}
             - job_name: keep_web
               bearer_token: __MANAGEMENT_TOKEN__
               scheme: https
@@ -59,7 +75,7 @@ prometheus:
             - job_name: keep_balance
               bearer_token: __MANAGEMENT_TOKEN__
               static_configs:
-                - targets: ['__CONTROLLER_INT_IP__:9005']
+                - targets: ['__KEEPBALANCE_INT_IP__:9005']
                   labels:
                     instance: keep-balance.__CLUSTER__
                     cluster: __CLUSTER__
@@ -73,7 +89,7 @@ prometheus:
             - job_name: arvados_dispatch_cloud
               bearer_token: __MANAGEMENT_TOKEN__
               static_configs:
-                - targets: ['__CONTROLLER_INT_IP__:9006']
+                - targets: ['__DISPATCHER_INT_IP__:9006']
                   labels:
                     instance: arvados-dispatch-cloud.__CLUSTER__
                     cluster: __CLUSTER__
@@ -90,14 +106,14 @@ prometheus:
                     cluster: __CLUSTER__
 
             # Nodes
+            {%- set node_list = "__NODELIST__".split(',') %}
+            {%- set nodes = [] %}
+            {%- for node in node_list %}
+              {%- set _ = nodes.append(node.split('.')[0]) %}
+            {%- endfor %}
             - job_name: node
               static_configs:
-                {% for node in [
-                  'controller',
-                  'keep0',
-                  'workbench',
-                  'shell',
-                ] %}
+                {% for node in nodes %}
                 - targets: [ "{{ node }}.__DOMAIN__:9100" ]
                   labels:
                     instance: "{{ node }}.__CLUSTER__"
index 4ce490421f5abe2516a3446d909af6143a82b7ae..37007da7b66dc44aae4b998b1e7d65fcee25e360 100755 (executable)
@@ -47,6 +47,11 @@ declare GITTARGET
 # This will be populated by loadconfig()
 declare USE_SSH_JUMPHOST
 
+# The temp file that will be used to disable envvar forwarding, to avoid locale
+# issues on Debian distros.
+# This will be populated by loadconfig()
+declare SSH_CONFFILE
+
 checktools() {
     local MISSING=''
     for a in git ip ; do
@@ -130,17 +135,22 @@ loadconfig() {
     if ! [[ -s ${CONFIG_FILE} && -s ${CONFIG_FILE}.secrets ]]; then
                echo "Must be run from initialized setup dir, maybe you need to 'initialize' first?"
     fi
-    source ${CONFIG_FILE}.secrets
-    source ${CONFIG_FILE}
+    source common.sh
     GITTARGET=arvados-deploy-config-${CLUSTER}
+
+       # Set up SSH so that it doesn't forward any environment variable. This is to avoid
+       # getting "setlocale" errors on the first run, depending on the distro being used
+       # to run the installer (like Debian).
+       SSH_CONFFILE=$(mktemp)
+       echo "Include config SendEnv -*" > ${SSH_CONFFILE}
 }
 
 ssh_cmd() {
        local NODE=$1
        if [ -z "${USE_SSH_JUMPHOST}" -o "${NODE}" == "${USE_SSH_JUMPHOST}" -o "${NODE}" == "localhost" ]; then
-               echo "ssh"
+               echo "ssh -F ${SSH_CONFFILE}"
        else
-               echo "ssh -J ${DEPLOY_USER}@${USE_SSH_JUMPHOST}"
+               echo "ssh -F ${SSH_CONFFILE} -J ${DEPLOY_USER}@${USE_SSH_JUMPHOST}"
        fi
 }
 
@@ -296,8 +306,17 @@ case "$subcmd" in
 
            for NODE in "${!NODES[@]}"
            do
-               # then  'api' or 'controller' roles
-               if [[ "${NODES[$NODE]}" =~ (api|controller) ]] ; then
+               # then 'balancer' role
+               if [[ "${NODES[$NODE]}" =~ balancer ]] ; then
+                   deploynode $NODE "${NODES[$NODE]}"
+                   unset NODES[$NODE]
+               fi
+           done
+
+           for NODE in "${!NODES[@]}"
+           do
+               # then 'controller' role
+               if [[ "${NODES[$NODE]}" =~ controller ]] ; then
                    deploynode $NODE "${NODES[$NODE]}"
                    unset NODES[$NODE]
                fi
index d1cdfeb3c6ab18527326c0065c99c8ad0d4512f1..5d4ebdc1d57f1f92e6d3aad836d6cf863f4a3d45 100644 (file)
@@ -28,6 +28,8 @@ INITIAL_USER_EMAIL="admin@cluster_fixme_or_this_wont_work.domain_fixme_or_this_w
 # Comment out to disable.
 USE_SSH_JUMPHOST="controller.${DOMAIN}"
 
+AWS_REGION="fixme_or_this_wont_work"
+
 # SSL CERTIFICATES
 # Arvados requires SSL certificates to work correctly. This installer supports these options:
 # * self-signed: let the installer create self-signed certificate(s)
@@ -42,7 +44,17 @@ USE_LETSENCRYPT_ROUTE53="yes"
 # For that reason, you'll need to provide AWS credentials with permissions to manage
 # RRs in the route53 zone for the cluster.
 # WARNING!: If AWS credentials files already exist in the hosts, they won't be replaced.
-LE_AWS_REGION="us-east-1"
+LE_AWS_REGION="${AWS_REGION}"
+
+# Compute node configurations
+COMPUTE_AMI="ami_id_fixme_or_this_wont_work"
+COMPUTE_SG="security_group_fixme_or_this_wont_work"
+COMPUTE_SUBNET="subnet_fixme_or_this_wont_work"
+COMPUTE_AWS_REGION="${AWS_REGION}"
+COMPUTE_USER="${DEPLOY_USER}"
+
+# Keep S3 backend region
+KEEP_AWS_REGION="${AWS_REGION}"
 
 # If you are going to provide your own certificates for Arvados, the provision script can
 # help you deploy them. In order to do that, you need to set `SSL_MODE=bring-your-own` above,
@@ -72,11 +84,10 @@ LE_AWS_REGION="us-east-1"
 # a custom AWS secret name for each node to retrieve the password.
 SSL_KEY_ENCRYPTED="no"
 SSL_KEY_AWS_SECRET_NAME="${CLUSTER}-arvados-ssl-privkey-password"
-SSL_KEY_AWS_REGION="us-east-1"
+SSL_KEY_AWS_REGION="${AWS_REGION}"
 
 # Customize Prometheus & Grafana web UI access credentials
 MONITORING_USERNAME=${INITIAL_USER}
-MONITORING_PASSWORD=${INITIAL_USER_PASSWORD}
 MONITORING_EMAIL=${INITIAL_USER_EMAIL}
 # Sets the directory for Grafana dashboards
 # GRAFANA_DASHBOARDS_DIR="${SCRIPT_DIR}/local_config_dir/dashboards"
@@ -85,8 +96,8 @@ MONITORING_EMAIL=${INITIAL_USER_EMAIL}
 # installer.sh will log in to each of these nodes and then provision
 # it for the specified roles.
 NODES=(
-  [controller.${DOMAIN}]=database,api,controller,websocket,dispatcher,keepbalance
-  [workbench.${DOMAIN}]=monitoring,workbench,workbench2,webshell,keepproxy,keepweb
+  [controller.${DOMAIN}]=database,controller
+  [workbench.${DOMAIN}]=monitoring,workbench,workbench2,webshell,keepproxy,keepweb,websocket,dispatcher,keepbalance
   [keep0.${DOMAIN}]=keepstore
   [shell.${DOMAIN}]=shell
 )
@@ -110,20 +121,28 @@ CLUSTER_INT_CIDR=10.1.0.0/16
 # Note the IPs in this example are shared between roles, as suggested in
 # https://doc.arvados.org/main/install/salt-multi-host.html
 CONTROLLER_INT_IP=10.1.1.11
-WEBSOCKET_INT_IP=10.1.1.11
-KEEP_INT_IP=10.1.1.15
+DATABASE_INT_IP=${CONTROLLER_INT_IP}
+WORKBENCH1_INT_IP=10.1.1.15
+DISPATCHER_INT_IP=${WORKBENCH1_INT_IP}
+KEEPBALANCE_INT_IP=${WORKBENCH1_INT_IP}
+WEBSOCKET_INT_IP=${WORKBENCH1_INT_IP}
 # Both for collections and downloads
-KEEPWEB_INT_IP=10.1.1.15
+KEEPWEB_INT_IP=${WORKBENCH1_INT_IP}
+WORKBENCH2_INT_IP=${WORKBENCH1_INT_IP}
+WEBSHELL_INT_IP=${WORKBENCH1_INT_IP}
+KEEP_INT_IP=${WORKBENCH1_INT_IP}
 KEEPSTORE0_INT_IP=10.1.2.13
-WORKBENCH1_INT_IP=10.1.1.15
-WORKBENCH2_INT_IP=10.1.1.15
-WEBSHELL_INT_IP=10.1.1.15
-DATABASE_INT_IP=10.1.1.11
 SHELL_INT_IP=10.1.2.17
 
-# Performance tuning parameters
-#CONTROLLER_NGINX_WORKERS=
-#CONTROLLER_MAX_CONCURRENT_REQUESTS=
+# In a load balanced deployment, you can do rolling upgrades by specifying one
+# controller node name at a time, so that it gets removed from the pool and can
+# be upgraded.
+DISABLED_CONTROLLER=""
+
+# Performance tuning parameters.  If these are not set, the number of workers
+# defaults to the number of CPUs and the request queue size defaults to 128.
+#CONTROLLER_MAX_WORKERS=
+#CONTROLLER_MAX_QUEUED_REQUESTS=
 
 # The directory to check for the config files (pillars, states) you want to use.
 # There are a few examples under 'config_examples'.
index bec56e00be6e88d645093f884c95eb4e5eed0d0a..36cdb57b877462f26a9fc4b9eaae1acdf0e05787 100644 (file)
@@ -6,6 +6,7 @@
 # These are the security-sensitive parameters to configure the installation
 
 INITIAL_USER_PASSWORD="fixme"
+MONITORING_PASSWORD=${INITIAL_USER_PASSWORD}
 
 # YOU SHOULD CHANGE THESE TO SOME RANDOM STRINGS
 BLOB_SIGNING_KEY=fixmeblobsigningkeymushaveatleast32characters
index 7010b388b2c9d54356ac32237fd2de85321ae9f5..203b4b7e7020b682fa64c46418aa97b8c731b14c 100755 (executable)
@@ -10,6 +10,7 @@
 #
 # vagrant up
 
+set -eu
 set -o pipefail
 
 # capture the directory that the script is running from
@@ -25,7 +26,7 @@ usage() {
   echo >&2 "  -t, --test                                  Test installation running a CWL workflow"
   echo >&2 "  -r, --roles                                 List of Arvados roles to apply to the host, comma separated"
   echo >&2 "                                              Possible values are:"
-  echo >&2 "                                                api"
+  echo >&2 "                                                balancer"
   echo >&2 "                                                controller"
   echo >&2 "                                                dispatcher"
   echo >&2 "                                                keepproxy"
@@ -109,12 +110,12 @@ arguments() {
         for i in ${2//,/ }
           do
             # Verify the role exists
-            if [[ ! "database,api,controller,keepstore,websocket,keepweb,workbench2,webshell,keepbalance,keepproxy,shell,workbench,dispatcher,monitoring" == *"$i"* ]]; then
+            if [[ ! "database,balancer,controller,keepstore,websocket,keepweb,workbench2,webshell,keepbalance,keepproxy,shell,workbench,dispatcher,monitoring" == *"$i"* ]]; then
               echo "The role '${i}' is not a valid role"
               usage
               exit 1
             fi
-            ROLES="${ROLES} ${i}"
+            ROLES="${ROLES:-} ${i}"
           done
           shift 2
         ;;
@@ -158,6 +159,71 @@ copy_custom_cert() {
   fi
 }
 
+apply_var_substitutions() {
+  local SRCFILE=$1
+  local DSTFILE=$2
+  sed "s#__ANONYMOUS_USER_TOKEN__#${ANONYMOUS_USER_TOKEN}#g;
+       s#__BLOB_SIGNING_KEY__#${BLOB_SIGNING_KEY}#g;
+       s#__CONTROLLER_EXT_SSL_PORT__#${CONTROLLER_EXT_SSL_PORT}#g;
+       s#__CLUSTER__#${CLUSTER}#g;
+       s#__DOMAIN__#${DOMAIN}#g;
+       s#__HOSTNAME_EXT__#${HOSTNAME_EXT}#g;
+       s#__IP_INT__#${IP_INT}#g;
+       s#__INITIAL_USER_EMAIL__#${INITIAL_USER_EMAIL}#g;
+       s#__INITIAL_USER_PASSWORD__#${INITIAL_USER_PASSWORD}#g;
+       s#__INITIAL_USER__#${INITIAL_USER}#g;
+       s#__LE_AWS_REGION__#${LE_AWS_REGION}#g;
+       s#__LE_AWS_SECRET_ACCESS_KEY__#${LE_AWS_SECRET_ACCESS_KEY}#g;
+       s#__LE_AWS_ACCESS_KEY_ID__#${LE_AWS_ACCESS_KEY_ID}#g;
+       s#__DATABASE_PASSWORD__#${DATABASE_PASSWORD}#g;
+       s#__KEEPWEB_EXT_SSL_PORT__#${KEEPWEB_EXT_SSL_PORT}#g;
+       s#__KEEP_EXT_SSL_PORT__#${KEEP_EXT_SSL_PORT}#g;
+       s#__MANAGEMENT_TOKEN__#${MANAGEMENT_TOKEN}#g;
+       s#__RELEASE__#${RELEASE}#g;
+       s#__SYSTEM_ROOT_TOKEN__#${SYSTEM_ROOT_TOKEN}#g;
+       s#__VERSION__#${VERSION}#g;
+       s#__WEBSHELL_EXT_SSL_PORT__#${WEBSHELL_EXT_SSL_PORT}#g;
+       s#__WEBSOCKET_EXT_SSL_PORT__#${WEBSOCKET_EXT_SSL_PORT}#g;
+       s#__WORKBENCH1_EXT_SSL_PORT__#${WORKBENCH1_EXT_SSL_PORT}#g;
+       s#__WORKBENCH2_EXT_SSL_PORT__#${WORKBENCH2_EXT_SSL_PORT}#g;
+       s#__CLUSTER_INT_CIDR__#${CLUSTER_INT_CIDR}#g;
+       s#__CONTROLLER_INT_IP__#${CONTROLLER_INT_IP}#g;
+       s#__WEBSOCKET_INT_IP__#${WEBSOCKET_INT_IP}#g;
+       s#__KEEP_INT_IP__#${KEEP_INT_IP}#g;
+       s#__KEEPSTORE0_INT_IP__#${KEEPSTORE0_INT_IP}#g;
+       s#__KEEPWEB_INT_IP__#${KEEPWEB_INT_IP}#g;
+       s#__WEBSHELL_INT_IP__#${WEBSHELL_INT_IP}#g;
+       s#__SHELL_INT_IP__#${SHELL_INT_IP}#g;
+       s#__WORKBENCH1_INT_IP__#${WORKBENCH1_INT_IP}#g;
+       s#__WORKBENCH2_INT_IP__#${WORKBENCH2_INT_IP}#g;
+       s#__DATABASE_INT_IP__#${DATABASE_INT_IP}#g;
+       s#__WORKBENCH_SECRET_KEY__#${WORKBENCH_SECRET_KEY}#g;
+       s#__SSL_KEY_ENCRYPTED__#${SSL_KEY_ENCRYPTED}#g;
+       s#__SSL_KEY_AWS_REGION__#${SSL_KEY_AWS_REGION}#g;
+       s#__SSL_KEY_AWS_SECRET_NAME__#${SSL_KEY_AWS_SECRET_NAME}#g;
+       s#__CONTROLLER_MAX_WORKERS__#${CONTROLLER_MAX_WORKERS:-}#g;
+       s#__CONTROLLER_MAX_QUEUED_REQUESTS__#${CONTROLLER_MAX_QUEUED_REQUESTS:-128}#g;
+       s#__MONITORING_USERNAME__#${MONITORING_USERNAME}#g;
+       s#__MONITORING_EMAIL__#${MONITORING_EMAIL}#g;
+       s#__MONITORING_PASSWORD__#${MONITORING_PASSWORD}#g;
+       s#__DISPATCHER_SSH_PRIVKEY__#${DISPATCHER_SSH_PRIVKEY//$'\n'/\\n}#g;
+       s#__ENABLE_BALANCER__#${ENABLE_BALANCER}#g;
+       s#__DISABLED_CONTROLLER__#${DISABLED_CONTROLLER}#g;
+       s#__BALANCER_NODENAME__#${ROLE2NODES['balancer']:-}#g;
+       s#__PROMETHEUS_NODENAME__#${ROLE2NODES['monitoring']:-}#g;
+       s#__CONTROLLER_NODES__#${ROLE2NODES['controller']}#g;
+       s#__NODELIST__#${NODELIST}#g;
+       s#__DISPATCHER_INT_IP__#${DISPATCHER_INT_IP}#g;
+       s#__KEEPBALANCE_INT_IP__#${KEEPBALANCE_INT_IP}#g;
+       s#__COMPUTE_AMI__#${COMPUTE_AMI}#g;
+       s#__COMPUTE_SG__#${COMPUTE_SG}#g;
+       s#__COMPUTE_SUBNET__#${COMPUTE_SUBNET}#g;
+       s#__COMPUTE_AWS_REGION__#${COMPUTE_AWS_REGION}#g;
+       s#__COMPUTE_USER__#${COMPUTE_USER}#g;
+       s#__KEEP_AWS_REGION__#${KEEP_AWS_REGION}#g" \
+  "${SRCFILE}" > "${DSTFILE}"
+}
+
 DEV_MODE="no"
 CONFIG_FILE="${SCRIPT_DIR}/local.params"
 CONFIG_DIR="local_config_dir"
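For reference, apply_var_substitutions simply rewrites every __TOKEN__ placeholder in a pillar or state file with the matching value from local.params. A minimal standalone sketch of the same sed-based idea (file name and values are hypothetical):

<pre><code>CLUSTER=xarv1
DOMAIN=example.com
cat > /tmp/example.sls <<'EOF'
arvados:
  cluster:
    name: __CLUSTER__.__DOMAIN__
EOF
# Substitute the placeholders with the configured values
sed "s#__CLUSTER__#${CLUSTER}#g; s#__DOMAIN__#${DOMAIN}#g" /tmp/example.sls
# the last line of the output now reads:    name: xarv1.example.com
</code></pre>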
@@ -230,27 +296,22 @@ GRAFANA_TAG="v3.1.3"
 DUMP_SALT_CONFIG_DIR=""
 ## states
 S_DIR="/srv/salt"
+STATES_TOP=${S_DIR}/top.sls
 ## formulas
 F_DIR="/srv/formulas"
 ## pillars
 P_DIR="/srv/pillars"
+PILLARS_TOP=${P_DIR}/top.sls
 ## tests
 T_DIR="/tmp/cluster_tests"
 
 arguments ${@}
 
 declare -A NODES
+declare -A ROLES
+declare NODELIST
 
-if [[ -s ${CONFIG_FILE} && -s ${CONFIG_FILE}.secrets ]]; then
-  source ${CONFIG_FILE}.secrets
-  source ${CONFIG_FILE}
-else
-  echo >&2 "You don't seem to have a config file with initial values."
-  echo >&2 "Please create a '${CONFIG_FILE}' & '${CONFIG_FILE}.secrets' files as described in"
-  echo >&2 "  * https://doc.arvados.org/install/salt-single-host.html#single_host, or"
-  echo >&2 "  * https://doc.arvados.org/install/salt-multi-host.html#multi_host_multi_hostnames"
-  exit 1
-fi
+source common.sh
 
 if [ ! -d ${CONFIG_DIR} ]; then
   echo >&2 "You don't seem to have a config directory with pillars and states."
@@ -273,7 +334,7 @@ if ! grep -qE '^[[:alnum:]]{5}$' <<<${CLUSTER} ; then
 fi
 
 # Only used in single_host/single_name deploys
-if [ ! -z "${HOSTNAME_EXT}" ] ; then
+if [ ! -z "${HOSTNAME_EXT:-}" ] ; then
   # We need to add some extra control vars to manage a single certificate vs. multiple
   USE_SINGLE_HOSTNAME="yes"
   # Make sure that the value configured as IP_INT is a real IP on the system.
@@ -388,13 +449,13 @@ echo "...arvados"
 test -d arvados || git clone --quiet https://git.arvados.org/arvados-formula.git ${F_DIR}/arvados
 
 # If we want to try a specific branch of the formula
-if [ "x${BRANCH}" != "x" ]; then
+if [ "x${BRANCH:-}" != "x" ]; then
   ( cd ${F_DIR}/arvados && git checkout --quiet -t origin/"${BRANCH}" -b "${BRANCH}" )
-elif [ "x${ARVADOS_TAG}" != "x" ]; then
+elif [ "x${ARVADOS_TAG:-}" != "x" ]; then
   ( cd ${F_DIR}/arvados && git checkout --quiet tags/"${ARVADOS_TAG}" -b "${ARVADOS_TAG}" )
 fi
 
-if [ "x${VAGRANT}" = "xyes" ]; then
+if [ "x${VAGRANT:-}" = "xyes" ]; then
   EXTRA_STATES_DIR="/home/vagrant/${CONFIG_DIR}/states"
   SOURCE_PILLARS_DIR="/home/vagrant/${CONFIG_DIR}/pillars"
   SOURCE_TOFS_DIR="/home/vagrant/${CONFIG_DIR}/tofs"
@@ -417,57 +478,12 @@ if [ ! -d "${SOURCE_PILLARS_DIR}" ]; then
   exit 1
 fi
 for f in $(ls "${SOURCE_PILLARS_DIR}"/*); do
-  sed "s#__ANONYMOUS_USER_TOKEN__#${ANONYMOUS_USER_TOKEN}#g;
-       s#__BLOB_SIGNING_KEY__#${BLOB_SIGNING_KEY}#g;
-       s#__CONTROLLER_EXT_SSL_PORT__#${CONTROLLER_EXT_SSL_PORT}#g;
-       s#__CLUSTER__#${CLUSTER}#g;
-       s#__DOMAIN__#${DOMAIN}#g;
-       s#__HOSTNAME_EXT__#${HOSTNAME_EXT}#g;
-       s#__IP_INT__#${IP_INT}#g;
-       s#__INITIAL_USER_EMAIL__#${INITIAL_USER_EMAIL}#g;
-       s#__INITIAL_USER_PASSWORD__#${INITIAL_USER_PASSWORD}#g;
-       s#__INITIAL_USER__#${INITIAL_USER}#g;
-       s#__LE_AWS_REGION__#${LE_AWS_REGION}#g;
-       s#__LE_AWS_SECRET_ACCESS_KEY__#${LE_AWS_SECRET_ACCESS_KEY}#g;
-       s#__LE_AWS_ACCESS_KEY_ID__#${LE_AWS_ACCESS_KEY_ID}#g;
-       s#__DATABASE_PASSWORD__#${DATABASE_PASSWORD}#g;
-       s#__KEEPWEB_EXT_SSL_PORT__#${KEEPWEB_EXT_SSL_PORT}#g;
-       s#__KEEP_EXT_SSL_PORT__#${KEEP_EXT_SSL_PORT}#g;
-       s#__MANAGEMENT_TOKEN__#${MANAGEMENT_TOKEN}#g;
-       s#__RELEASE__#${RELEASE}#g;
-       s#__SYSTEM_ROOT_TOKEN__#${SYSTEM_ROOT_TOKEN}#g;
-       s#__VERSION__#${VERSION}#g;
-       s#__WEBSHELL_EXT_SSL_PORT__#${WEBSHELL_EXT_SSL_PORT}#g;
-       s#__WEBSOCKET_EXT_SSL_PORT__#${WEBSOCKET_EXT_SSL_PORT}#g;
-       s#__WORKBENCH1_EXT_SSL_PORT__#${WORKBENCH1_EXT_SSL_PORT}#g;
-       s#__WORKBENCH2_EXT_SSL_PORT__#${WORKBENCH2_EXT_SSL_PORT}#g;
-       s#__CLUSTER_INT_CIDR__#${CLUSTER_INT_CIDR}#g;
-       s#__CONTROLLER_INT_IP__#${CONTROLLER_INT_IP}#g;
-       s#__WEBSOCKET_INT_IP__#${WEBSOCKET_INT_IP}#g;
-       s#__KEEP_INT_IP__#${KEEP_INT_IP}#g;
-       s#__KEEPSTORE0_INT_IP__#${KEEPSTORE0_INT_IP}#g;
-       s#__KEEPWEB_INT_IP__#${KEEPWEB_INT_IP}#g;
-       s#__WEBSHELL_INT_IP__#${WEBSHELL_INT_IP}#g;
-       s#__SHELL_INT_IP__#${SHELL_INT_IP}#g;
-       s#__WORKBENCH1_INT_IP__#${WORKBENCH1_INT_IP}#g;
-       s#__WORKBENCH2_INT_IP__#${WORKBENCH2_INT_IP}#g;
-       s#__DATABASE_INT_IP__#${DATABASE_INT_IP}#g;
-       s#__WORKBENCH_SECRET_KEY__#${WORKBENCH_SECRET_KEY}#g;
-       s#__SSL_KEY_ENCRYPTED__#${SSL_KEY_ENCRYPTED}#g;
-       s#__SSL_KEY_AWS_REGION__#${SSL_KEY_AWS_REGION}#g;
-       s#__SSL_KEY_AWS_SECRET_NAME__#${SSL_KEY_AWS_SECRET_NAME}#g;
-       s#__CONTROLLER_NGINX_WORKERS__#${CONTROLLER_NGINX_WORKERS}#g;
-       s#__CONTROLLER_MAX_CONCURRENT_REQUESTS__#${CONTROLLER_MAX_CONCURRENT_REQUESTS}#g;
-       s#__MONITORING_USERNAME__#${MONITORING_USERNAME}#g;
-       s#__MONITORING_EMAIL__#${MONITORING_EMAIL}#g;
-       s#__MONITORING_PASSWORD__#${MONITORING_PASSWORD}#g;
-       s#__DISPATCHER_SSH_PRIVKEY__#${DISPATCHER_SSH_PRIVKEY//$'\n'/\\n}#g" \
-  "${f}" > "${P_DIR}"/$(basename "${f}")
+  apply_var_substitutions "${f}" "${P_DIR}"/$(basename "${f}")
 done
 
 if [ ! -d "${SOURCE_TESTS_DIR}" ]; then
   echo "WARNING: The tests directory was not copied to \"${SOURCE_TESTS_DIR}\"."
-  if [ "x${TEST}" = "xyes" ]; then
+  if [ "x${TEST:-}" = "xyes" ]; then
     echo "WARNING: Disabling tests for this installation."
   fi
   TEST="no"
@@ -500,49 +516,7 @@ if [ -d "${SOURCE_STATES_DIR}" ]; then
   rm -f "${F_DIR}"/extra/extra/*
 
   for f in $(ls "${SOURCE_STATES_DIR}"/*); do
-    sed "s#__ANONYMOUS_USER_TOKEN__#${ANONYMOUS_USER_TOKEN}#g;
-         s#__CLUSTER__#${CLUSTER}#g;
-         s#__BLOB_SIGNING_KEY__#${BLOB_SIGNING_KEY}#g;
-         s#__CONTROLLER_EXT_SSL_PORT__#${CONTROLLER_EXT_SSL_PORT}#g;
-         s#__DOMAIN__#${DOMAIN}#g;
-         s#__HOSTNAME_EXT__#${HOSTNAME_EXT}#g;
-         s#__IP_INT__#${IP_INT}#g;
-         s#__INITIAL_USER_EMAIL__#${INITIAL_USER_EMAIL}#g;
-         s#__INITIAL_USER_PASSWORD__#${INITIAL_USER_PASSWORD}#g;
-         s#__INITIAL_USER__#${INITIAL_USER}#g;
-         s#__DATABASE_PASSWORD__#${DATABASE_PASSWORD}#g;
-         s#__KEEPWEB_EXT_SSL_PORT__#${KEEPWEB_EXT_SSL_PORT}#g;
-         s#__KEEP_EXT_SSL_PORT__#${KEEP_EXT_SSL_PORT}#g;
-         s#__MANAGEMENT_TOKEN__#${MANAGEMENT_TOKEN}#g;
-         s#__RELEASE__#${RELEASE}#g;
-         s#__SYSTEM_ROOT_TOKEN__#${SYSTEM_ROOT_TOKEN}#g;
-         s#__VERSION__#${VERSION}#g;
-         s#__CLUSTER_INT_CIDR__#${CLUSTER_INT_CIDR}#g;
-         s#__CONTROLLER_INT_IP__#${CONTROLLER_INT_IP}#g;
-         s#__WEBSOCKET_INT_IP__#${WEBSOCKET_INT_IP}#g;
-         s#__KEEP_INT_IP__#${KEEP_INT_IP}#g;
-         s#__KEEPSTORE0_INT_IP__#${KEEPSTORE0_INT_IP}#g;
-         s#__KEEPWEB_INT_IP__#${KEEPWEB_INT_IP}#g;
-         s#__WEBSHELL_INT_IP__#${WEBSHELL_INT_IP}#g;
-         s#__WORKBENCH1_INT_IP__#${WORKBENCH1_INT_IP}#g;
-         s#__WORKBENCH2_INT_IP__#${WORKBENCH2_INT_IP}#g;
-         s#__DATABASE_INT_IP__#${DATABASE_INT_IP}#g;
-         s#__WEBSHELL_EXT_SSL_PORT__#${WEBSHELL_EXT_SSL_PORT}#g;
-         s#__SHELL_INT_IP__#${SHELL_INT_IP}#g;
-         s#__WEBSOCKET_EXT_SSL_PORT__#${WEBSOCKET_EXT_SSL_PORT}#g;
-         s#__WORKBENCH1_EXT_SSL_PORT__#${WORKBENCH1_EXT_SSL_PORT}#g;
-         s#__WORKBENCH2_EXT_SSL_PORT__#${WORKBENCH2_EXT_SSL_PORT}#g;
-         s#__WORKBENCH_SECRET_KEY__#${WORKBENCH_SECRET_KEY}#g;
-         s#__SSL_KEY_ENCRYPTED__#${SSL_KEY_ENCRYPTED}#g;
-         s#__SSL_KEY_AWS_REGION__#${SSL_KEY_AWS_REGION}#g;
-         s#__SSL_KEY_AWS_SECRET_NAME__#${SSL_KEY_AWS_SECRET_NAME}#g;
-         s#__CONTROLLER_NGINX_WORKERS__#${CONTROLLER_NGINX_WORKERS}#g;
-         s#__CONTROLLER_MAX_CONCURRENT_REQUESTS__#${CONTROLLER_MAX_CONCURRENT_REQUESTS}#g;
-         s#__MONITORING_USERNAME__#${MONITORING_USERNAME}#g;
-         s#__MONITORING_EMAIL__#${MONITORING_EMAIL}#g;
-         s#__MONITORING_PASSWORD__#${MONITORING_PASSWORD}#g;
-         s#__DISPATCHER_SSH_PRIVKEY__#${DISPATCHER_SSH_PRIVKEY//$'\n'/\\n}#g" \
-    "${f}" > "${F_DIR}/extra/extra"/$(basename "${f}")
+    apply_var_substitutions "${f}" "${F_DIR}/extra/extra"/$(basename "${f}")
   done
 fi
 
@@ -557,14 +531,14 @@ if [ -d ${SOURCE_TOFS_DIR} ]; then
 fi
 
 # States
-cat > ${S_DIR}/top.sls << EOFTSLS
+cat > ${STATES_TOP} << EOFTSLS
 base:
   '*':
     - locale
 EOFTSLS
 
 # Pillars
-cat > ${P_DIR}/top.sls << EOFPSLS
+cat > ${PILLARS_TOP} << EOFPSLS
 base:
   '*':
     - locale
@@ -582,7 +556,7 @@ if [ -d "${F_DIR}"/extra/extra ]; then
     SKIP_SNAKE_OIL="dont_add_snakeoil_certs"
   fi
   for f in $(ls "${F_DIR}"/extra/extra/*.sls | egrep -v "${SKIP_SNAKE_OIL}|shell_"); do
-  echo "    - extra.$(basename ${f} | sed 's/.sls$//g')" >> ${S_DIR}/top.sls
+  echo "    - extra.$(basename ${f} | sed 's/.sls$//g')" >> ${STATES_TOP}
   done
   # Use byo or self-signed certificates
   if [ "${SSL_MODE}" != "lets-encrypt" ]; then
@@ -594,49 +568,49 @@ fi
 # and its dependencies
 if [ -z "${ROLES}" ]; then
   # States
-  echo "    - nginx.passenger" >> ${S_DIR}/top.sls
+  echo "    - nginx.passenger" >> ${STATES_TOP}
   if [ "${SSL_MODE}" = "lets-encrypt" ]; then
     if [ "${USE_LETSENCRYPT_ROUTE53}" = "yes" ]; then
-      grep -q "aws_credentials" ${S_DIR}/top.sls || echo "    - extra.aws_credentials" >> ${S_DIR}/top.sls
+      grep -q "aws_credentials" ${STATES_TOP} || echo "    - extra.aws_credentials" >> ${STATES_TOP}
     fi
-    grep -q "letsencrypt" ${S_DIR}/top.sls || echo "    - letsencrypt" >> ${S_DIR}/top.sls
+    grep -q "letsencrypt" ${STATES_TOP} || echo "    - letsencrypt" >> ${STATES_TOP}
   else
     mkdir -p --mode=0700 /srv/salt/certs
     if [ "${SSL_MODE}" = "bring-your-own" ]; then
       # Copy certs to formula extra/files
       install --mode=0600 ${CUSTOM_CERTS_DIR}/* /srv/salt/certs/
       # We add the custom_certs state
-      grep -q "custom_certs" ${S_DIR}/top.sls || echo "    - extra.custom_certs" >> ${S_DIR}/top.sls
+      grep -q "custom_certs" ${STATES_TOP} || echo "    - extra.custom_certs" >> ${STATES_TOP}
       if [ "${SSL_KEY_ENCRYPTED}" = "yes" ]; then
-        grep -q "ssl_key_encrypted" ${S_DIR}/top.sls || echo "    - extra.ssl_key_encrypted" >> ${S_DIR}/top.sls
+        grep -q "ssl_key_encrypted" ${STATES_TOP} || echo "    - extra.ssl_key_encrypted" >> ${STATES_TOP}
       fi
     fi
     # In self-signed mode, the certificate files will be created and put in the
     # destination directory by the snakeoil_certs.sls state file
   fi
 
-  echo "    - postgres" >> ${S_DIR}/top.sls
-  echo "    - logrotate" >> ${S_DIR}/top.sls
-  echo "    - docker.software" >> ${S_DIR}/top.sls
-  echo "    - arvados" >> ${S_DIR}/top.sls
-  echo "    - extra.shell_sudo_passwordless" >> ${S_DIR}/top.sls
-  echo "    - extra.shell_cron_add_login_sync" >> ${S_DIR}/top.sls
-  echo "    - extra.passenger_rvm" >> ${S_DIR}/top.sls
+  echo "    - postgres" >> ${STATES_TOP}
+  echo "    - logrotate" >> ${STATES_TOP}
+  echo "    - docker.software" >> ${STATES_TOP}
+  echo "    - arvados" >> ${STATES_TOP}
+  echo "    - extra.shell_sudo_passwordless" >> ${STATES_TOP}
+  echo "    - extra.shell_cron_add_login_sync" >> ${STATES_TOP}
+  echo "    - extra.passenger_rvm" >> ${STATES_TOP}
 
   # Pillars
-  echo "    - docker" >> ${P_DIR}/top.sls
-  echo "    - nginx_api_configuration" >> ${P_DIR}/top.sls
-  echo "    - logrotate_api" >> ${P_DIR}/top.sls
-  echo "    - nginx_controller_configuration" >> ${P_DIR}/top.sls
-  echo "    - nginx_keepproxy_configuration" >> ${P_DIR}/top.sls
-  echo "    - nginx_keepweb_configuration" >> ${P_DIR}/top.sls
-  echo "    - nginx_passenger" >> ${P_DIR}/top.sls
-  echo "    - nginx_websocket_configuration" >> ${P_DIR}/top.sls
-  echo "    - nginx_webshell_configuration" >> ${P_DIR}/top.sls
-  echo "    - nginx_workbench2_configuration" >> ${P_DIR}/top.sls
-  echo "    - nginx_workbench_configuration" >> ${P_DIR}/top.sls
-  echo "    - logrotate_wb1" >> ${P_DIR}/top.sls
-  echo "    - postgresql" >> ${P_DIR}/top.sls
+  echo "    - docker" >> ${PILLARS_TOP}
+  echo "    - nginx_api_configuration" >> ${PILLARS_TOP}
+  echo "    - logrotate_api" >> ${PILLARS_TOP}
+  echo "    - nginx_controller_configuration" >> ${PILLARS_TOP}
+  echo "    - nginx_keepproxy_configuration" >> ${PILLARS_TOP}
+  echo "    - nginx_keepweb_configuration" >> ${PILLARS_TOP}
+  echo "    - nginx_passenger" >> ${PILLARS_TOP}
+  echo "    - nginx_websocket_configuration" >> ${PILLARS_TOP}
+  echo "    - nginx_webshell_configuration" >> ${PILLARS_TOP}
+  echo "    - nginx_workbench2_configuration" >> ${PILLARS_TOP}
+  echo "    - nginx_workbench_configuration" >> ${PILLARS_TOP}
+  echo "    - logrotate_wb1" >> ${PILLARS_TOP}
+  echo "    - postgresql" >> ${PILLARS_TOP}
 
   # We need to tweak the Nginx's pillar depending whether we want plain nginx or nginx+passenger
   NGINX_INSTALL_SOURCE="install_from_phusionpassenger"
@@ -644,9 +618,9 @@ if [ -z "${ROLES}" ]; then
 
   if [ "${SSL_MODE}" = "lets-encrypt" ]; then
     if [ "${USE_LETSENCRYPT_ROUTE53}" = "yes" ]; then
-      grep -q "aws_credentials" ${P_DIR}/top.sls || echo "    - aws_credentials" >> ${P_DIR}/top.sls
+      grep -q "aws_credentials" ${PILLARS_TOP} || echo "    - aws_credentials" >> ${PILLARS_TOP}
     fi
-    grep -q "letsencrypt" ${P_DIR}/top.sls || echo "    - letsencrypt" >> ${P_DIR}/top.sls
+    grep -q "letsencrypt" ${PILLARS_TOP} || echo "    - letsencrypt" >> ${PILLARS_TOP}
 
     hosts=("controller" "websocket" "workbench" "workbench2" "webshell" "keepproxy")
     if [ ${USE_SINGLE_HOSTNAME} = "no" ]; then
@@ -673,7 +647,7 @@ if [ -z "${ROLES}" ]; then
     done
   else
     # Use custom certs (either dev mode or prod)
-    grep -q "extra_custom_certs" ${P_DIR}/top.sls || echo "    - extra_custom_certs" >> ${P_DIR}/top.sls
+    grep -q "extra_custom_certs" ${PILLARS_TOP} || echo "    - extra_custom_certs" >> ${PILLARS_TOP}
     # And add the certs in the custom_certs pillar
     echo "extra_custom_certs_dir: /srv/salt/certs" > ${P_DIR}/extra_custom_certs.sls
     echo "extra_custom_certs:" >> ${P_DIR}/extra_custom_certs.sls
@@ -703,11 +677,11 @@ if [ -z "${ROLES}" ]; then
   fi
 else
   # If we add individual roles, make sure we add the repo first
-  echo "    - arvados.repo" >> ${S_DIR}/top.sls
+  echo "    - arvados.repo" >> ${STATES_TOP}
   # We add the extra_custom_certs state
-  grep -q "extra.custom_certs"    ${S_DIR}/top.sls || echo "    - extra.custom_certs" >> ${S_DIR}/top.sls
+  grep -q "extra.custom_certs"    ${STATES_TOP} || echo "    - extra.custom_certs" >> ${STATES_TOP}
   if [ "${SSL_KEY_ENCRYPTED}" = "yes" ]; then
-    grep -q "ssl_key_encrypted" ${S_DIR}/top.sls || echo "    - extra.ssl_key_encrypted" >> ${S_DIR}/top.sls
+    grep -q "ssl_key_encrypted" ${STATES_TOP} || echo "    - extra.ssl_key_encrypted" >> ${STATES_TOP}
   fi
 
   # And we add the basic part for the certs pillar
@@ -715,23 +689,23 @@ else
     # And add the certs in the custom_certs pillar
     echo "extra_custom_certs_dir: /srv/salt/certs" > ${P_DIR}/extra_custom_certs.sls
     echo "extra_custom_certs:" >> ${P_DIR}/extra_custom_certs.sls
-    grep -q "extra_custom_certs" ${P_DIR}/top.sls || echo "    - extra_custom_certs" >> ${P_DIR}/top.sls
+    grep -q "extra_custom_certs" ${PILLARS_TOP} || echo "    - extra_custom_certs" >> ${PILLARS_TOP}
   fi
 
   # Prometheus state on all nodes due to the node exporter below
-  grep -q "\- prometheus$" ${S_DIR}/top.sls || echo "    - prometheus" >> ${S_DIR}/top.sls
+  grep -q "\- prometheus$" ${STATES_TOP} || echo "    - prometheus" >> ${STATES_TOP}
   # Prometheus node exporter pillar
-  grep -q "prometheus_node_exporter" ${P_DIR}/top.sls || echo "    - prometheus_node_exporter" >> ${P_DIR}/top.sls
+  grep -q "prometheus_node_exporter" ${PILLARS_TOP} || echo "    - prometheus_node_exporter" >> ${PILLARS_TOP}
 
   for R in ${ROLES}; do
     case "${R}" in
       "database")
         # States
-        grep -q "\- postgres$" ${S_DIR}/top.sls || echo "    - postgres" >> ${S_DIR}/top.sls
-        grep -q "extra.prometheus_pg_exporter" ${S_DIR}/top.sls || echo "    - extra.prometheus_pg_exporter" >> ${S_DIR}/top.sls
+        grep -q "\- postgres$" ${STATES_TOP} || echo "    - postgres" >> ${STATES_TOP}
+        grep -q "extra.prometheus_pg_exporter" ${STATES_TOP} || echo "    - extra.prometheus_pg_exporter" >> ${STATES_TOP}
         # Pillars
-        grep -q "postgresql" ${P_DIR}/top.sls || echo "    - postgresql" >> ${P_DIR}/top.sls
-        grep -q "prometheus_pg_exporter" ${P_DIR}/top.sls || echo "    - prometheus_pg_exporter" >> ${P_DIR}/top.sls
+        grep -q "postgresql" ${PILLARS_TOP} || echo "    - postgresql" >> ${PILLARS_TOP}
+        grep -q "prometheus_pg_exporter" ${PILLARS_TOP} || echo "    - prometheus_pg_exporter" >> ${PILLARS_TOP}
       ;;
       "monitoring")
         ### Support files ###
@@ -746,18 +720,18 @@ else
         done
 
         ### States ###
-        grep -q "\- nginx$" ${S_DIR}/top.sls || echo "    - nginx" >> ${S_DIR}/top.sls
-        grep -q "extra.nginx_prometheus_configuration" ${S_DIR}/top.sls || echo "    - extra.nginx_prometheus_configuration" >> ${S_DIR}/top.sls
+        grep -q "\- nginx$" ${STATES_TOP} || echo "    - nginx" >> ${STATES_TOP}
+        grep -q "extra.nginx_prometheus_configuration" ${STATES_TOP} || echo "    - extra.nginx_prometheus_configuration" >> ${STATES_TOP}
 
-        grep -q "\- grafana$" ${S_DIR}/top.sls || echo "    - grafana" >> ${S_DIR}/top.sls
-        grep -q "extra.grafana_datasource" ${S_DIR}/top.sls || echo "    - extra.grafana_datasource" >> ${S_DIR}/top.sls
-        grep -q "extra.grafana_dashboards" ${S_DIR}/top.sls || echo "    - extra.grafana_dashboards" >> ${S_DIR}/top.sls
-        grep -q "extra.grafana_admin_user" ${S_DIR}/top.sls || echo "    - extra.grafana_admin_user" >> ${S_DIR}/top.sls
+        grep -q "\- grafana$" ${STATES_TOP} || echo "    - grafana" >> ${STATES_TOP}
+        grep -q "extra.grafana_datasource" ${STATES_TOP} || echo "    - extra.grafana_datasource" >> ${STATES_TOP}
+        grep -q "extra.grafana_dashboards" ${STATES_TOP} || echo "    - extra.grafana_dashboards" >> ${STATES_TOP}
+        grep -q "extra.grafana_admin_user" ${STATES_TOP} || echo "    - extra.grafana_admin_user" >> ${STATES_TOP}
 
         if [ "${SSL_MODE}" = "lets-encrypt" ]; then
-          grep -q "letsencrypt"     ${S_DIR}/top.sls || echo "    - letsencrypt" >> ${S_DIR}/top.sls
-          if [ "x${USE_LETSENCRYPT_ROUTE53}" = "xyes" ]; then
-            grep -q "aws_credentials" ${S_DIR}/top.sls || echo "    - aws_credentials" >> ${S_DIR}/top.sls
+          grep -q "letsencrypt"     ${STATES_TOP} || echo "    - letsencrypt" >> ${STATES_TOP}
+          if [ "x${USE_LETSENCRYPT_ROUTE53:-}" = "xyes" ]; then
+            grep -q "aws_credentials" ${STATES_TOP} || echo "    - aws_credentials" >> ${STATES_TOP}
           fi
         elif [ "${SSL_MODE}" = "bring-your-own" ]; then
           for SVC in grafana prometheus; do
@@ -765,25 +739,25 @@ else
           done
         fi
         ### Pillars ###
-        grep -q "prometheus_server" ${P_DIR}/top.sls || echo "    - prometheus_server" >> ${P_DIR}/top.sls
-        grep -q "grafana" ${P_DIR}/top.sls || echo "    - grafana" >> ${P_DIR}/top.sls
+        grep -q "prometheus_server" ${PILLARS_TOP} || echo "    - prometheus_server" >> ${PILLARS_TOP}
+        grep -q "grafana" ${PILLARS_TOP} || echo "    - grafana" >> ${PILLARS_TOP}
         for SVC in grafana prometheus; do
-          grep -q "nginx_${SVC}_configuration" ${P_DIR}/top.sls || echo "    - nginx_${SVC}_configuration" >> ${P_DIR}/top.sls
+          grep -q "nginx_${SVC}_configuration" ${PILLARS_TOP} || echo "    - nginx_${SVC}_configuration" >> ${PILLARS_TOP}
         done
         if [ "${SSL_MODE}" = "lets-encrypt" ]; then
-          grep -q "letsencrypt"     ${P_DIR}/top.sls || echo "    - letsencrypt" >> ${P_DIR}/top.sls
+          grep -q "letsencrypt"     ${PILLARS_TOP} || echo "    - letsencrypt" >> ${PILLARS_TOP}
           for SVC in grafana prometheus; do
-            grep -q "letsencrypt_${SVC}_configuration" ${P_DIR}/top.sls || echo "    - letsencrypt_${SVC}_configuration" >> ${P_DIR}/top.sls
+            grep -q "letsencrypt_${SVC}_configuration" ${PILLARS_TOP} || echo "    - letsencrypt_${SVC}_configuration" >> ${PILLARS_TOP}
             sed -i "s/__CERT_REQUIRES__/cmd: create-initial-cert-${SVC}.${DOMAIN}*/g;
                     s#__CERT_PEM__#/etc/letsencrypt/live/${SVC}.${DOMAIN}/fullchain.pem#g;
                     s#__CERT_KEY__#/etc/letsencrypt/live/${SVC}.${DOMAIN}/privkey.pem#g" \
             ${P_DIR}/nginx_${SVC}_configuration.sls
           done
           if [ "${USE_LETSENCRYPT_ROUTE53}" = "yes" ]; then
-            grep -q "aws_credentials" ${P_DIR}/top.sls || echo "    - aws_credentials" >> ${P_DIR}/top.sls
+            grep -q "aws_credentials" ${PILLARS_TOP} || echo "    - aws_credentials" >> ${PILLARS_TOP}
           fi
         elif [ "${SSL_MODE}" = "bring-your-own" ]; then
-          grep -q "ssl_key_encrypted" ${P_DIR}/top.sls || echo "    - ssl_key_encrypted" >> ${P_DIR}/top.sls
+          grep -q "ssl_key_encrypted" ${PILLARS_TOP} || echo "    - ssl_key_encrypted" >> ${PILLARS_TOP}
           for SVC in grafana prometheus; do
             sed -i "s/__CERT_REQUIRES__/file: extra_custom_certs_file_copy_arvados-${SVC}.pem/g;
                     s#__CERT_PEM__#/etc/nginx/ssl/arvados-${SVC}.pem#g;
@@ -793,59 +767,121 @@ else
           done
         fi
       ;;
-      "api")
-        # States
-        grep -q "    - logrotate" ${S_DIR}/top.sls || echo "    - logrotate" >> ${S_DIR}/top.sls
-        if grep -q "    - nginx.*$" ${S_DIR}/top.sls; then
-          sed -i s/"^    - nginx.*$"/"    - nginx.passenger"/g ${S_DIR}/top.sls
-        else
-          echo "    - nginx.passenger" >> ${S_DIR}/top.sls
+      "balancer")
+        ### States ###
+        grep -q "\- nginx$" ${STATES_TOP} || echo "    - nginx" >> ${STATES_TOP}
+
+        if [ "${SSL_MODE}" = "lets-encrypt" ]; then
+          grep -q "letsencrypt"     ${STATES_TOP} || echo "    - letsencrypt" >> ${STATES_TOP}
+          if [ "x${USE_LETSENCRYPT_ROUTE53:-}" = "xyes" ]; then
+            grep -q "aws_credentials" ${STATES_TOP} || echo "    - aws_credentials" >> ${STATES_TOP}
+          fi
+        elif [ "${SSL_MODE}" = "bring-your-own" ]; then
+          copy_custom_cert ${CUSTOM_CERTS_DIR} ${R}
         fi
-        echo "    - extra.passenger_rvm" >> ${S_DIR}/top.sls
-        ### If we don't install and run LE before arvados-api-server, it fails and breaks everything
-        ### after it. So we add this here as we are, after all, sharing the host for api and controller
+
+        ### Pillars ###
+        grep -q "nginx_${R}_configuration" ${PILLARS_TOP} || echo "    - nginx_${R}_configuration" >> ${PILLARS_TOP}
+
         if [ "${SSL_MODE}" = "lets-encrypt" ]; then
+          grep -q "letsencrypt"     ${PILLARS_TOP} || echo "    - letsencrypt" >> ${PILLARS_TOP}
+
+          grep -q "letsencrypt_${R}_configuration" ${PILLARS_TOP} || echo "    - letsencrypt_${R}_configuration" >> ${PILLARS_TOP}
+          sed -i "s/__CERT_REQUIRES__/cmd: create-initial-cert-${ROLE2NODES['balancer']}*/g;
+                  s#__CERT_PEM__#/etc/letsencrypt/live/${ROLE2NODES['balancer']}/fullchain.pem#g;
+                  s#__CERT_KEY__#/etc/letsencrypt/live/${ROLE2NODES['balancer']}/privkey.pem#g" \
+          ${P_DIR}/nginx_${R}_configuration.sls
+
           if [ "${USE_LETSENCRYPT_ROUTE53}" = "yes" ]; then
-            grep -q "aws_credentials" ${S_DIR}/top.sls || echo "    - aws_credentials" >> ${S_DIR}/top.sls
+            grep -q "aws_credentials" ${PILLARS_TOP} || echo "    - aws_credentials" >> ${PILLARS_TOP}
           fi
-          grep -q "letsencrypt" ${S_DIR}/top.sls || echo "    - letsencrypt" >> ${S_DIR}/top.sls
+        elif [ "${SSL_MODE}" = "bring-your-own" ]; then
+          grep -q "ssl_key_encrypted" ${PILLARS_TOP} || echo "    - ssl_key_encrypted" >> ${PILLARS_TOP}
+          sed -i "s/__CERT_REQUIRES__/file: extra_custom_certs_file_copy_arvados-${R}.pem/g;
+                  s#__CERT_PEM__#/etc/nginx/ssl/arvados-${R}.pem#g;
+                  s#__CERT_KEY__#/etc/nginx/ssl/arvados-${R}.key#g" \
+            ${P_DIR}/nginx_${R}_configuration.sls
+          grep -q "${R}" ${P_DIR}/extra_custom_certs.sls || echo "  - ${R}" >> ${P_DIR}/extra_custom_certs.sls
+        fi
+      ;;
+      "controller")
+        ### States ###
+        grep -q "    - logrotate" ${STATES_TOP} || echo "    - logrotate" >> ${STATES_TOP}
+        if grep -q "    - nginx.*$" ${STATES_TOP}; then
+          sed -i s/"^    - nginx.*$"/"    - nginx.passenger"/g ${STATES_TOP}
         else
-          # Use custom certs
-          if [ "${SSL_MODE}" = "bring-your-own" ]; then
-            copy_custom_cert ${CUSTOM_CERTS_DIR} controller
+          echo "    - nginx.passenger" >> ${STATES_TOP}
+        fi
+        echo "    - extra.passenger_rvm" >> ${STATES_TOP}
+
+        ### If we don't install and run LE before arvados-api-server, it fails and breaks everything
+        ### after it. So we add this here as we are, after all, sharing the host for api and controller
+        if [ "${ENABLE_BALANCER}" == "no" ]; then
+          if [ "${SSL_MODE}" = "lets-encrypt" ]; then
+            if [ "x${USE_LETSENCRYPT_ROUTE53:-}" = "xyes" ]; then
+              grep -q "aws_credentials" ${STATES_TOP} || echo "    - aws_credentials" >> ${STATES_TOP}
+            fi
+            grep -q "letsencrypt"     ${STATES_TOP} || echo "    - letsencrypt" >> ${STATES_TOP}
+          elif [ "${SSL_MODE}" = "bring-your-own" ]; then
+            copy_custom_cert ${CUSTOM_CERTS_DIR} ${R}
+            grep -q controller ${P_DIR}/extra_custom_certs.sls || echo "  - controller" >> ${P_DIR}/extra_custom_certs.sls
           fi
-          grep -q controller ${P_DIR}/extra_custom_certs.sls || echo "  - controller" >> ${P_DIR}/extra_custom_certs.sls
         fi
-        grep -q "arvados.${R}" ${S_DIR}/top.sls    || echo "    - arvados.${R}" >> ${S_DIR}/top.sls
-        # Pillars
-        grep -q "logrotate_api" ${P_DIR}/top.sls            || echo "    - logrotate_api" >> ${P_DIR}/top.sls
-        grep -q "aws_credentials" ${P_DIR}/top.sls          || echo "    - aws_credentials" >> ${P_DIR}/top.sls
-        grep -q "postgresql" ${P_DIR}/top.sls               || echo "    - postgresql" >> ${P_DIR}/top.sls
-        grep -q "nginx_passenger" ${P_DIR}/top.sls          || echo "    - nginx_passenger" >> ${P_DIR}/top.sls
-        grep -q "nginx_${R}_configuration" ${P_DIR}/top.sls || echo "    - nginx_${R}_configuration" >> ${P_DIR}/top.sls
+        grep -q "arvados.api" ${STATES_TOP} || echo "    - arvados.api" >> ${STATES_TOP}
+        grep -q "arvados.controller" ${STATES_TOP} || echo "    - arvados.controller" >> ${STATES_TOP}
 
+        ### Pillars ###
+        grep -q "logrotate_api" ${PILLARS_TOP}            || echo "    - logrotate_api" >> ${PILLARS_TOP}
+        grep -q "aws_credentials" ${PILLARS_TOP}          || echo "    - aws_credentials" >> ${PILLARS_TOP}
+        grep -q "postgresql" ${PILLARS_TOP}               || echo "    - postgresql" >> ${PILLARS_TOP}
+        grep -q "nginx_passenger" ${PILLARS_TOP}          || echo "    - nginx_passenger" >> ${PILLARS_TOP}
+        grep -q "nginx_api_configuration" ${PILLARS_TOP} || echo "    - nginx_api_configuration" >> ${PILLARS_TOP}
+        grep -q "nginx_controller_configuration" ${PILLARS_TOP} || echo "    - nginx_controller_configuration" >> ${PILLARS_TOP}
+
+        if [ "${ENABLE_BALANCER}" == "no" ]; then
+          if [ "${SSL_MODE}" = "lets-encrypt" ]; then
+            if [ "${USE_LETSENCRYPT_ROUTE53}" = "yes" ]; then
+              grep -q "aws_credentials" ${PILLARS_TOP} || echo "    - aws_credentials" >> ${PILLARS_TOP}
+            fi
+
+            grep -q "letsencrypt"     ${PILLARS_TOP} || echo "    - letsencrypt" >> ${PILLARS_TOP}
+            grep -q "letsencrypt_${R}_configuration" ${PILLARS_TOP} || echo "    - letsencrypt_${R}_configuration" >> ${PILLARS_TOP}
+            sed -i "s/__CERT_REQUIRES__/cmd: create-initial-cert-${R}.${DOMAIN}*/g;
+                    s#__CERT_PEM__#/etc/letsencrypt/live/${R}.${DOMAIN}/fullchain.pem#g;
+                    s#__CERT_KEY__#/etc/letsencrypt/live/${R}.${DOMAIN}/privkey.pem#g" \
+            ${P_DIR}/nginx_${R}_configuration.sls
+          else
+            grep -q "ssl_key_encrypted" ${PILLARS_TOP} || echo "    - ssl_key_encrypted" >> ${PILLARS_TOP}
+            sed -i "s/__CERT_REQUIRES__/file: extra_custom_certs_file_copy_arvados-${R}.pem/g;
+                    s#__CERT_PEM__#/etc/nginx/ssl/arvados-${R}.pem#g;
+                    s#__CERT_KEY__#/etc/nginx/ssl/arvados-${R}.key#g" \
+            ${P_DIR}/nginx_${R}_configuration.sls
+            grep -q ${R} ${P_DIR}/extra_custom_certs.sls || echo "  - ${R}" >> ${P_DIR}/extra_custom_certs.sls
+          fi
+        fi
         # We need to tweak the Nginx's pillar depending whether we want plain nginx or nginx+passenger
         NGINX_INSTALL_SOURCE="install_from_phusionpassenger"
         sed -i "s/__NGINX_INSTALL_SOURCE__/${NGINX_INSTALL_SOURCE}/g" ${P_DIR}/nginx_passenger.sls
       ;;
-      "controller" | "websocket" | "workbench" | "workbench2" | "webshell" | "keepweb" | "keepproxy")
-        # States
+      "websocket" | "workbench" | "workbench2" | "webshell" | "keepweb" | "keepproxy")
+        ### States ###
         if [ "${R}" = "workbench" ]; then
-          grep -q "    - logrotate" ${S_DIR}/top.sls || echo "    - logrotate" >> ${S_DIR}/top.sls
+          grep -q "    - logrotate" ${STATES_TOP} || echo "    - logrotate" >> ${STATES_TOP}
           NGINX_INSTALL_SOURCE="install_from_phusionpassenger"
-          if grep -q "    - nginx$" ${S_DIR}/top.sls; then
-            sed -i s/"^    - nginx.*$"/"    - nginx.passenger"/g ${S_DIR}/top.sls
+          if grep -q "    - nginx$" ${STATES_TOP}; then
+            sed -i s/"^    - nginx.*$"/"    - nginx.passenger"/g ${STATES_TOP}
           else
-            echo "    - nginx.passenger" >> ${S_DIR}/top.sls
+            echo "    - nginx.passenger" >> ${STATES_TOP}
           fi
         else
-          grep -q "\- nginx$" ${S_DIR}/top.sls || echo "    - nginx" >> ${S_DIR}/top.sls
+          grep -q "\- nginx$" ${STATES_TOP} || echo "    - nginx" >> ${STATES_TOP}
         fi
+
         if [ "${SSL_MODE}" = "lets-encrypt" ]; then
-          if [ "x${USE_LETSENCRYPT_ROUTE53}" = "xyes" ]; then
-            grep -q "aws_credentials" ${S_DIR}/top.sls || echo "    - aws_credentials" >> ${S_DIR}/top.sls
+          if [ "x${USE_LETSENCRYPT_ROUTE53:-}" = "xyes" ]; then
+            grep -q "aws_credentials" ${STATES_TOP} || echo "    - aws_credentials" >> ${STATES_TOP}
           fi
-          grep -q "letsencrypt"     ${S_DIR}/top.sls || echo "    - letsencrypt" >> ${S_DIR}/top.sls
+          grep -q "letsencrypt"     ${STATES_TOP} || echo "    - letsencrypt" >> ${STATES_TOP}
         else
           # Use custom certs, special case for keepweb
           if [ ${R} = "keepweb" ]; then
@@ -859,28 +895,30 @@ else
             fi
           fi
         fi
+
         # webshell role is just a nginx vhost, so it has no state
         if [ "${R}" != "webshell" ]; then
-          grep -q "arvados.${R}" ${S_DIR}/top.sls || echo "    - arvados.${R}" >> ${S_DIR}/top.sls
+          grep -q "arvados.${R}" ${STATES_TOP} || echo "    - arvados.${R}" >> ${STATES_TOP}
         fi
-        # Pillars
+
+        ### Pillars ###
         if [ "${R}" = "workbench" ]; then
-          grep -q "logrotate_wb1" ${P_DIR}/top.sls || echo "    - logrotate_wb1" >> ${P_DIR}/top.sls
+          grep -q "logrotate_wb1" ${PILLARS_TOP} || echo "    - logrotate_wb1" >> ${PILLARS_TOP}
         fi
-        grep -q "nginx_passenger" ${P_DIR}/top.sls          || echo "    - nginx_passenger" >> ${P_DIR}/top.sls
-        grep -q "nginx_${R}_configuration" ${P_DIR}/top.sls || echo "    - nginx_${R}_configuration" >> ${P_DIR}/top.sls
+        grep -q "nginx_passenger" ${PILLARS_TOP}          || echo "    - nginx_passenger" >> ${PILLARS_TOP}
+        grep -q "nginx_${R}_configuration" ${PILLARS_TOP} || echo "    - nginx_${R}_configuration" >> ${PILLARS_TOP}
         # Special case for keepweb
         if [ ${R} = "keepweb" ]; then
-          grep -q "nginx_download_configuration" ${P_DIR}/top.sls || echo "    - nginx_download_configuration" >> ${P_DIR}/top.sls
-          grep -q "nginx_collections_configuration" ${P_DIR}/top.sls || echo "    - nginx_collections_configuration" >> ${P_DIR}/top.sls
+          grep -q "nginx_download_configuration" ${PILLARS_TOP} || echo "    - nginx_download_configuration" >> ${PILLARS_TOP}
+          grep -q "nginx_collections_configuration" ${PILLARS_TOP} || echo "    - nginx_collections_configuration" >> ${PILLARS_TOP}
         fi
 
         if [ "${SSL_MODE}" = "lets-encrypt" ]; then
           if [ "${USE_LETSENCRYPT_ROUTE53}" = "yes" ]; then
-            grep -q "aws_credentials" ${P_DIR}/top.sls || echo "    - aws_credentials" >> ${P_DIR}/top.sls
+            grep -q "aws_credentials" ${PILLARS_TOP} || echo "    - aws_credentials" >> ${PILLARS_TOP}
           fi
-          grep -q "letsencrypt"     ${P_DIR}/top.sls || echo "    - letsencrypt" >> ${P_DIR}/top.sls
-          grep -q "letsencrypt_${R}_configuration" ${P_DIR}/top.sls || echo "    - letsencrypt_${R}_configuration" >> ${P_DIR}/top.sls
+          grep -q "letsencrypt"     ${PILLARS_TOP} || echo "    - letsencrypt" >> ${PILLARS_TOP}
+          grep -q "letsencrypt_${R}_configuration" ${PILLARS_TOP} || echo "    - letsencrypt_${R}_configuration" >> ${PILLARS_TOP}
 
           # As the pillars differ depending on whether we use LE or custom certs, we need to do a final edit on them
           # Special case for keepweb
@@ -898,7 +936,7 @@ else
             ${P_DIR}/nginx_${R}_configuration.sls
           fi
         else
-          grep -q "ssl_key_encrypted" ${P_DIR}/top.sls || echo "    - ssl_key_encrypted" >> ${P_DIR}/top.sls
+          grep -q "ssl_key_encrypted" ${PILLARS_TOP} || echo "    - ssl_key_encrypted" >> ${PILLARS_TOP}
           # As the pillars differ depending on whether we use LE or custom certs, we need to do a final edit on them
           # Special case for keepweb
           if [ ${R} = "keepweb" ]; then
@@ -922,16 +960,16 @@ else
       ;;
       "shell")
         # States
-        echo "    - extra.shell_sudo_passwordless" >> ${S_DIR}/top.sls
-        echo "    - extra.shell_cron_add_login_sync" >> ${S_DIR}/top.sls
-        grep -q "docker" ${S_DIR}/top.sls       || echo "    - docker.software" >> ${S_DIR}/top.sls
-        grep -q "arvados.${R}" ${S_DIR}/top.sls || echo "    - arvados.${R}" >> ${S_DIR}/top.sls
+        echo "    - extra.shell_sudo_passwordless" >> ${STATES_TOP}
+        echo "    - extra.shell_cron_add_login_sync" >> ${STATES_TOP}
+        grep -q "docker" ${STATES_TOP}       || echo "    - docker.software" >> ${STATES_TOP}
+        grep -q "arvados.${R}" ${STATES_TOP} || echo "    - arvados.${R}" >> ${STATES_TOP}
         # Pillars
-        grep -q "docker" ${P_DIR}/top.sls       || echo "    - docker" >> ${P_DIR}/top.sls
+        grep -q "docker" ${PILLARS_TOP}       || echo "    - docker" >> ${PILLARS_TOP}
       ;;
       "dispatcher" | "keepbalance" | "keepstore")
         # States
-        grep -q "arvados.${R}" ${S_DIR}/top.sls || echo "    - arvados.${R}" >> ${S_DIR}/top.sls
+        grep -q "arvados.${R}" ${STATES_TOP} || echo "    - arvados.${R}" >> ${STATES_TOP}
         # Pillars
         # ATM, no specific pillar needed
       ;;
@@ -960,21 +998,21 @@ fi
 # Leave a copy of the Arvados CA so the user can copy it where it's required
 if [ "${SSL_MODE}" = "self-signed" ]; then
   echo "Copying the Arvados CA certificate '${DOMAIN}-arvados-snakeoil-ca.crt' to the installer dir, so you can import it"
-  if [ "x${VAGRANT}" = "xyes" ]; then
+  if [ "x${VAGRANT:-}" = "xyes" ]; then
     cp /etc/ssl/certs/arvados-snakeoil-ca.pem /vagrant/${DOMAIN}-arvados-snakeoil-ca.pem
   else
     cp /etc/ssl/certs/arvados-snakeoil-ca.pem ${SCRIPT_DIR}/${DOMAIN}-arvados-snakeoil-ca.crt
   fi
 fi
 
-if [ "x${VAGRANT}" = "xyes" ]; then
+if [ "x${VAGRANT:-}" = "xyes" ]; then
     # If running in a vagrant VM, also add default user to docker group
     echo "Adding the vagrant user to the docker group"
     usermod -a -G docker vagrant
 fi
 
 # Test that the installation finished correctly
-if [ "x${TEST}" = "xyes" ]; then
+if [ "x${TEST:-}" = "xyes" ]; then
   cd ${T_DIR}
   # If we use RVM, we need to run this with it, or most ruby commands will fail
   RVM_EXEC=""
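With the new role in place, a dedicated load-balancer node would be deployed by naming it in --roles, roughly like this (run as root on the balancer host with its configuration already copied there; the exact invocation may differ in a full installer.sh deployment):

<pre><code class="userinput">./provision.sh --roles balancer
</code></pre>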
index 618da3a51701dbe0453e049f65108a21d9853262..807bd7d01f14c663e51f2fda0be86ccd92ecd332 100644 (file)
@@ -20,9 +20,8 @@ locals {
   compute_node_iam_role_name = data.terraform_remote_state.data-storage.outputs.compute_node_iam_role_name
   instance_profile = {
     default = aws_iam_instance_profile.default_instance_profile
-    controller = aws_iam_instance_profile.dispatcher_instance_profile
+    workbench = aws_iam_instance_profile.dispatcher_instance_profile
     keep0 = aws_iam_instance_profile.keepstore_instance_profile
-    keep1 = aws_iam_instance_profile.keepstore_instance_profile
   }
   private_subnet_id = data.terraform_remote_state.vpc.outputs.private_subnet_id
   public_subnet_id = data.terraform_remote_state.vpc.outputs.public_subnet_id
index bbc5f84958df40cff6c44ef7a7b9f3cc42a52c71..867034624429e49fb2646f18fccefaf072b95fd0 100644 (file)
 
 # Optional cluster service nodes configuration:
 #
-# List of node names which either will be hosting user-facing or internal services
-# user_facing_hosts = ["node1", "node2", ...]
-# internal_service_hosts = ["node3", ...]
+# List of node names which will be hosting either user-facing or internal
+# services. Defaults:
+# user_facing_hosts = [ "controller", "workbench" ]
+# internal_service_hosts = [ "keep0", "shell" ]
 #
-# Map assigning each node name an internal IP address
+# Map assigning each node name an internal IP address. Defaults:
 # private_ip = {
-#   node1 = "1.2.3.4"
-#   ...
+#   controller = "10.1.1.11"
+#   workbench = "10.1.1.15"
+#   shell = "10.1.2.17"
+#   keep0 = "10.1.2.13"
 # }
 #
-# Map assigning DNS aliases for service node names
+# Map assigning DNS aliases for service node names. Defaults:
 # dns_aliases = {
-#   node1 = ["alias1", "alias2", ...]
-#   ...
+#   workbench = [
+#     "ws",
+#     "workbench2",
+#     "webshell",
+#     "keep",
+#     "download",
+#     "prometheus",
+#     "grafana",
+#     "*.collections"
+#   ]
 # }
\ No newline at end of file
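Following the commented defaults above, a deployment can override the node layout by appending its own values to terraform.tfvars, for instance (hypothetical addresses, shown here as a shell snippet):

<pre><code>cat >> terraform.tfvars <<'EOF'
private_ip = {
  controller = "10.1.1.21"
  workbench  = "10.1.1.25"
  shell      = "10.1.2.27"
  keep0      = "10.1.2.23"
}
EOF
</code></pre>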
index b91cc421497c15a0e9fde05be7378ba528052dd5..c8d366a199dc435aa078fc13583a40c1df764e62 100644 (file)
@@ -54,8 +54,8 @@ variable "dns_aliases" {
   description = "Sets DNS name aliases for every service node"
   type = map(list(string))
   default = {
-    controller = ["ws"]
     workbench = [
+      "ws",
       "workbench2",
       "webshell",
       "keep",