20953: Adds blackbox-exporter HTTP probes and an SSL Monitor Grafana dashboard.
[arvados.git] / tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_server.sls
index bbf997b7be364700372d55b308ad527b7a7a9aae..143097b4a454fc7a58c804fe6ed1b5d02e5f2580 100644
--- a/tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_server.sls
+++ b/tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_server.sls
@@ -3,6 +3,10 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
+{%- set controller_nodes = "__CONTROLLER_NODES__".split(',') %}
+{%- set enable_balancer = ("__ENABLE_BALANCER__"|to_bool) %}
+{%- set data_retention_time = "__PROMETHEUS_DATA_RETENTION_TIME__" %}
+
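+# NOTE: the __UPPERCASE__ tokens in this file are placeholders that the
+# installer fills in with site-specific values before the pillar is applied.
+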
 ### PROMETHEUS
 prometheus:
   wanted:
@@ -10,10 +14,27 @@ prometheus:
       - prometheus
       - alertmanager
       - node_exporter
+      - blackbox_exporter
   pkg:
-    use_upstream_repo: true
+    use_upstream_repo: false
+    use_upstream_archive: true
     component:
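+      # Probe module used by the http_probe scrape job below to check the
+      # cluster's public HTTPS endpoints.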
+      blackbox_exporter:
+        config_file: /etc/prometheus/blackbox_exporter.yml
+        config:
+          modules:
+            http_2xx:
+              prober: http
+              timeout: 5s
+              http:
+                valid_http_versions: [HTTP/1.1, HTTP/2]
+                valid_status_codes: []  # Empty list defaults to accepting any 2xx status
+                fail_if_ssl: false
+                fail_if_not_ssl: false
       prometheus:
+        service:
+          args:
+            storage.tsdb.retention.time: {{ data_retention_time }}
         config:
           global:
             scrape_interval: 15s
@@ -31,6 +52,43 @@ prometheus:
                   instance: mon.__CLUSTER__
                   cluster: __CLUSTER__
 
+            - job_name: http_probe
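+              # Fetch each public endpoint through the blackbox exporter using
+              # the http_2xx module defined above.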
+              metrics_path: /probe
+              params:
+                module: [http_2xx]
+              static_configs:
+                - targets: ['https://__DOMAIN__']
+                  labels:
+                    instance: controller.__CLUSTER__
+                - targets: ['https://workbench.__DOMAIN__']
+                  labels:
+                    instance: workbench.__CLUSTER__
+                - targets: ['https://workbench2.__DOMAIN__']
+                  labels:
+                    instance: workbench2.__CLUSTER__
+                - targets: ['https://download.__DOMAIN__']
+                  labels:
+                    instance: download.__CLUSTER__
+                - targets: ['https://grafana.__DOMAIN__']
+                  labels:
+                    instance: grafana.__CLUSTER__
+                - targets: ['https://prometheus.__DOMAIN__']
+                  labels:
+                    instance: prometheus.__CLUSTER__
+                - targets: ['https://webshell.__DOMAIN__']
+                  labels:
+                    instance: webshell.__CLUSTER__
+                - targets: ['https://ws.__DOMAIN__']
+                  labels:
+                    instance: ws.__CLUSTER__
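+              # Standard blackbox exporter relabeling: pass each probed URL to
+              # the exporter as its "target" parameter, record that URL in the
+              # instance label, and point the scrape itself at the locally
+              # running exporter.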
+              relabel_configs:
+                - source_labels: [__address__]
+                  target_label: __param_target
+                - source_labels: [__param_target]
+                  target_label: instance
+                - target_label: __address__
+                  replacement: 127.0.0.1:9115          # blackbox exporter.
+
             ## Arvados unique jobs
             - job_name: arvados_ws
               bearer_token: __MANAGEMENT_TOKEN__
@@ -42,12 +100,25 @@ prometheus:
                     cluster: __CLUSTER__
             - job_name: arvados_controller
               bearer_token: __MANAGEMENT_TOKEN__
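+              # When a load balancer fronts several controllers, scrape each
+              # controller node directly over plain HTTP; otherwise scrape the
+              # public HTTPS endpoint.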
+              {%- if enable_balancer %}
+              scheme: http
+              {%- else %}
               scheme: https
+              {%- endif %}
               static_configs:
+                {%- if enable_balancer %}
+                  {%- for controller in controller_nodes %}
+                - targets: ['{{ controller }}']
+                  labels:
+                    instance: {{ controller.split('.')[0] }}.__CLUSTER__
+                    cluster: __CLUSTER__
+                  {%- endfor %}
+                {%- else %}
                 - targets: ['__DOMAIN__:443']
                   labels:
                     instance: controller.__CLUSTER__
                     cluster: __CLUSTER__
+                {%- endif %}
             - job_name: keep_web
               bearer_token: __MANAGEMENT_TOKEN__
               scheme: https
@@ -59,7 +130,7 @@ prometheus:
             - job_name: keep_balance
               bearer_token: __MANAGEMENT_TOKEN__
               static_configs:
-                - targets: ['__CONTROLLER_INT_IP__:9005']
+                - targets: ['__KEEPBALANCE_INT_IP__:9005']
                   labels:
                     instance: keep-balance.__CLUSTER__
                     cluster: __CLUSTER__
@@ -73,11 +144,12 @@ prometheus:
             - job_name: arvados_dispatch_cloud
               bearer_token: __MANAGEMENT_TOKEN__
               static_configs:
-                - targets: ['__CONTROLLER_INT_IP__:9006']
+                - targets: ['__DISPATCHER_INT_IP__:9006']
                   labels:
                     instance: arvados-dispatch-cloud.__CLUSTER__
                     cluster: __CLUSTER__
 
+            {%- if "__DATABASE_INT_IP__" != "" %}
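+            # Scraped only when a dedicated database node IP is configured.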
             # Database
             - job_name: postgresql
               static_configs:
@@ -88,16 +160,17 @@ prometheus:
                   labels:
                     instance: database.__CLUSTER__
                     cluster: __CLUSTER__
+            {%- endif %}
 
             # Nodes
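+            # Build the list of short host names to scrape from the
+            # comma-separated node list supplied by the installer.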
+            {%- set node_list = "__NODELIST__".split(',') %}
+            {%- set nodes = [] %}
+            {%- for node in node_list %}
+              {%- set _ = nodes.append(node.split('.')[0]) %}
+            {%- endfor %}
             - job_name: node
               static_configs:
-                {% for node in [
-                  'controller',
-                  'keep0',
-                  'workbench',
-                  'shell',
-                ] %}
+                {% for node in nodes %}
                 - targets: [ "{{ node }}.__DOMAIN__:9100" ]
                   labels:
                     instance: "{{ node }}.__CLUSTER__"