16379: Adds prometheus service for monitoring arvados & postgresql.
author Lucas Di Pentima <lucas.dipentima@curii.com>
Tue, 21 Feb 2023 21:21:58 +0000 (18:21 -0300)
committer Lucas Di Pentima <lucas.dipentima@curii.com>
Tue, 28 Mar 2023 17:58:21 +0000 (14:58 -0300)
Arvados-DCO-1.1-Signed-off-by: Lucas Di Pentima <lucas.dipentima@curii.com>

tools/salt-install/config_examples/multi_host/aws/pillars/arvados.sls
tools/salt-install/config_examples/multi_host/aws/pillars/letsencrypt_monitoring_configuration.sls [new file with mode: 0644]
tools/salt-install/config_examples/multi_host/aws/pillars/nginx_monitoring_configuration.sls [new file with mode: 0644]
tools/salt-install/config_examples/multi_host/aws/pillars/postgresql.sls
tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_pg_exporter.sls [new file with mode: 0644]
tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_server.sls [new file with mode: 0644]
tools/salt-install/config_examples/multi_host/aws/states/postgresql_mtail.sls [new file with mode: 0644]
tools/salt-install/installer.sh
tools/salt-install/local.params.example.multiple_hosts
tools/salt-install/provision.sh
tools/salt-install/terraform/aws/vpc/terraform.tfvars

index b33282f180a39e6af86cd50edb39d6d7a6e6058e..cacc6a0a11f045f6aea49196ce5a01cb4aef1d5f 100644 (file)
@@ -161,7 +161,7 @@ arvados:
           'http://__CONTROLLER_INT_IP__:9006': {}
       Keepbalance:
         InternalURLs:
-          'http://localhost:9005': {}
+          'http://__CONTROLLER_INT_IP__:9005': {}
       Keepproxy:
         ExternalURL: 'https://keep.__CLUSTER__.__DOMAIN__:__KEEP_EXT_SSL_PORT__'
         InternalURLs:
diff --git a/tools/salt-install/config_examples/multi_host/aws/pillars/letsencrypt_monitoring_configuration.sls b/tools/salt-install/config_examples/multi_host/aws/pillars/letsencrypt_monitoring_configuration.sls
new file mode 100644 (file)
index 0000000..91dbd84
--- /dev/null
@@ -0,0 +1,10 @@
+---
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+### LETSENCRYPT
+# Certificate request for the monitoring node.
+letsencrypt:
+  domainsets:
+    # Domainset name. provision.sh points nginx at
+    # /etc/letsencrypt/live/${R}.${CLUSTER}.${DOMAIN}/ (with R = monitoring),
+    # which presumably derives from this key -- keep the two in sync.
+    monitoring.__CLUSTER__.__DOMAIN__:
+      # Hostname in the set; same value as the nginx server_name used for
+      # the monitoring vhosts.
+      - mon.__CLUSTER__.__DOMAIN__
diff --git a/tools/salt-install/config_examples/multi_host/aws/pillars/nginx_monitoring_configuration.sls b/tools/salt-install/config_examples/multi_host/aws/pillars/nginx_monitoring_configuration.sls
new file mode 100644 (file)
index 0000000..2f15324
--- /dev/null
@@ -0,0 +1,62 @@
+---
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+{%- import_yaml "ssl_key_encrypted.sls" as ssl_key_encrypted_pillar %}
+
+### NGINX
+# Reverse-proxy configuration that publishes the service listening on
+# 127.0.0.1:9090 under the mon.<cluster>.<domain> server name.
+nginx:
+  ### SERVER
+  server:
+    config:
+      ### STREAMS
+      http:
+        # Local backend that the HTTPS vhost below proxies to.
+        upstream prometheus_upstream:
+          - server: '127.0.0.1:9090 fail_timeout=10s'
+
+  ### SITES
+  servers:
+    managed:
+      ### PROMETHEUS
+      # Plain-HTTP vhost: serves /.well-known from /var/www (presumably for
+      # certificate challenges) and redirects every other request to HTTPS.
+      prometheus:
+        enabled: true
+        overwrite: true
+        config:
+          - server:
+            - server_name: mon.__CLUSTER__.__DOMAIN__
+            - listen:
+              - 80
+            - location /.well-known:
+              - root: /var/www
+            - location /:
+              - return: '301 https://$host$request_uri'
+
+      # HTTPS vhost proxying to prometheus_upstream. The CERT_REQUIRES,
+      # CERT_PEM and CERT_KEY placeholders are filled in by provision.sh
+      # according to SSL_MODE. The ssl_password_file directive is only
+      # rendered when the ssl_key_encrypted pillar is enabled.
+      prometheus-ssl:
+        enabled: true
+        overwrite: true
+        requires:
+          __CERT_REQUIRES__
+        config:
+          - server:
+            - server_name: mon.__CLUSTER__.__DOMAIN__
+            - listen:
+              - 443 http2 ssl
+            - index: index.html index.htm
+            - location /:
+              - proxy_pass: 'http://prometheus_upstream'
+              - proxy_read_timeout: 300
+              - proxy_connect_timeout: 90
+              - proxy_redirect: 'off'
+              - proxy_set_header: X-Forwarded-Proto https
+              - proxy_set_header: 'Host $http_host'
+              - proxy_set_header: 'X-Real-IP $remote_addr'
+              - proxy_set_header: 'X-Forwarded-For $proxy_add_x_forwarded_for'
+            - ssl_certificate: __CERT_PEM__
+            - ssl_certificate_key: __CERT_KEY__
+            - include: snippets/ssl_hardening_default.conf
+            {%- if ssl_key_encrypted_pillar.ssl_key_encrypted.enabled %}
+            - ssl_password_file: {{ '/run/arvados/' | path_join(ssl_key_encrypted_pillar.ssl_key_encrypted.privkey_password_filename) }}
+            {%- endif %}
+            - access_log: /var/log/nginx/mon.__CLUSTER__.__DOMAIN__.access.log combined
+            - error_log: /var/log/nginx/mon.__CLUSTER__.__DOMAIN__.error.log
index d6320da24651612e760178fa598bdd0fb6353b83..2eed52a1db93b80fad73051d00d22195d37d8dcb 100644 (file)
@@ -5,6 +5,8 @@
 
 ### POSTGRESQL
 postgres:
+  pkgs_extra:
+    - postgresql-contrib
   use_upstream_repo: true
   version: '12'
   postgresconf: |-
@@ -20,6 +22,8 @@ postgres:
     __CLUSTER___arvados:
       ensure: present
       password: "__DATABASE_PASSWORD__"
+    prometheus:
+      ensure: present
 
   # tablespaces:
   #   arvados_tablespace:
diff --git a/tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_pg_exporter.sls b/tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_pg_exporter.sls
new file mode 100644 (file)
index 0000000..73f706d
--- /dev/null
@@ -0,0 +1,9 @@
+---
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+### PROMETHEUS
+# Selects only the postgres_exporter component; provision.sh adds this
+# pillar for the "database" role.
+prometheus:
+  wanted:
+    component:
+      - postgres_exporter
diff --git a/tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_server.sls b/tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_server.sls
new file mode 100644 (file)
index 0000000..dd5594d
--- /dev/null
@@ -0,0 +1,80 @@
+---
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0
+
+### PROMETHEUS
+# Pillar for the monitoring node: selects the prometheus, alertmanager and
+# blackbox_exporter components and defines the Prometheus scrape jobs.
+# The double-underscore placeholders are presumably substituted at
+# provision time -- confirm against provision.sh.
+prometheus:
+  wanted:
+    component:
+      - prometheus
+      - alertmanager
+      - blackbox_exporter
+  pkg:
+    use_upstream_repo: true
+    use_upstream_archive: true
+
+    component:
+      prometheus:
+        config:
+          global:
+            scrape_interval: 15s
+            evaluation_interval: 15s
+          rule_files:
+            - rules.yml
+
+          scrape_configs:
+            # Prometheus scraping its own metrics endpoint.
+            - job_name: prometheus
+              # metrics_path defaults to /metrics
+              # scheme defaults to http.
+              static_configs:
+                - targets: ['localhost:9090']
+                  labels:
+                    instance: mon.__CLUSTER__
+                    cluster: __CLUSTER__
+
+            ## Arvados unique jobs
+            # Each Arvados job sends the management token as a bearer token.
+            - job_name: keep_web
+              bearer_token: __MANAGEMENT_TOKEN__
+              scheme: https
+              static_configs:
+                - targets: ['keep.__CLUSTER__.__DOMAIN__:443']
+                  labels:
+                    instance: keep-web.__CLUSTER__
+                    cluster: __CLUSTER__
+            - job_name: keep_balance
+              bearer_token: __MANAGEMENT_TOKEN__
+              static_configs:
+                - targets: ['__CONTROLLER_INT_IP__:9005']
+                  labels:
+                    instance: keep-balance.__CLUSTER__
+                    cluster: __CLUSTER__
+            - job_name: keepstore
+              bearer_token: __MANAGEMENT_TOKEN__
+              static_configs:
+                - targets: ['__KEEPSTORE0_INT_IP__:25107']
+                  labels:
+                    instance: keep0.__CLUSTER__
+                    cluster: __CLUSTER__
+                - targets: ['__KEEPSTORE1_INT_IP__:25107']
+                  labels:
+                    instance: keep1.__CLUSTER__
+                    cluster: __CLUSTER__
+            - job_name: arvados_dispatch_cloud
+              bearer_token: __MANAGEMENT_TOKEN__
+              static_configs:
+                - targets: ['__CONTROLLER_INT_IP__:9006']
+                  labels:
+                    instance: arvados-dispatch-cloud.__CLUSTER__
+                    cluster: __CLUSTER__
+
+            # Database
+            - job_name: postgresql
+              static_configs:
+                # Ports 9187 and 3903 are assumed to be postgres_exporter and
+                # mtail respectively (their usual defaults) -- TODO confirm.
+                # NOTE(review): both targets receive identical job/instance/
+                # cluster labels, so any metric name they share (e.g. `up`)
+                # cannot be told apart; consider distinct instance labels.
+                - targets:
+                    - '__DATABASE_INT_IP__:9187'
+                    - '__DATABASE_INT_IP__:3903'
+                  labels:
+                    instance: database.__CLUSTER__
+                    cluster: __CLUSTER__
diff --git a/tools/salt-install/config_examples/multi_host/aws/states/postgresql_mtail.sls b/tools/salt-install/config_examples/multi_host/aws/states/postgresql_mtail.sls
new file mode 100644 (file)
index 0000000..6af01bb
--- /dev/null
@@ -0,0 +1,78 @@
+# Copyright (C) The Arvados Authors. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+### PACKAGES
+# Installs the mtail package; the mtail file and service states in this
+# file require this state.
+monitoring_required_pkgs:
+  pkg.installed:
+    - name: mtail
+
+### FILES
+# Environment file for the PostgreSQL exporter. The data source connects as
+# the "prometheus" user to the "postgres" database via /run/postgresql and
+# carries no password.
+# NOTE(review): the required pkg state ID is not defined in this file; it
+# presumably comes from the prometheus formula -- confirm.
+prometheus_pg_exporter_etc_default:
+  file.managed:
+    - name: /etc/default/prometheus-postgres-exporter
+    - contents: |
+        ### This file managed by Salt, do not edit by hand!!
+        #
+        # For details, check /usr/share/doc/prometheus-postgres-exporter/README.Debian
+        DATA_SOURCE_NAME='user=prometheus host=/run/postgresql dbname=postgres'
+    - require:
+      - pkg: prometheus-package-install-postgres_exporter-installed
+
+# mtail program for PostgreSQL logs: matches "LOG: duration: <n> ms" lines
+# and sets the postgresql_statement_duration_seconds gauge, keyed by the
+# statement text, to the logged duration divided by 1000 (ms -> seconds).
+# NOTE(review): the pattern expects lines that start with a
+# "YYYY-MM-DD HH:MM:SS TZ " timestamp; confirm that PostgreSQL's
+# log_line_prefix produces that format.
+mtail_postgresql_conf:
+  file.managed:
+    - name: /etc/mtail/postgresql.mtail
+    - contents: |
+        ########################################################################
+        # File managed by Salt.
+        # Your changes will be overwritten.
+        ########################################################################
+
+        # Parser for postgresql's log statement duration
+
+        gauge postgresql_statement_duration_seconds by statement
+
+        /^/ +
+        /(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} (\w+)) / + # 2019-01-16 16:53:45 GMT
+        /LOG: +duration: / +
+        /(?P<duration>[0-9\.]+) ms/ + # 153.967 ms
+        /(.*?): (?P<statement>.+)/ + # statement: SELECT COUNT(*) FROM (SELECT rolname FROM pg_roles WHERE rolname='arvados') count
+        /$/ {
+          strptime($timestamp, "2006-01-02 15:04:05 MST") # for tests
+
+          postgresql_statement_duration_seconds[$statement] = $duration / 1000
+        }
+    - require:
+      - pkg: monitoring_required_pkgs
+
+# Defaults file for mtail: sets ENABLED=true and points LOGS at the
+# PostgreSQL log files under /var/log/postgresql.
+mtail_etc_default:
+  file.managed:
+    - name: /etc/default/mtail
+    - contents: |
+        ### This file managed by Salt, do not edit by hand!!
+        #
+        ENABLED=true
+        # List of files to monitor (mandatory).
+        LOGS=/var/log/postgresql/postgresql*log
+    - require:
+      - pkg: monitoring_required_pkgs
+
+### SERVICES
+# Keeps the PostgreSQL exporter service running and enabled. The watch
+# requisite ties the service to its /etc/default file, so a change to that
+# file triggers the service state's watch behaviour (a restart).
+prometheus_pg_exporter_service:
+  service.running:
+    - name: prometheus-postgres-exporter
+    - enable: true
+    - require:
+      - pkg: prometheus-package-install-postgres_exporter-installed
+    - watch:
+      - file: /etc/default/prometheus-postgres-exporter
+
+# Keeps the mtail service running and enabled. It watches both managed
+# files (the mtail program and the defaults file), so a change to either
+# triggers the service state's watch behaviour (a restart).
+mtail_service:
+  service.running:
+    - name: mtail
+    - enable: true
+    - require:
+      - pkg: monitoring_required_pkgs
+    - watch:
+      - file: /etc/mtail/postgresql.mtail
+      - file: /etc/default/mtail
index 0f1d16ddee94375a97035adddd19c8137ef4fd25..000ed32929d892dd7515cd0beb70a302fcde0ba2 100755 (executable)
@@ -301,7 +301,7 @@ case "$subcmd" in
        else
            # Just deploy the node that was supplied on the command line.
            sync $NODE $BRANCH
-           deploynode $NODE ""
+           deploynode $NODE "${NODES[$NODE]}"
        fi
 
        set +x
index 01a321c4a0ff701212567f544082f703488bc3d5..430f6da4570fc229c35fa9d3aa6f31b9c83d920e 100644 (file)
@@ -150,3 +150,4 @@ RELEASE="production"
 # DOCKER_TAG="v2.4.2"
 # LOCALE_TAG="v0.3.4"
 # LETSENCRYPT_TAG="v2.1.0"
+# PROMETHEUS_TAG="v5.6.5"
\ No newline at end of file
index 05a41ded6096646ef32a75dbfbfcab3c1e00cf04..b84367cf977b84b76cc3164b649aebd8c2b0a2a5 100755 (executable)
@@ -32,6 +32,7 @@ usage() {
   echo >&2 "                                                keepbalance"
   echo >&2 "                                                keepstore"
   echo >&2 "                                                keepweb"
+  echo >&2 "                                                monitoring"
   echo >&2 "                                                shell"
   echo >&2 "                                                webshell"
   echo >&2 "                                                websocket"
@@ -108,7 +109,7 @@ arguments() {
         for i in ${2//,/ }
           do
             # Verify the role exists
-            if [[ ! "database,api,controller,keepstore,websocket,keepweb,workbench2,webshell,keepbalance,keepproxy,shell,workbench,dispatcher" == *"$i"* ]]; then
+            if [[ ! "database,api,controller,keepstore,websocket,keepweb,workbench2,webshell,keepbalance,keepproxy,shell,workbench,dispatcher,monitoring" == *"$i"* ]]; then
               echo "The role '${i}' is not a valid role"
               usage
               exit 1
@@ -220,6 +221,7 @@ DOCKER_TAG="v2.4.2"
 LOCALE_TAG="v0.3.4"
 LETSENCRYPT_TAG="v2.1.0"
 LOGROTATE_TAG="v0.14.0"
+PROMETHEUS_TAG="v5.6.5"
 
 # Salt's dir
 DUMP_SALT_CONFIG_DIR=""
@@ -358,6 +360,11 @@ test -d postgres && ( cd postgres && git fetch ) \
   || git clone --quiet ${POSTGRES_URL} ${F_DIR}/postgres
 ( cd postgres && git checkout --quiet tags/"${POSTGRES_TAG}" )
 
+echo "...prometheus"
+# Fetch into the existing "prometheus" checkout when there is one; otherwise
+# (or if the fetch fails) clone the formula into ${F_DIR}/prometheus. Then
+# pin the working tree to PROMETHEUS_TAG.
+# NOTE(review): the relative "prometheus" path assumes the current directory
+# is ${F_DIR} -- confirm against the surrounding script.
+test -d prometheus && ( cd prometheus && git fetch ) \
+  || git clone --quiet https://github.com/saltstack-formulas/prometheus-formula.git ${F_DIR}/prometheus
+( cd prometheus && git checkout --quiet tags/"${PROMETHEUS_TAG}" )
+
 echo "...letsencrypt"
 test -d letsencrypt && ( cd letsencrypt && git fetch ) \
   || git clone --quiet https://github.com/saltstack-formulas/letsencrypt-formula.git ${F_DIR}/letsencrypt
@@ -685,9 +692,49 @@ else
     case "${R}" in
       "database")
         # States
-        echo "    - postgres" >> ${S_DIR}/top.sls
+        grep -q "\- postgres$" ${S_DIR}/top.sls || echo "    - postgres" >> ${S_DIR}/top.sls
+        grep -q "prometheus" ${S_DIR}/top.sls || echo "    - prometheus" >> ${S_DIR}/top.sls
+        grep -q "extra.postgresql_mtail" ${S_DIR}/top.sls || echo "    - extra.postgresql_mtail" >> ${S_DIR}/top.sls
         # Pillars
-        echo '    - postgresql' >> ${P_DIR}/top.sls
+        grep -q "postgresql" ${P_DIR}/top.sls || echo "    - postgresql" >> ${P_DIR}/top.sls
+        grep -q "prometheus_pg_exporter" ${P_DIR}/top.sls || echo "    - prometheus_pg_exporter" >> ${P_DIR}/top.sls
+      ;;
+      "monitoring")
+        ### States ###
+        grep -q "nginx" ${S_DIR}/top.sls || echo "    - nginx" >> ${S_DIR}/top.sls
+        grep -q "prometheus" ${S_DIR}/top.sls || echo "    - prometheus" >> ${S_DIR}/top.sls
+        if [ "${SSL_MODE}" = "lets-encrypt" ]; then
+          grep -q "letsencrypt"     ${S_DIR}/top.sls || echo "    - letsencrypt" >> ${S_DIR}/top.sls
+          if [ "x${USE_LETSENCRYPT_ROUTE53}" = "xyes" ]; then
+            grep -q "aws_credentials" ${S_DIR}/top.sls || echo "    - aws_credentials" >> ${S_DIR}/top.sls
+          fi
+        elif [ "${SSL_MODE}" = "bring-your-own" ]; then
+          copy_custom_cert ${CUSTOM_CERTS_DIR} ${R}
+          if [ "${SSL_KEY_ENCRYPTED}" = "yes" ]; then
+            grep -q "ssl_key_encrypted" ${S_DIR}/top.sls || echo "    - extra.ssl_key_encrypted" >> ${S_DIR}/top.sls
+          fi
+        fi
+        ### Pillars ###
+        grep -q "nginx_${R}_configuration" ${P_DIR}/top.sls || echo "    - nginx_${R}_configuration" >> ${P_DIR}/top.sls
+        grep -q "prometheus_server" ${P_DIR}/top.sls || echo "    - prometheus_server" >> ${P_DIR}/top.sls
+        if [ "${SSL_MODE}" = "lets-encrypt" ]; then
+          grep -q "letsencrypt"     ${P_DIR}/top.sls || echo "    - letsencrypt" >> ${P_DIR}/top.sls
+          grep -q "letsencrypt_${R}_configuration" ${P_DIR}/top.sls || echo "    - letsencrypt_${R}_configuration" >> ${P_DIR}/top.sls
+          if [ "${USE_LETSENCRYPT_ROUTE53}" = "yes" ]; then
+            grep -q "aws_credentials" ${P_DIR}/top.sls || echo "    - aws_credentials" >> ${P_DIR}/top.sls
+          fi
+          sed -i "s/__CERT_REQUIRES__/cmd: create-initial-cert-${R}.${CLUSTER}.${DOMAIN}*/g;
+                  s#__CERT_PEM__#/etc/letsencrypt/live/${R}.${CLUSTER}.${DOMAIN}/fullchain.pem#g;
+                  s#__CERT_KEY__#/etc/letsencrypt/live/${R}.${CLUSTER}.${DOMAIN}/privkey.pem#g" \
+          ${P_DIR}/nginx_${R}_configuration.sls
+        elif [ "${SSL_MODE}" = "bring-your-own" ]; then
+          grep -q "ssl_key_encrypted" ${P_DIR}/top.sls || echo "    - ssl_key_encrypted" >> ${P_DIR}/top.sls
+          sed -i "s/__CERT_REQUIRES__/file: extra_custom_certs_file_copy_arvados-${R}.pem/g;
+                  s#__CERT_PEM__#/etc/nginx/ssl/arvados-${R}.pem#g;
+                  s#__CERT_KEY__#/etc/nginx/ssl/arvados-${R}.key#g" \
+            ${P_DIR}/nginx_${R}_configuration.sls
+          grep -q ${R} ${P_DIR}/extra_custom_certs.sls || echo "  - ${R}" >> ${P_DIR}/extra_custom_certs.sls
+        fi
       ;;
       "api")
         # States
index cac62ed6f12c56c29eb5b32c567c44d85a010431..210a2d2e752c24ac79cab1e8013a4517cc670348 100644 (file)
@@ -3,5 +3,5 @@
 # SPDX-License-Identifier: CC-BY-SA-3.0
 
 region_name = "us-east-1"
-cluster_name = "xarv1"
-domain_name = "example.com"
+cluster_name = "xarv1"
+domain_name = "example.com"