X-Git-Url: https://git.arvados.org/arvados.git/blobdiff_plain/6dcb171901a8da9604c6d3df5b6e8d647b67d690..85d79eb901f52a2fb10fae4bc64b8f9a18dff781:/tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_server.sls

diff --git a/tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_server.sls b/tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_server.sls
index bbf997b7be..143097b4a4 100644
--- a/tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_server.sls
+++ b/tools/salt-install/config_examples/multi_host/aws/pillars/prometheus_server.sls
@@ -3,6 +3,10 @@
 #
 # SPDX-License-Identifier: AGPL-3.0
 
+{%- set controller_nodes = "__CONTROLLER_NODES__".split(',') %}
+{%- set enable_balancer = ("__ENABLE_BALANCER__"|to_bool) %}
+{%- set data_retention_time = "__PROMETHEUS_DATA_RETENTION_TIME__" %}
+
 ### PROMETHEUS
 prometheus:
   wanted:
@@ -10,10 +14,27 @@ prometheus:
       - prometheus
       - alertmanager
       - node_exporter
+      - blackbox_exporter
   pkg:
-    use_upstream_repo: true
+    use_upstream_repo: false
+    use_upstream_archive: true
   component:
+    blackbox_exporter:
+      config_file: /etc/prometheus/blackbox_exporter.yml
+      config:
+        modules:
+          http_2xx:
+            prober: http
+            timeout: 5s
+            http:
+              valid_http_versions: [HTTP/1.1, HTTP/2]
+              valid_status_codes: []  # Default is [200]
+              fail_if_ssl: false
+              fail_if_not_ssl: false
     prometheus:
+      service:
+        args:
+          storage.tsdb.retention.time: {{ data_retention_time }}
       config:
         global:
           scrape_interval: 15s
@@ -31,6 +52,43 @@ prometheus:
                   instance: mon.__CLUSTER__
                   cluster: __CLUSTER__
 
+          - job_name: http_probe
+            metrics_path: /probe
+            params:
+              module: [http_2xx]
+            static_configs:
+              - targets: ['https://__DOMAIN__']
+                labels:
+                  instance: controller.__CLUSTER__
+              - targets: ['https://workbench.__DOMAIN__']
+                labels:
+                  instance: workbench.__CLUSTER__
+              - targets: ['https://workbench2.__DOMAIN__']
+                labels:
+                  instance: workbench2.__CLUSTER__
+              - targets: ['https://download.__DOMAIN__']
+                labels:
+                  instance: download.__CLUSTER__
+              - targets: ['https://grafana.__DOMAIN__']
+                labels:
+                  instance: grafana.__CLUSTER__
+              - targets: ['https://prometheus.__DOMAIN__']
+                labels:
+                  instance: prometheus.__CLUSTER__
+              - targets: ['https://webshell.__DOMAIN__']
+                labels:
+                  instance: webshell.__CLUSTER__
+              - targets: ['https://ws.__DOMAIN__']
+                labels:
+                  instance: ws.__CLUSTER__
+            relabel_configs:
+              - source_labels: [__address__]
+                target_label: __param_target
+              - source_labels: [__param_target]
+                target_label: instance
+              - target_label: __address__
+                replacement: 127.0.0.1:9115  # blackbox exporter.
+
           ## Arvados unique jobs
           - job_name: arvados_ws
             bearer_token: __MANAGEMENT_TOKEN__
@@ -42,12 +100,25 @@ prometheus:
                   cluster: __CLUSTER__
           - job_name: arvados_controller
             bearer_token: __MANAGEMENT_TOKEN__
+            {%- if enable_balancer %}
+            scheme: http
+            {%- else %}
             scheme: https
+            {%- endif %}
             static_configs:
+              {%- if enable_balancer %}
+              {%- for controller in controller_nodes %}
+              - targets: ['{{ controller }}']
+                labels:
+                  instance: {{ controller.split('.')[0] }}.__CLUSTER__
+                  cluster: __CLUSTER__
+              {%- endfor %}
+              {%- else %}
               - targets: ['__DOMAIN__:443']
                 labels:
                   instance: controller.__CLUSTER__
                   cluster: __CLUSTER__
+              {%- endif %}
           - job_name: keep_web
             bearer_token: __MANAGEMENT_TOKEN__
             scheme: https
@@ -59,7 +130,7 @@ prometheus:
           - job_name: keep_balance
             bearer_token: __MANAGEMENT_TOKEN__
             static_configs:
-              - targets: ['__CONTROLLER_INT_IP__:9005']
+              - targets: ['__KEEPBALANCE_INT_IP__:9005']
                 labels:
                   instance: keep-balance.__CLUSTER__
                   cluster: __CLUSTER__
@@ -73,11 +144,12 @@ prometheus:
           - job_name: arvados_dispatch_cloud
             bearer_token: __MANAGEMENT_TOKEN__
             static_configs:
-              - targets: ['__CONTROLLER_INT_IP__:9006']
+              - targets: ['__DISPATCHER_INT_IP__:9006']
                 labels:
                   instance: arvados-dispatch-cloud.__CLUSTER__
                   cluster: __CLUSTER__
 
+          {%- if "__DATABASE_INT_IP__" != "" %}
           # Database
           - job_name: postgresql
             static_configs:
@@ -88,16 +160,17 @@ prometheus:
                 labels:
                   instance: database.__CLUSTER__
                   cluster: __CLUSTER__
+          {%- endif %}
 
           # Nodes
+          {%- set node_list = "__NODELIST__".split(',') %}
+          {%- set nodes = [] %}
+          {%- for node in node_list %}
+            {%- set _ = nodes.append(node.split('.')[0]) %}
+          {%- endfor %}
           - job_name: node
             static_configs:
-            {% for node in [
-              'controller',
-              'keep0',
-              'workbench',
-              'shell',
-            ] %}
+            {% for node in nodes %}
               - targets: [ "{{ node }}.__DOMAIN__:9100" ]
                 labels:
                   instance: "{{ node }}.__CLUSTER__"
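
Notes on the changes above:

The new http_probe job follows the standard blackbox_exporter relabeling pattern:
each static target URL is copied into the probe's "target" query parameter, then
into the "instance" label (which overrides the per-target instance labels declared
in static_configs), and finally the scrape address itself is rewritten to the
exporter listening on 127.0.0.1:9115. As a sketch, the request Prometheus actually
makes for the first target looks like this (the rendered domain
"xxxxx.example.com" is illustrative):

    GET http://127.0.0.1:9115/probe?module=http_2xx&target=https://xxxxx.example.com

The exporter runs the http_2xx module defined in blackbox_exporter.yml against
that URL and answers with metrics such as probe_success and
probe_duration_seconds.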
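
The new service args entry passes the configured retention window through to the
prometheus daemon; the formula presumably renders each args key as a command-line
flag, so a substituted value of "15d" (an illustrative value) would arrive as:

    --storage.tsdb.retention.time=15d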
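
When __ENABLE_BALANCER__ renders truthy, the arvados_controller job stops scraping
the single balanced HTTPS endpoint and instead scrapes every backend listed in
__CONTROLLER_NODES__ over plain HTTP. A sketch of the rendered YAML, assuming the
installer substitutes __CONTROLLER_NODES__ with
"controller1.xxxxx.example.com:8000,controller2.xxxxx.example.com:8000" and
__CLUSTER__ with "xxxxx" (hostnames, port, and cluster id are illustrative):

    - job_name: arvados_controller
      bearer_token: <management token>
      scheme: http
      static_configs:
        - targets: ['controller1.xxxxx.example.com:8000']
          labels:
            instance: controller1.xxxxx
            cluster: xxxxx
        - targets: ['controller2.xxxxx.example.com:8000']
          labels:
            instance: controller2.xxxxx
            cluster: xxxxx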
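
Similarly, the node job now derives its target list from __NODELIST__ instead of a
hard-coded hostname list: the Jinja loop keeps only the first label of each FQDN.
If __NODELIST__ is substituted with
"controller.xxxxx.example.com,workbench.xxxxx.example.com,keep0.xxxxx.example.com"
(an illustrative value, with __DOMAIN__ as xxxxx.example.com), nodes becomes
['controller', 'workbench', 'keep0'] and the job renders one node_exporter target
per host:

    - job_name: node
      static_configs:
        - targets: [ "controller.xxxxx.example.com:9100" ]
          labels:
            instance: "controller.xxxxx"
        - targets: [ "workbench.xxxxx.example.com:9100" ]
          labels:
            instance: "workbench.xxxxx"
        - targets: [ "keep0.xxxxx.example.com:9100" ]
          labels:
            instance: "keep0.xxxxx"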