21700: Install Bundler system-wide in Rails postinst
[arvados.git] / tools / salt-install / config_examples / multi_host / aws / pillars / prometheus_server.sls
1 ---
2 # Copyright (C) The Arvados Authors. All rights reserved.
3 #
4 # SPDX-License-Identifier: AGPL-3.0
5
6 {%- set controller_nodes = "__CONTROLLER_NODES__".split(",") %} {#- Comma-separated controller hosts, split into a list. #}
7 {%- set enable_balancer = "__ENABLE_BALANCER__" | to_bool %} {#- Chooses between the two controller scrape layouts. #}
8 {%- set data_retention_time = "__PROMETHEUS_DATA_RETENTION_TIME__" %} {#- Value used for storage.tsdb.retention.time. #}
9
10 ### PROMETHEUS -- pillar data for the Prometheus server and the exporters listed below
11 prometheus:
12   wanted:
13     component:  # NOTE(review): presumably the components the formula deploys on this host -- confirm
14       - prometheus
15       - alertmanager
16       - node_exporter
17       - blackbox_exporter
18   pkg:
19     use_upstream_repo: false
20     use_upstream_archive: true  # NOTE(review): looks like upstream release archives are used instead of a package repo -- confirm
21     component:
22       blackbox_exporter:
23         config_file: /etc/prometheus/blackbox_exporter.yml  # Path of the blackbox exporter's configuration file.
24         config:
25           modules:
26             http_2xx:  # Unauthenticated HTTPS probe that expects a 200 response.
27               prober: http
28               timeout: 5s
29               http:
30                 valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]  # Compared verbatim with the response protocol, e.g. HTTP/2.0.
31                 valid_status_codes: [200]
32                 method: GET
33                 tls_config:
34                   insecure_skip_verify: true  # Avoid failures on self-signed certs
35                 fail_if_ssl: false
36                 fail_if_not_ssl: true  # A response that did not use TLS counts as a failure.
37             http_2xx_mngmt_token:  # HTTPS probe that authenticates with the management token as a bearer token.
38               prober: http
39               timeout: 5s
40               http:
41                 valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]  # Compared verbatim with the response protocol, e.g. HTTP/2.0.
42                 valid_status_codes: [200]
43                 method: GET
44                 bearer_token: "__MANAGEMENT_TOKEN__"  # Quoted so the substituted value is always read as a string.
45                 tls_config:
46                   insecure_skip_verify: true  # Avoid failures on self-signed certs
47                 fail_if_ssl: false
48                 fail_if_not_ssl: true  # A response that did not use TLS counts as a failure.
49             http_2xx_basic_auth:  # HTTPS probe that authenticates with the monitoring user's basic-auth credentials.
50               prober: http
51               timeout: 5s
52               http:
53                 valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]  # Compared verbatim with the response protocol, e.g. HTTP/2.0.
54                 valid_status_codes: [200]
55                 method: GET
56                 basic_auth:
57                   username: "__MONITORING_USERNAME__"
58                   password: "__MONITORING_PASSWORD__"
59                 tls_config:
60                   insecure_skip_verify: true  # Avoid failures on self-signed certs
61                 fail_if_ssl: false
62                 fail_if_not_ssl: true  # A response that did not use TLS counts as a failure.
63       prometheus:
64         service:
65           args:  # NOTE(review): presumably rendered as command-line flags for the server -- confirm
66             storage.tsdb.retention.time: "{{ data_retention_time }}"  # Quoted: the value is substituted as plain text.
67         config:
68           global:
69             scrape_interval: 15s
70             evaluation_interval: 15s
71           rule_files:
72             - rules.yml
73
74           scrape_configs:
75             - job_name: prometheus  # Prometheus scraping its own metrics endpoint.
76               # metrics_path defaults to /metrics
77               # scheme defaults to http.
78               static_configs:
79                 - targets: ['localhost:9090']
80                   labels:
81                     instance: mon.__CLUSTER__
82                     cluster: __CLUSTER__
83
84             - job_name: http_probe  # Probes the listed HTTPS URLs with the blackbox module http_2xx.
85               metrics_path: /probe
86               params:
87                 module: [http_2xx]
88               static_configs:
89                 - targets: ['https://workbench.__DOMAIN__']
90                   labels:
91                     instance: workbench.__CLUSTER__
92                 - targets: ['https://workbench2.__DOMAIN__']
93                   labels:
94                     instance: workbench2.__CLUSTER__
95                 - targets: ['https://webshell.__DOMAIN__']
96                   labels:
97                     instance: webshell.__CLUSTER__
98               relabel_configs:
99                 - source_labels: [__address__]  # Pass the listed URL to the exporter as its "target" parameter.
100                   target_label: __param_target
101                 - source_labels: [__param_target]
102                   target_label: instance  # NOTE(review): seems to replace the static instance labels above -- confirm
103                 - target_label: __address__
104                   replacement: 127.0.0.1:9115          # Scrape the local blackbox exporter, which performs the probe.
105
106             - job_name: http_probe_mngmt_token  # Probes health-check URLs with the blackbox module http_2xx_mngmt_token.
107               metrics_path: /probe
108               params:
109                 module: [http_2xx_mngmt_token]
110               static_configs:
111                 - targets: ['https://__DOMAIN__/_health/ping']
112                   labels:
113                     instance: controller.__CLUSTER__
114                 - targets: ['https://download.__DOMAIN__/_health/ping']
115                   labels:
116                     instance: download.__CLUSTER__
117                 - targets: ['https://ws.__DOMAIN__/_health/ping']
118                   labels:
119                     instance: ws.__CLUSTER__
120               relabel_configs:
121                 - source_labels: [__address__]  # Pass the listed URL to the exporter as its "target" parameter.
122                   target_label: __param_target
123                 - source_labels: [__param_target]
124                   target_label: instance  # NOTE(review): seems to replace the static instance labels above -- confirm
125                 - target_label: __address__
126                   replacement: 127.0.0.1:9115          # Scrape the local blackbox exporter, which performs the probe.
127
128             - job_name: http_probe_basic_auth  # Probes the listed HTTPS URLs with the blackbox module http_2xx_basic_auth.
129               metrics_path: /probe
130               params:
131                 module: [http_2xx_basic_auth]
132               static_configs:
133                 - targets: ['https://grafana.__DOMAIN__']
134                   labels:
135                     instance: grafana.__CLUSTER__
136                 - targets: ['https://prometheus.__DOMAIN__']
137                   labels:
138                     instance: prometheus.__CLUSTER__
139               relabel_configs:
140                 - source_labels: [__address__]  # Pass the listed URL to the exporter as its "target" parameter.
141                   target_label: __param_target
142                 - source_labels: [__param_target]
143                   target_label: instance  # NOTE(review): seems to replace the static instance labels above -- confirm
144                 - target_label: __address__
145                   replacement: 127.0.0.1:9115          # Scrape the local blackbox exporter, which performs the probe.
146
147             ## Arvados-specific jobs: service metrics scraped with the management token
148             - job_name: arvados_ws
149               bearer_token: "__MANAGEMENT_TOKEN__"  # Quoted so the substituted value is always read as a string.
150               scheme: https
151               static_configs:
152                 - targets: ['ws.__DOMAIN__:443']
153                   labels:
154                     instance: ws.__CLUSTER__
155                     cluster: __CLUSTER__
156             - job_name: arvados_controller
157               bearer_token: "__MANAGEMENT_TOKEN__"  # Quoted so the substituted value is always read as a string.
158               {%- if enable_balancer %}
159               scheme: http  # Balancer enabled: every controller node is scraped directly over plain HTTP.
160               {%- else %}
161               scheme: https  # No balancer: the single public HTTPS endpoint is scraped.
162               {%- endif %}
163               static_configs:
164                 {%- if enable_balancer %}
165                   {%- for controller in controller_nodes %}
166                 - targets: ['{{ controller }}']
167                   labels:
168                     instance: "{{ controller.split('.')[0] }}.__CLUSTER__"
169                     cluster: __CLUSTER__
170                   {%- endfor %}
171                 {%- else %}
172                 - targets: ['__DOMAIN__:443']
173                   labels:
174                     instance: controller.__CLUSTER__
175                     cluster: __CLUSTER__
176                 {%- endif %}
177             - job_name: keep_web
178               bearer_token: "__MANAGEMENT_TOKEN__"  # Quoted so the substituted value is always read as a string.
179               scheme: https
180               static_configs:
181                 - targets: ['keep.__DOMAIN__:443']
182                   labels:
183                     instance: keep-web.__CLUSTER__
184                     cluster: __CLUSTER__
185             - job_name: keep_balance  # No scheme is set, so the default scheme applies.
186               bearer_token: "__MANAGEMENT_TOKEN__"  # Quoted so the substituted value is always read as a string.
187               static_configs:
188                 - targets: ['__KEEPBALANCE_INT_IP__:9005']
189                   labels:
190                     instance: keep-balance.__CLUSTER__
191                     cluster: __CLUSTER__
192             - job_name: keepstore  # No scheme is set, so the default scheme applies.
193               bearer_token: "__MANAGEMENT_TOKEN__"  # Quoted so the substituted value is always read as a string.
194               static_configs:
195                 - targets: ['__KEEPSTORE0_INT_IP__:25107']
196                   labels:
197                     instance: keep0.__CLUSTER__
198                     cluster: __CLUSTER__
199             - job_name: arvados_dispatch_cloud  # No scheme is set, so the default scheme applies.
200               bearer_token: "__MANAGEMENT_TOKEN__"  # Quoted so the substituted value is always read as a string.
201               static_configs:
202                 - targets: ['__DISPATCHER_INT_IP__:9006']
203                   labels:
204                     instance: arvados-dispatch-cloud.__CLUSTER__
205                     cluster: __CLUSTER__
206
207             {%- if "__DATABASE_INT_IP__" != "" %}
208             # Database: this job is emitted only when a database address was substituted in
209             - job_name: postgresql
210               static_configs:
211                 - targets:
212                     # Two endpoints on the database host; both receive the labels below.
213                     - '__DATABASE_INT_IP__:9187'
214                     - '__DATABASE_INT_IP__:3903'
215                   labels:
216                     instance: database.__CLUSTER__
217                     cluster: __CLUSTER__
218             {%- endif %}
219
220             # Nodes: one target (port 9100) per short host name from the node list; empty entries are skipped
221             {%- set node_list = "__NODELIST__".split(',') %}
222             {%- set nodes = [] %}
223             {%- for node in node_list if node %}
224               {%- set _ = nodes.append(node.split('.')[0]) %}
225             {%- endfor %}
226             - job_name: node
227               static_configs:
228                 {%- for node in nodes %}
229                 - targets: [ "{{ node }}.__DOMAIN__:9100" ]
230                   labels:
231                     instance: "{{ node }}.__CLUSTER__"
232                     cluster: __CLUSTER__
233                 {%- endfor %}