Skip to content

Commit 09135ee

Browse files
authored
Enable Multi Cluster alerts by default (prometheus-operator#2099)
1 parent 5ac666d commit 09135ee

8 files changed

+1480
-97
lines changed

jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ local defaults = {
1111
mixin:: {
1212
ruleLabels: {},
1313
_config: {
14+
showMultiCluster: true,
1415
cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
1516
kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
1617
kubeStateMetricsSelector: 'job="kube-state-metrics"',

jsonnet/kube-prometheus/components/mixin/alerts/general.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
summary: 'One or more targets are unreachable.',
1111
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.',
1212
},
13-
expr: '100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10',
13+
expr: '100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10',
1414
'for': '10m',
1515
labels: {
1616
severity: 'warning',

jsonnet/kube-prometheus/components/prometheus-operator.libsonnet

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ local defaults = {
3838
prometheus: defaults.name,
3939
},
4040
_config: {
41+
groupLabels: 'cluster,controller,namespace',
4142
prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + defaults.namespace + '"',
4243
runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/%s',
4344
},

manifests/grafana-dashboardDefinitions.yaml

Lines changed: 1454 additions & 79 deletions
Large diffs are not rendered by default.

manifests/grafana-deployment.yaml

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,9 @@ spec:
8383
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster
8484
name: grafana-dashboard-k8s-resources-cluster
8585
readOnly: false
86+
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-multicluster
87+
name: grafana-dashboard-k8s-resources-multicluster
88+
readOnly: false
8689
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-namespace
8790
name: grafana-dashboard-k8s-resources-namespace
8891
readOnly: false
@@ -180,6 +183,9 @@ spec:
180183
- configMap:
181184
name: grafana-dashboard-k8s-resources-cluster
182185
name: grafana-dashboard-k8s-resources-cluster
186+
- configMap:
187+
name: grafana-dashboard-k8s-resources-multicluster
188+
name: grafana-dashboard-k8s-resources-multicluster
183189
- configMap:
184190
name: grafana-dashboard-k8s-resources-namespace
185191
name: grafana-dashboard-k8s-resources-namespace

manifests/kubePrometheus-prometheusRule.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ spec:
1818
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
1919
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
2020
summary: One or more targets are unreachable.
21-
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
21+
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10
2222
for: 10m
2323
labels:
2424
severity: warning

manifests/kubernetesControlPlane-prometheusRule.yaml

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -247,50 +247,50 @@ spec:
247247
rules:
248248
- alert: KubeCPUOvercommit
249249
annotations:
250-
description: Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
250+
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
251251
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
252252
summary: Cluster has overcommitted CPU resource requests.
253253
expr: |
254-
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})) > 0
254+
sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
255255
and
256-
(sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})) > 0
256+
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
257257
for: 10m
258258
labels:
259259
severity: warning
260260
- alert: KubeMemoryOvercommit
261261
annotations:
262-
description: Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.
262+
description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.
263263
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
264264
summary: Cluster has overcommitted memory resource requests.
265265
expr: |
266-
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})) > 0
266+
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
267267
and
268-
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})) > 0
268+
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
269269
for: 10m
270270
labels:
271271
severity: warning
272272
- alert: KubeCPUQuotaOvercommit
273273
annotations:
274-
description: Cluster has overcommitted CPU resource requests for Namespaces.
274+
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces.
275275
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
276276
summary: Cluster has overcommitted CPU resource requests.
277277
expr: |
278-
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"}))
278+
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
279279
/
280-
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})
280+
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
281281
> 1.5
282282
for: 5m
283283
labels:
284284
severity: warning
285285
- alert: KubeMemoryQuotaOvercommit
286286
annotations:
287-
description: Cluster has overcommitted memory resource requests for Namespaces.
287+
description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Namespaces.
288288
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
289289
summary: Cluster has overcommitted memory resource requests.
290290
expr: |
291-
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"}))
291+
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
292292
/
293-
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})
293+
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
294294
> 1.5
295295
for: 5m
296296
labels:

manifests/prometheusOperator-prometheusRule.yaml

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ spec:
2020
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors
2121
summary: Errors while performing list operations in controller.
2222
expr: |
23-
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
23+
(sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
2424
for: 15m
2525
labels:
2626
severity: warning
@@ -30,7 +30,7 @@ spec:
3030
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
3131
summary: Errors while performing watch operations in controller.
3232
expr: |
33-
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.4
33+
(sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.4
3434
for: 15m
3535
labels:
3636
severity: warning
@@ -50,7 +50,7 @@ spec:
5050
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors
5151
summary: Errors while reconciling controller.
5252
expr: |
53-
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
53+
(sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
5454
for: 10m
5555
labels:
5656
severity: warning
@@ -70,7 +70,7 @@ spec:
7070
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
7171
summary: Prometheus operator not ready
7272
expr: |
73-
min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
73+
min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
7474
for: 5m
7575
labels:
7676
severity: warning

0 commit comments

Comments
 (0)