@@ -30,11 +30,11 @@ spec:
30
30
runbook_url : https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
31
31
summary : Pod has been in a non-ready state for more than 15 minutes.
32
32
expr : |
33
- sum by (namespace, pod) (
34
- max by(namespace, pod) (
33
+ sum by (namespace, pod, cluster) (
34
+ max by(namespace, pod, cluster) (
35
35
kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
36
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
37
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
36
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
37
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
38
38
)
39
39
) > 0
40
40
for : 15m
@@ -174,7 +174,7 @@ spec:
174
174
runbook_url : https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
175
175
summary : Pod container waiting longer than 1 hour
176
176
expr : |
177
- sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
177
+ sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
178
178
for : 1h
179
179
labels :
180
180
severity : warning
@@ -209,7 +209,7 @@ spec:
209
209
runbook_url : https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
210
210
summary : Job did not complete in time
211
211
expr : |
212
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics"}
212
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"}
213
213
and
214
214
kube_job_status_active{job="kube-state-metrics"} > 0) > 43200
215
215
labels :
@@ -488,7 +488,7 @@ spec:
488
488
runbook_url : https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
489
489
summary : Different semantic versions of Kubernetes components running.
490
490
expr : |
491
- count( count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
491
+ count by (cluster) ( count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
492
492
for : 15m
493
493
labels :
494
494
severity : warning
@@ -594,7 +594,7 @@ spec:
594
594
runbook_url : https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
595
595
summary : Kubernetes aggregated API has reported errors.
596
596
expr : |
597
- sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
597
+ sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
598
598
labels :
599
599
severity : warning
600
600
- alert : KubeAggregatedAPIDown
@@ -604,7 +604,7 @@ spec:
604
604
runbook_url : https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
605
605
summary : Kubernetes aggregated API is down.
606
606
expr : |
607
- (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
607
+ (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
608
608
for : 5m
609
609
labels :
610
610
severity : warning
0 commit comments