Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
alert: etcdMembersDown
expr: max
by(job) (sum by(job) (up{job=~".*etcd.*"} == bool 0) or count by(job, endpoint)
(sum by(job, endpoint, To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m]))
> 0.01)) > 0
for: 3m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
}}).'
|
ok
|
|
4.498s ago
|
976.5us |
alert: etcdInsufficientMembers
expr: sum
by(job) (up{job=~".*etcd.*"} == bool 1) < ((count by(job) (up{job=~".*etcd.*"})
+ 1) / 2)
for: 3m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
}}).'
|
ok
|
|
4.497s ago
|
150.2us |
alert: etcdNoLeader
expr: etcd_server_has_leader{job=~".*etcd.*"}
== 0
for: 1m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
has no leader.'
|
ok
|
|
4.497s ago
|
74.99us |
alert: etcdHighNumberOfLeaderChanges
expr: increase((max
by(job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0 *
absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
>= 3
for: 5m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
within the last 15 minutes. Frequent elections may be a sign of insufficient resources,
high network latency, or disruptions by other components and should be investigated.'
|
ok
|
|
4.497s ago
|
216.8us |
alert: etcdHighNumberOfFailedGRPCRequests
expr: 100
* sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m]))
/ sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))
> 1
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
ok
|
|
4.497s ago
|
155.4us |
alert: etcdHighNumberOfFailedGRPCRequests
expr: 100
* sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m]))
/ sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))
> 5
for: 5m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
ok
|
|
4.497s ago
|
177.9us |
alert: etcdGRPCRequestsSlow
expr: histogram_quantile(0.99,
sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job=~".*etcd.*"}[5m])))
> 0.15
for: 10m
labels:
severity: critical
annotations:
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
ok
|
|
4.497s ago
|
111.4us |
alert: etcdMemberCommunicationSlow
expr: histogram_quantile(0.99,
rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": member communication with {{
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
ok
|
|
4.497s ago
|
78.05us |
alert: etcdHighNumberOfFailedProposals
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m])
> 5
for: 15m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
ok
|
|
4.497s ago
|
103.8us |
alert: etcdHighFsyncDurations
expr: histogram_quantile(0.99,
rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
ok
|
|
4.497s ago
|
94.53us |
alert: etcdHighCommitDurations
expr: histogram_quantile(0.99,
rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
annotations:
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
ok
|
|
4.497s ago
|
88.52us |
alert: etcdHighNumberOfFailedHTTPRequests
expr: sum
by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) >
0.01
for: 10m
labels:
severity: warning
annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}.'
|
ok
|
|
4.497s ago
|
181.3us |
alert: etcdHighNumberOfFailedHTTPRequests
expr: sum
by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m]))
/ sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) >
0.05
for: 10m
labels:
severity: critical
annotations:
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
{{ $labels.instance }}.'
|
ok
|
|
4.496s ago
|
134.7us |
alert: etcdHTTPRequestsSlow
expr: histogram_quantile(0.99,
rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
severity: warning
annotations:
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow.
|
ok
|
|
4.496s ago
|
48.77us |
|
23.377s ago |
2.824ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
record: namespace:container_cpu_usage_seconds_total:sum_rate
expr: sum
by(namespace) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet",metrics_path="/metrics/cadvisor"}[5m]))
|
ok
|
|
22.321s ago
|
4.845ms |
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
expr: sum
by(cluster, namespace, pod, container) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet",metrics_path="/metrics/cadvisor"}[5m]))
* on(cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod)
(1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}))
|
ok
|
|
22.317s ago
|
9.018ms |
record: node_namespace_pod_container:container_memory_working_set_bytes
expr: container_memory_working_set_bytes{image!="",job="kubelet",metrics_path="/metrics/cadvisor"}
* on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace,
pod, node) (kube_pod_info{node!=""}))
|
ok
|
|
22.308s ago
|
14.47ms |
record: node_namespace_pod_container:container_memory_rss
expr: container_memory_rss{image!="",job="kubelet",metrics_path="/metrics/cadvisor"}
* on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace,
pod, node) (kube_pod_info{node!=""}))
|
ok
|
|
22.293s ago
|
13.11ms |
record: node_namespace_pod_container:container_memory_cache
expr: container_memory_cache{image!="",job="kubelet",metrics_path="/metrics/cadvisor"}
* on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace,
pod, node) (kube_pod_info{node!=""}))
|
ok
|
|
22.28s ago
|
13.12ms |
record: node_namespace_pod_container:container_memory_swap
expr: container_memory_swap{image!="",job="kubelet",metrics_path="/metrics/cadvisor"}
* on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace,
pod, node) (kube_pod_info{node!=""}))
|
ok
|
|
22.267s ago
|
13.44ms |
record: namespace:container_memory_usage_bytes:sum
expr: sum
by(namespace) (container_memory_usage_bytes{container!="POD",image!="",job="kubelet",metrics_path="/metrics/cadvisor"})
|
ok
|
|
22.254s ago
|
2.592ms |
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum
expr: sum
by(namespace) (sum by(namespace, pod) (max by(namespace, pod, container) (kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"})
* on(namespace, pod) group_left() max by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Running"}
== 1)))
|
ok
|
|
22.251s ago
|
6.077ms |
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
expr: sum
by(namespace) (sum by(namespace, pod) (max by(namespace, pod, container) (kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"})
* on(namespace, pod) group_left() max by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Running"}
== 1)))
|
ok
|
|
22.245s ago
|
5.566ms |
record: namespace_workload_pod:kube_pod_owner:relabel
expr: max
by(cluster, namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)") * on(replicaset,
namespace) group_left(owner_name) topk by(replicaset, namespace) (1, max by(replicaset,
namespace, owner_name) (kube_replicaset_owner{job="kube-state-metrics"})),
"workload", "$1", "owner_name", "(.*)"))
labels:
workload_type: deployment
|
ok
|
|
22.24s ago
|
2.716ms |
record: namespace_workload_pod:kube_pod_owner:relabel
expr: max
by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"))
labels:
workload_type: daemonset
|
ok
|
|
22.237s ago
|
1.232ms |
record: namespace_workload_pod:kube_pod_owner:relabel
expr: max
by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"))
labels:
workload_type: statefulset
|
ok
|
|
22.236s ago
|
167.2us |
|
1m33.824s ago |
342.4ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
record: apiserver_request:availability30d
expr: 1
- ((sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
- sum(increase(apiserver_request_duration_seconds_bucket{le="1",verb=~"POST|PUT|PATCH|DELETE"}[30d])))
+ (sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
- ((sum(increase(apiserver_request_duration_seconds_bucket{le="0.1",scope=~"resource|",verb=~"LIST|GET"}[30d]))
or vector(0)) + sum(increase(apiserver_request_duration_seconds_bucket{le="0.5",scope="namespace",verb=~"LIST|GET"}[30d]))
+ sum(increase(apiserver_request_duration_seconds_bucket{le="5",scope="cluster",verb=~"LIST|GET"}[30d]))))
+ sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)))
/ sum(code:apiserver_request_total:increase30d)
labels:
verb: all
|
ok
|
|
1m33.825s ago
|
101.2ms |
record: apiserver_request:availability30d
expr: 1
- (sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
- ((sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[30d]))
or vector(0)) + sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[30d]))
+ sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[30d])))
+ sum(code:apiserver_request_total:increase30d{code=~"5..",verb="read"}
or vector(0))) / sum(code:apiserver_request_total:increase30d{verb="read"})
labels:
verb: read
|
ok
|
|
1m33.724s ago
|
64.41ms |
record: apiserver_request:availability30d
expr: 1
- ((sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
- sum(increase(apiserver_request_duration_seconds_bucket{le="1",verb=~"POST|PUT|PATCH|DELETE"}[30d])))
+ sum(code:apiserver_request_total:increase30d{code=~"5..",verb="write"}
or vector(0))) / sum(code:apiserver_request_total:increase30d{verb="write"})
labels:
verb: write
|
ok
|
|
1m33.66s ago
|
35.13ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"2..",job="apiserver",verb="LIST"}[30d]))
|
ok
|
|
1m33.624s ago
|
56.47ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"2..",job="apiserver",verb="GET"}[30d]))
|
ok
|
|
1m33.568s ago
|
24.78ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"2..",job="apiserver",verb="POST"}[30d]))
|
ok
|
|
1m33.543s ago
|
11.49ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"2..",job="apiserver",verb="PUT"}[30d]))
|
ok
|
|
1m33.532s ago
|
9.154ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"2..",job="apiserver",verb="PATCH"}[30d]))
|
ok
|
|
1m33.523s ago
|
6.067ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"2..",job="apiserver",verb="DELETE"}[30d]))
|
ok
|
|
1m33.517s ago
|
6.873ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"3..",job="apiserver",verb="LIST"}[30d]))
|
ok
|
|
1m33.51s ago
|
205.6us |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"3..",job="apiserver",verb="GET"}[30d]))
|
ok
|
|
1m33.51s ago
|
130.7us |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"3..",job="apiserver",verb="POST"}[30d]))
|
ok
|
|
1m33.51s ago
|
118.2us |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"3..",job="apiserver",verb="PUT"}[30d]))
|
ok
|
|
1m33.51s ago
|
119.9us |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"3..",job="apiserver",verb="PATCH"}[30d]))
|
ok
|
|
1m33.51s ago
|
125.8us |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"3..",job="apiserver",verb="DELETE"}[30d]))
|
ok
|
|
1m33.51s ago
|
160.2us |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"4..",job="apiserver",verb="LIST"}[30d]))
|
ok
|
|
1m33.509s ago
|
3.05ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"4..",job="apiserver",verb="GET"}[30d]))
|
ok
|
|
1m33.506s ago
|
3.28ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"4..",job="apiserver",verb="POST"}[30d]))
|
ok
|
|
1m33.503s ago
|
1.48ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"4..",job="apiserver",verb="PUT"}[30d]))
|
ok
|
|
1m33.502s ago
|
2.706ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"4..",job="apiserver",verb="PATCH"}[30d]))
|
ok
|
|
1m33.499s ago
|
2.945ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"4..",job="apiserver",verb="DELETE"}[30d]))
|
ok
|
|
1m33.496s ago
|
1.181ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"5..",job="apiserver",verb="LIST"}[30d]))
|
ok
|
|
1m33.495s ago
|
1.762ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"5..",job="apiserver",verb="GET"}[30d]))
|
ok
|
|
1m33.493s ago
|
2.932ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"5..",job="apiserver",verb="POST"}[30d]))
|
ok
|
|
1m33.491s ago
|
2.112ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"5..",job="apiserver",verb="PUT"}[30d]))
|
ok
|
|
1m33.488s ago
|
2.091ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"5..",job="apiserver",verb="PATCH"}[30d]))
|
ok
|
|
1m33.486s ago
|
1.791ms |
record: code_verb:apiserver_request_total:increase30d
expr: sum
by(code, verb) (increase(apiserver_request_total{code=~"5..",job="apiserver",verb="DELETE"}[30d]))
|
ok
|
|
1m33.485s ago
|
181.1us |
record: code:apiserver_request_total:increase30d
expr: sum
by(code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:
verb: read
|
ok
|
|
1m33.485s ago
|
134.7us |
record: code:apiserver_request_total:increase30d
expr: sum
by(code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels:
verb: write
|
ok
|
|
1m33.484s ago
|
191.2us |
|
4.368s ago |
1.32ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
record: apiserver_request:burnrate1d
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
- ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[1d]))
or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[1d]))
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[1d]))))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[1d])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
labels:
verb: read
|
ok
|
|
15.88s ago
|
165.3ms |
record: apiserver_request:burnrate1h
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
- ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[1h]))
or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[1h]))
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[1h]))))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[1h])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
labels:
verb: read
|
ok
|
|
15.715s ago
|
9.586ms |
record: apiserver_request:burnrate2h
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
- ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[2h]))
or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[2h]))
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[2h]))))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[2h])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
labels:
verb: read
|
ok
|
|
15.706s ago
|
19.61ms |
record: apiserver_request:burnrate30m
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
- ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[30m]))
or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[30m]))
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[30m]))))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[30m])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
labels:
verb: read
|
ok
|
|
15.686s ago
|
6.157ms |
record: apiserver_request:burnrate3d
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
- ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[3d]))
or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[3d]))
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[3d]))))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[3d])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
labels:
verb: read
|
ok
|
|
15.68s ago
|
158.8ms |
record: apiserver_request:burnrate5m
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
- ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[5m]))
or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[5m]))
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[5m]))))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[5m])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
|
ok
|
|
15.522s ago
|
5.655ms |
record: apiserver_request:burnrate6h
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
- ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[6h]))
or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[6h]))
+ sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[6h]))))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[6h])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
labels:
verb: read
|
ok
|
|
15.516s ago
|
47.79ms |
record: apiserver_request:burnrate1d
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
- sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[1d])))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
labels:
verb: write
|
ok
|
|
15.469s ago
|
73.49ms |
record: apiserver_request:burnrate1h
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
- sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[1h])))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
labels:
verb: write
|
ok
|
|
15.395s ago
|
5.332ms |
record: apiserver_request:burnrate2h
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
- sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[2h])))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
labels:
verb: write
|
ok
|
|
15.39s ago
|
10.36ms |
record: apiserver_request:burnrate30m
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
- sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[30m])))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
labels:
verb: write
|
ok
|
|
15.38s ago
|
3.109ms |
record: apiserver_request:burnrate3d
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
- sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[3d])))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
labels:
verb: write
|
ok
|
|
15.377s ago
|
83.71ms |
record: apiserver_request:burnrate5m
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
- sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[5m])))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
|
ok
|
|
15.293s ago
|
3.079ms |
record: apiserver_request:burnrate6h
expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
- sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[6h])))
+ sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])))
/ sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
labels:
verb: write
|
ok
|
|
15.29s ago
|
25.03ms |
record: code_resource:apiserver_request_total:rate5m
expr: sum
by(code, resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
|
ok
|
|
15.265s ago
|
3.157ms |
record: code_resource:apiserver_request_total:rate5m
expr: sum
by(code, resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
|
ok
|
|
15.262s ago
|
1.789ms |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
expr: histogram_quantile(0.99,
sum by(le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m])))
> 0
labels:
quantile: "0.99"
verb: read
|
ok
|
|
15.261s ago
|
35.22ms |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
expr: histogram_quantile(0.99,
sum by(le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])))
> 0
labels:
quantile: "0.99"
verb: write
|
ok
|
|
15.225s ago
|
19.28ms |
record: cluster:apiserver_request_duration_seconds:mean5m
expr: sum
without(instance, pod) (rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m]))
/ sum without(instance, pod) (rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m]))
|
ok
|
|
15.206s ago
|
3.242ms |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
expr: histogram_quantile(0.99,
sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])))
labels:
quantile: "0.99"
|
ok
|
|
15.203s ago
|
37.41ms |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
expr: histogram_quantile(0.9,
sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])))
labels:
quantile: "0.9"
|
ok
|
|
15.166s ago
|
38.46ms |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
expr: histogram_quantile(0.5,
sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])))
labels:
quantile: "0.5"
|
ok
|
|
15.127s ago
|
38.91ms |
|
12.61s ago |
1.76ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
alert: KubePodCrashLooping
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics",namespace=~".*"}[5m])
* 60 * 5 > 0
for: 15m
labels:
severity: warning
annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
summary: Pod is crash looping.
|
ok
|
|
18.597s ago
|
4.785ms |
alert: KubePodNotReady
expr: sum
by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",namespace=~".*",phase=~"Pending|Unknown"})
* on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace,
pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}))) > 0
for: 15m
labels:
severity: warning
annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
|
ok
|
|
18.592s ago
|
7.043ms |
alert: KubeDeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation{job="kube-state-metrics",namespace=~".*"}
!= kube_deployment_metadata_generation{job="kube-state-metrics",namespace=~".*"}
for: 15m
labels:
severity: warning
annotations:
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match, this indicates that the Deployment has failed but has not been
rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
summary: Deployment generation mismatch due to possible roll-back
|
ok
|
|
18.585s ago
|
734.2us |
alert: KubeDeploymentReplicasMismatch
expr: (kube_deployment_spec_replicas{job="kube-state-metrics",namespace=~".*"}
!= kube_deployment_status_replicas_available{job="kube-state-metrics",namespace=~".*"})
and (changes(kube_deployment_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m])
== 0)
for: 15m
labels:
severity: warning
annotations:
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
|
ok
|
|
18.585s ago
|
1.414ms |
alert: KubeStatefulSetReplicasMismatch
expr: (kube_statefulset_status_replicas_ready{job="kube-state-metrics",namespace=~".*"}
!= kube_statefulset_status_replicas{job="kube-state-metrics",namespace=~".*"})
and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m])
== 0)
for: 15m
labels:
severity: warning
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not
matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
summary: StatefulSet has not matched the expected number of replicas.
|
ok
|
|
18.584s ago
|
279.7us |
alert: KubeStatefulSetGenerationMismatch
expr: kube_statefulset_status_observed_generation{job="kube-state-metrics",namespace=~".*"}
!= kube_statefulset_metadata_generation{job="kube-state-metrics",namespace=~".*"}
for: 15m
labels:
severity: warning
annotations:
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has not
been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
|
ok
|
|
18.583s ago
|
142.9us |
alert: KubeStatefulSetUpdateNotRolledOut
expr: (max
without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics",namespace=~".*"}
unless kube_statefulset_status_update_revision{job="kube-state-metrics",namespace=~".*"})
* (kube_statefulset_replicas{job="kube-state-metrics",namespace=~".*"}
!= kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}))
and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m])
== 0)
for: 15m
labels:
severity: warning
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
|
ok
|
|
18.583s ago
|
367.8us |
alert: KubeDaemonSetRolloutStuck
expr: ((kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace=~".*"}
!= kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"})
or (kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace=~".*"}
!= 0) or (kube_daemonset_updated_number_scheduled{job="kube-state-metrics",namespace=~".*"}
!= kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"})
or (kube_daemonset_status_number_available{job="kube-state-metrics",namespace=~".*"}
!= kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"}))
and (changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics",namespace=~".*"}[5m])
== 0)
for: 15m
labels:
severity: warning
annotations:
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished
or progressed for at least 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
|
ok
|
|
18.583s ago
|
1.786ms |
alert: KubeContainerWaiting
expr: sum
by(namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",namespace=~".*"})
> 0
for: 1h
labels:
severity: warning
annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
has been in waiting state for longer than 1 hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
|
ok
|
|
18.582s ago
|
26.85ms |
alert: KubeDaemonSetNotScheduled
expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"}
- kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace=~".*"}
> 0
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
summary: DaemonSet pods are not scheduled.
|
ok
|
|
18.555s ago
|
313.9us |
alert: KubeDaemonSetMisScheduled
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace=~".*"}
> 0
for: 15m
labels:
severity: warning
annotations:
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
summary: DaemonSet pods are misscheduled.
|
ok
|
|
18.555s ago
|
338us |
alert: KubeJobCompletion
expr: kube_job_spec_completions{job="kube-state-metrics",namespace=~".*"}
- kube_job_status_succeeded{job="kube-state-metrics",namespace=~".*"}
> 0
for: 12h
labels:
severity: warning
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
12 hours to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
summary: Job did not complete in time
|
ok
|
|
18.555s ago
|
12.11ms |
alert: KubeJobFailed
expr: kube_job_failed{job="kube-state-metrics",namespace=~".*"}
> 0
for: 15m
labels:
severity: warning
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
summary: Job failed to complete.
|
ok
|
|
18.543s ago
|
1.111ms |
alert: KubeHpaReplicasMismatch
expr: (kube_hpa_status_desired_replicas{job="kube-state-metrics",namespace=~".*"}
!= kube_hpa_status_current_replicas{job="kube-state-metrics",namespace=~".*"})
and changes(kube_hpa_status_current_replicas[15m]) == 0
for: 15m
labels:
severity: warning
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired
number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
summary: HPA has not matched the desired number of replicas.
|
ok
|
|
18.542s ago
|
157.1us |
alert: KubeHpaMaxedOut
expr: kube_hpa_status_current_replicas{job="kube-state-metrics",namespace=~".*"}
== kube_hpa_spec_max_replicas{job="kube-state-metrics",namespace=~".*"}
for: 15m
labels:
severity: warning
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max
replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
summary: HPA is running at max replicas
|
ok
|
|
18.541s ago
|
74.64us |
|
16.351s ago |
3.4ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
alert: KubeCPUOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
/ sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores)
- 1) / count(kube_node_status_allocatable_cpu_cores)
for: 5m
labels:
severity: warning
annotations:
description: Cluster has overcommitted CPU resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
summary: Cluster has overcommitted CPU resource requests.
|
ok
|
|
16.352s ago
|
602.9us |
alert: KubeMemoryOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)
/ sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes)
- 1) / count(kube_node_status_allocatable_memory_bytes)
for: 5m
labels:
severity: warning
annotations:
description: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
summary: Cluster has overcommitted memory resource requests.
|
ok
|
|
16.351s ago
|
379.3us |
alert: KubeCPUQuotaOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"})
/ sum(kube_node_status_allocatable_cpu_cores) > 1.5
for: 5m
labels:
severity: warning
annotations:
description: Cluster has overcommitted CPU resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
summary: Cluster has overcommitted CPU resource requests.
|
ok
|
|
16.351s ago
|
189.2us |
alert: KubeMemoryQuotaOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"})
/ sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) >
1.5
for: 5m
labels:
severity: warning
annotations:
description: Cluster has overcommitted memory resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
summary: Cluster has overcommitted memory resource requests.
|
ok
|
|
16.351s ago
|
74.83us |
alert: KubeQuotaAlmostFull
expr: kube_resourcequota{job="kube-state-metrics",type="used"}
/ ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"}
> 0) > 0.9 < 1
for: 15m
labels:
severity: info
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull
summary: Namespace quota is going to be full.
|
ok
|
|
16.351s ago
|
88.05us |
alert: KubeQuotaFullyUsed
expr: kube_resourcequota{job="kube-state-metrics",type="used"}
/ ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"}
> 0) == 1
for: 15m
labels:
severity: info
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
summary: Namespace quota is fully used.
|
ok
|
|
16.351s ago
|
65.61us |
alert: KubeQuotaExceeded
expr: kube_resourcequota{job="kube-state-metrics",type="used"}
/ ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"}
> 0) > 1
for: 15m
labels:
severity: warning
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
summary: Namespace quota has exceeded the limits.
|
ok
|
|
16.351s ago
|
75.63us |
alert: CPUThrottlingHigh
expr: sum
by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m]))
/ sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m]))
> (25 / 100)
for: 15m
labels:
severity: info
annotations:
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{
$labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
}}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
summary: Processes experience elevated CPU throttling.
|
ok
|
|
16.351s ago
|
1.91ms |
|
8.844s ago |
1.04ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"}
> 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
< 604800
labels:
severity: warning
annotations:
description: A client certificate used to authenticate to the apiserver is expiring
in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
summary: Client certificate is about to expire.
|
ok
|
|
22.551s ago
|
542.5us |
alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"}
> 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
< 86400
labels:
severity: critical
annotations:
description: A client certificate used to authenticate to the apiserver is expiring
in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
summary: Client certificate is about to expire.
|
ok
|
|
22.551s ago
|
342us |
alert: AggregatedAPIErrors
expr: sum
by(name, namespace) (increase(aggregator_unavailable_apiservice_count[5m])) >
2
labels:
severity: warning
annotations:
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported
errors. The number of errors has increased for it in the past five minutes. High
values indicate that the availability of the service changes too often.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
summary: An aggregated API has reported errors.
|
ok
|
|
22.551s ago
|
60.75us |
alert: AggregatedAPIDown
expr: (1
- max by(name, namespace) (avg_over_time(aggregator_unavailable_apiservice[10m])))
* 100 < 85
for: 5m
labels:
severity: warning
annotations:
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been
only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
summary: An aggregated API is down.
|
ok
|
|
22.551s ago
|
71.48us |
alert: KubeAPIDown
expr: absent(up{job="apiserver"}
== 1)
for: 15m
labels:
severity: critical
annotations:
description: KubeAPI has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
summary: Target disappeared from Prometheus target discovery.
|
ok
|
|
22.551s ago
|
53.55us |
|
15.854s ago |
160.7us |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
alert: KubeNodeNotReady
expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"}
== 0
for: 15m
labels:
severity: warning
annotations:
description: '{{ $labels.node }} has been unready for more than 15 minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
summary: Node is not ready.
|
ok
|
|
22.661s ago
|
865.5us |
alert: KubeNodeUnreachable
expr: (kube_node_spec_taint{effect="NoSchedule",job="kube-state-metrics",key="node.kubernetes.io/unreachable"}
unless ignoring(key, value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"})
== 1
for: 15m
labels:
severity: warning
annotations:
description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
summary: Node is unreachable.
|
ok
|
|
22.661s ago
|
866.7us |
alert: KubeletTooManyPods
expr: count
by(node) ((kube_pod_status_phase{job="kube-state-metrics",phase="Running"}
== 1) * on(instance, pod, namespace, cluster) group_left(node) topk by(instance,
pod, namespace, cluster) (1, kube_pod_info{job="kube-state-metrics"})) /
max by(node) (kube_node_status_capacity_pods{job="kube-state-metrics"} !=
1) > 0.95
for: 15m
labels:
severity: warning
annotations:
description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
summary: Kubelet is running at capacity.
|
ok
|
|
22.66s ago
|
6.124ms |
alert: KubeNodeReadinessFlapping
expr: sum
by(node) (changes(kube_node_status_condition{condition="Ready",status="true"}[15m]))
> 2
for: 15m
labels:
severity: warning
annotations:
description: The readiness status of node {{ $labels.node }} has changed {{ $value
}} times in the last 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
summary: Node readiness status is flapping.
|
ok
|
|
22.654s ago
|
240.1us |
alert: KubeletPlegDurationHigh
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"}
>= 10
for: 5m
labels:
severity: warning
annotations:
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
|
ok
|
|
22.654s ago
|
153.3us |
alert: KubeletPodStartUpLatencyHigh
expr: histogram_quantile(0.99,
sum by(instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet",metrics_path="/metrics"}[5m])))
* on(instance) group_left(node) kubelet_node_name{job="kubelet",metrics_path="/metrics"}
> 60
for: 15m
labels:
severity: warning
annotations:
description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
summary: Kubelet Pod startup latency is too high.
|
ok
|
|
22.654s ago
|
1.935ms |
alert: KubeletClientCertificateExpiration
expr: kubelet_certificate_manager_client_ttl_seconds
< 604800
labels:
severity: warning
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires in
{{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
|
ok
|
|
22.652s ago
|
49.41us |
alert: KubeletClientCertificateExpiration
expr: kubelet_certificate_manager_client_ttl_seconds
< 86400
labels:
severity: critical
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires in
{{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
|
ok
|
|
22.652s ago
|
29.33us |
alert: KubeletServerCertificateExpiration
expr: kubelet_certificate_manager_server_ttl_seconds
< 604800
labels:
severity: warning
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires in
{{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
|
ok
|
|
22.652s ago
|
25.82us |
alert: KubeletServerCertificateExpiration
expr: kubelet_certificate_manager_server_ttl_seconds
< 86400
labels:
severity: critical
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires in
{{ $value | humanizeDuration }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
|
ok
|
|
22.652s ago
|
25.7us |
alert: KubeletClientCertificateRenewalErrors
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m])
> 0
for: 15m
labels:
severity: warning
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate
({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors
summary: Kubelet has failed to renew its client certificate.
|
ok
|
|
22.652s ago
|
45.84us |
alert: KubeletServerCertificateRenewalErrors
expr: increase(kubelet_server_expiration_renew_errors[5m])
> 0
for: 15m
labels:
severity: warning
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate
({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors
summary: Kubelet has failed to renew its server certificate.
|
ok
|
|
22.652s ago
|
36.25us |
alert: KubeletDown
expr: absent(up{job="kubelet",metrics_path="/metrics"}
== 1)
for: 15m
labels:
severity: critical
annotations:
description: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
summary: Target disappeared from Prometheus target discovery.
|
ok
|
|
22.652s ago
|
162.3us |
|
1.065s ago |
321.6us |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
alert: NodeFilesystemSpaceFillingUp
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"}
/ node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100
< 40 and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h],
24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"}
== 0)
for: 1h
labels:
severity: warning
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left and is filling up.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
ok
|
|
9.957s ago
|
2.623ms |
alert: NodeFilesystemSpaceFillingUp
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"}
/ node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100
< 15 and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h],
4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"}
== 0)
for: 1h
labels:
severity: critical
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left and is filling up fast.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
ok
|
|
9.955s ago
|
2.276ms |
alert: NodeFilesystemAlmostOutOfSpace
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"}
/ node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100
< 5 and node_filesystem_readonly{fstype!="",job="node-exporter"}
== 0)
for: 1h
labels:
severity: warning
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
|
ok
|
|
9.952s ago
|
444.5us |
alert: NodeFilesystemAlmostOutOfSpace
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"}
/ node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100
< 3 and node_filesystem_readonly{fstype!="",job="node-exporter"}
== 0)
for: 1h
labels:
severity: critical
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
|
ok
|
|
9.952s ago
|
364.6us |
alert: NodeFilesystemFilesFillingUp
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"}
/ node_filesystem_files{fstype!="",job="node-exporter"} * 100 <
40 and predict_linear(node_filesystem_files_free{fstype!="",job="node-exporter"}[6h],
24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"}
== 0)
for: 1h
labels:
severity: warning
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available inodes left and is filling up.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
ok
|
|
9.952s ago
|
1.903ms |
alert: NodeFilesystemFilesFillingUp
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"}
/ node_filesystem_files{fstype!="",job="node-exporter"} * 100 <
20 and predict_linear(node_filesystem_files_free{fstype!="",job="node-exporter"}[6h],
4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"}
== 0)
for: 1h
labels:
severity: critical
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available inodes left and is filling up fast.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
ok
|
|
9.95s ago
|
1.866ms |
alert: NodeFilesystemAlmostOutOfFiles
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"}
/ node_filesystem_files{fstype!="",job="node-exporter"} * 100 <
5 and node_filesystem_readonly{fstype!="",job="node-exporter"} ==
0)
for: 1h
labels:
severity: warning
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
|
ok
|
|
9.949s ago
|
363us |
alert: NodeFilesystemAlmostOutOfFiles
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"}
/ node_filesystem_files{fstype!="",job="node-exporter"} * 100 <
3 and node_filesystem_readonly{fstype!="",job="node-exporter"} ==
0)
for: 1h
labels:
severity: critical
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
|
ok
|
|
9.948s ago
|
322.3us |
alert: NodeNetworkReceiveErrs
expr: increase(node_network_receive_errs_total[2m])
> 10
for: 1h
labels:
severity: warning
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
|
ok
|
|
9.948s ago
|
1.054ms |
alert: NodeNetworkTransmitErrs
expr: increase(node_network_transmit_errs_total[2m])
> 10
for: 1h
labels:
severity: warning
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
|
ok
|
|
9.947s ago
|
1.14ms |
alert: NodeHighNumberConntrackEntriesUsed
expr: (node_nf_conntrack_entries
/ node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
summary: Number of conntrack entries is getting close to the limit.
|
ok
|
|
9.946s ago
|
235.6us |
alert: NodeTextFileCollectorScrapeError
expr: node_textfile_scrape_error{job="node-exporter"}
== 1
labels:
severity: warning
annotations:
description: Node Exporter text file collector failed to scrape.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
|
ok
|
|
9.946s ago
|
112.7us |
alert: NodeClockSkewDetected
expr: (node_timex_offset_seconds
> 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds
< -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 10m
labels:
severity: warning
annotations:
message: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure
NTP is configured correctly on this host.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected
summary: Clock skew detected.
|
ok
|
|
9.946s ago
|
630.7us |
alert: NodeClockNotSynchronising
expr: min_over_time(node_timex_sync_status[5m])
== 0
for: 10m
labels:
severity: warning
annotations:
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured
on this host.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising
summary: Clock not synchronising.
|
ok
|
|
9.945s ago
|
106.7us |
alert: NodeRAIDDegraded
expr: node_md_disks_required
- ignoring(state) (node_md_disks{state="active"}) > 0
for: 15m
labels:
severity: critical
annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }}
is in degraded state due to one or more disk failures. Number of spare drives
is insufficient to fix issue automatically.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded
summary: RAID Array is degraded
|
ok
|
|
9.945s ago
|
48.33us |
alert: NodeRAIDDiskFailure
expr: node_md_disks{state="fail"}
> 0
labels:
severity: warning
annotations:
description: At least one device in RAID array on {{ $labels.instance }} failed.
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure
summary: Failed device in RAID array
|
ok
|
|
9.945s ago
|
27.16us |
|
10.732s ago |
1.347ms |
Rule |
State |
Error |
Last Evaluation |
Evaluation Time |
alert: PrometheusBadConfig
expr: max_over_time(prometheus_config_last_reload_successful{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
== 0
for: 10m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload
its configuration.
summary: Failed Prometheus configuration reload.
|
ok
|
|
20.559s ago
|
230.5us |
alert: PrometheusNotificationQueueRunningFull
expr: (predict_linear(prometheus_notifications_queue_length{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m],
60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
for: 15m
labels:
severity: warning
annotations:
description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}}
is running full.
summary: Prometheus alert notification queue predicted to run full in less than
30m.
|
ok
|
|
20.559s ago
|
270.7us |
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
expr: (rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
/ rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
* 100 > 1
for: 15m
labels:
severity: warning
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from
Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
summary: Prometheus has encountered more than 1% errors sending alerts to a specific
Alertmanager.
|
ok
|
|
20.559s ago
|
162.1us |
alert: PrometheusErrorSendingAlertsToAnyAlertmanager
expr: min
without(alertmanager) (rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
/ rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
* 100 > 3
for: 15m
labels:
severity: critical
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
|
ok
|
|
20.559s ago
|
147.4us |
alert: PrometheusNotConnectedToAlertmanagers
expr: max_over_time(prometheus_notifications_alertmanagers_discovered{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
< 1
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to
any Alertmanagers.
summary: Prometheus is not connected to any Alertmanagers.
|
ok
|
|
20.559s ago
|
86.92us |
alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[3h])
> 0
for: 4h
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
| humanize}} reload failures over the last 3h.
summary: Prometheus has issues reloading blocks from disk.
|
ok
|
|
20.559s ago
|
194.1us |
alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[3h])
> 0
for: 4h
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
| humanize}} compaction failures over the last 3h.
summary: Prometheus has issues compacting blocks.
|
ok
|
|
20.559s ago
|
174.3us |
alert: PrometheusNotIngestingSamples
expr: rate(prometheus_tsdb_head_samples_appended_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
<= 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
summary: Prometheus is not ingesting samples.
|
ok
|
|
20.559s ago
|
95.56us |
alert: PrometheusDuplicateTimestamps
expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
> 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf
"%.4g" $value }} samples/s with different values but duplicated timestamp.
summary: Prometheus is dropping samples with duplicate timestamps.
|
ok
|
|
20.559s ago
|
75.05us |
alert: PrometheusOutOfOrderTimestamps
expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
> 0
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf
"%.4g" $value }} samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps.
|
ok
|
|
20.559s ago
|
60.19us |
alert: PrometheusRemoteStorageFailures
expr: (rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
/ (rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
+ rate(prometheus_remote_storage_succeeded_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])))
* 100 > 1
for: 15m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{
printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{
$labels.url }}
summary: Prometheus fails to send samples to remote storage.
|
ok
|
|
20.559s ago
|
173.6us |
alert: PrometheusRemoteWriteBehind
expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
- on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
> 120
for: 15m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{
printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url
}}.
summary: Prometheus remote write is behind.
|
ok
|
|
20.559s ago
|
127.9us |
alert: PrometheusRemoteWriteDesiredShards
expr: (max_over_time(prometheus_remote_storage_shards_desired{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
> max_over_time(prometheus_remote_storage_shards_max{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
for: 15m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired
shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{
$labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="kube-prometheus-stack-prometheus",namespace="monitoring"}`
$labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more than
configured max shards.
|
ok
|
|
20.559s ago
|
79.82us |
alert: PrometheusRuleFailures
expr: increase(prometheus_rule_evaluation_failures_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
> 0
for: 15m
labels:
severity: critical
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate
{{ printf "%.0f" $value }} rules in the last 5m.
summary: Prometheus is failing rule evaluations.
|
ok
|
|
20.559s ago
|
317.6us |
alert: PrometheusMissingRuleEvaluations
expr: increase(prometheus_rule_group_iterations_missed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
> 0
for: 15m
labels:
severity: warning
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf
"%.0f" $value }} rule group evaluations in the last 5m.
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
|
ok
|
|
20.558s ago
|
73.05us |