Rules

alertmanager.rules (last evaluation 14.113s ago, evaluation time 478.3us)

Rule | State | Error | Last Evaluation | Evaluation Time
alert: AlertmanagerConfigInconsistent expr: count by(namespace, service) (count_values by(namespace, service) ("config_hash", alertmanager_config_hash{job="kube-prometheus-stack-alertmanager",namespace="monitoring"})) != 1 for: 5m labels: severity: critical annotations: message: | The configurations of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync. {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }} Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}" {{ end }} ok 14.115s ago 291.2us
alert: AlertmanagerFailedReload expr: alertmanager_config_last_reload_successful{job="kube-prometheus-stack-alertmanager",namespace="monitoring"} == 0 for: 10m labels: severity: warning annotations: message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. ok 14.115s ago 57.11us
alert: AlertmanagerMembersInconsistent expr: alertmanager_cluster_members{job="kube-prometheus-stack-alertmanager",namespace="monitoring"} != on(service) group_left() count by(service) (alertmanager_cluster_members{job="kube-prometheus-stack-alertmanager",namespace="monitoring"}) for: 5m labels: severity: critical annotations: message: Alertmanager has not found all other members of the cluster. ok 14.115s ago 119.6us
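
Each row above flattens one rule's YAML definition together with its state, last evaluation, and evaluation time. As a reading aid, here is a sketch of how the AlertmanagerFailedReload alert from this group would look in an ordinary Prometheus rules file; the group wrapper and indentation are illustrative, the fields are taken from the listing above.

  groups:
  - name: alertmanager.rules
    rules:
    - alert: AlertmanagerFailedReload
      expr: alertmanager_config_last_reload_successful{job="kube-prometheus-stack-alertmanager",namespace="monitoring"} == 0
      for: 10m
      labels:
        severity: warning
      annotations:
        message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.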

etcd (last evaluation 4.496s ago, evaluation time 2.616ms)

Rule | State | Error | Last Evaluation | Evaluation Time
alert: etcdMembersDown expr: max by(job) (sum by(job) (up{job=~".*etcd.*"} == bool 0) or count by(job, endpoint) (sum by(job, endpoint, To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01)) > 0 for: 3m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).' ok 4.498s ago 976.5us
alert: etcdInsufficientMembers expr: sum by(job) (up{job=~".*etcd.*"} == bool 1) < ((count by(job) (up{job=~".*etcd.*"}) + 1) / 2) for: 3m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).' ok 4.497s ago 150.2us
alert: etcdNoLeader expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 for: 1m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.' ok 4.497s ago 74.99us
alert: etcdHighNumberOfLeaderChanges expr: increase((max by(job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0 * absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3 for: 5m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' ok 4.497s ago 216.8us
alert: etcdHighNumberOfFailedGRPCRequests expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) > 1 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' ok 4.497s ago 155.4us
alert: etcdHighNumberOfFailedGRPCRequests expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) > 5 for: 5m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' ok 4.497s ago 177.9us
alert: etcdGRPCRequestsSlow expr: histogram_quantile(0.99, sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job=~".*etcd.*"}[5m]))) > 0.15 for: 10m labels: severity: critical annotations: message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 4.497s ago 111.4us
alert: etcdMemberCommunicationSlow expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 4.497s ago 78.05us
alert: etcdHighNumberOfFailedProposals expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 for: 15m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 15 minutes on etcd instance {{ $labels.instance }}.' ok 4.497s ago 103.8us
alert: etcdHighFsyncDurations expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 4.497s ago 94.53us
alert: etcdHighCommitDurations expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.25 for: 10m labels: severity: warning annotations: message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' ok 4.497s ago 88.52us
alert: etcdHighNumberOfFailedHTTPRequests expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.01 for: 10m labels: severity: warning annotations: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' ok 4.497s ago 181.3us
alert: etcdHighNumberOfFailedHTTPRequests expr: sum by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m])) / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) > 0.05 for: 10m labels: severity: critical annotations: message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.' ok 4.496s ago 134.7us
alert: etcdHTTPRequestsSlow expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 for: 10m labels: severity: warning annotations: message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow. ok 4.496s ago 48.77us
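
Most of the latency alerts in this group share one pattern: take rate() of a histogram's buckets over 5m, feed the result to histogram_quantile() to approximate a percentile, and compare against a threshold. The fsync check from etcdHighFsyncDurations, with the comparison dropped so it can be pasted into the expression browser for ad-hoc inspection:

  # 99th percentile WAL fsync latency per etcd instance over the last 5m;
  # the alert above fires when this stays above 0.5s for 10m.
  histogram_quantile(0.99,
    rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])
  )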

general.rules (last evaluation 23.377s ago, evaluation time 2.824ms)

Rule | State | Error | Last Evaluation | Evaluation Time
alert: TargetDown expr: 100 * (count by(job, namespace, service) (up == 0) / count by(job, namespace, service) (up)) > 10 for: 10m labels: severity: warning annotations: message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' ok 23.377s ago 2.619ms
alert: Watchdog expr: vector(1) labels: severity: none annotations: message: | This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing; therefore, it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example, the "DeadMansSnitch" integration in PagerDuty. ok 23.374s ago 187.3us
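
The Watchdog alert fires permanently on purpose: as long as notifications for it keep arriving, the path from Prometheus through Alertmanager to the receiver is known to work, and silence is the failure signal. A minimal routing sketch for feeding it to a dead man's switch, meant to be merged into an existing Alertmanager configuration; the receiver name and webhook URL are hypothetical placeholders, not part of this deployment:

  route:
    routes:
    - match:
        alertname: Watchdog
      receiver: deadmans-switch          # hypothetical receiver name
      repeat_interval: 5m                # keep re-notifying so that silence means trouble
  receivers:
  - name: deadmans-switch
    webhook_configs:
    - url: https://example.com/heartbeat # placeholder endpoint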

k8s.rules (last evaluation 22.32s ago, evaluation time 86.39ms)

Rule | State | Error | Last Evaluation | Evaluation Time
record: namespace:container_cpu_usage_seconds_total:sum_rate expr: sum by(namespace) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet",metrics_path="/metrics/cadvisor"}[5m])) ok 22.321s ago 4.845ms
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate expr: sum by(cluster, namespace, pod, container) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet",metrics_path="/metrics/cadvisor"}[5m])) * on(cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})) ok 22.317s ago 9.018ms
record: node_namespace_pod_container:container_memory_working_set_bytes expr: container_memory_working_set_bytes{image!="",job="kubelet",metrics_path="/metrics/cadvisor"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""})) ok 22.308s ago 14.47ms
record: node_namespace_pod_container:container_memory_rss expr: container_memory_rss{image!="",job="kubelet",metrics_path="/metrics/cadvisor"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""})) ok 22.293s ago 13.11ms
record: node_namespace_pod_container:container_memory_cache expr: container_memory_cache{image!="",job="kubelet",metrics_path="/metrics/cadvisor"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""})) ok 22.28s ago 13.12ms
record: node_namespace_pod_container:container_memory_swap expr: container_memory_swap{image!="",job="kubelet",metrics_path="/metrics/cadvisor"} * on(namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""})) ok 22.267s ago 13.44ms
record: namespace:container_memory_usage_bytes:sum expr: sum by(namespace) (container_memory_usage_bytes{container!="POD",image!="",job="kubelet",metrics_path="/metrics/cadvisor"}) ok 22.254s ago 2.592ms
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum expr: sum by(namespace) (sum by(namespace, pod) (max by(namespace, pod, container) (kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) * on(namespace, pod) group_left() max by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Running"} == 1))) ok 22.251s ago 6.077ms
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum expr: sum by(namespace) (sum by(namespace, pod) (max by(namespace, pod, container) (kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}) * on(namespace, pod) group_left() max by(namespace, pod) (kube_pod_status_phase{phase=~"Pending|Running"} == 1))) ok 22.245s ago 5.566ms
record: namespace_workload_pod:kube_pod_owner:relabel expr: max by(cluster, namespace, workload, pod) (label_replace(label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)") * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (1, max by(replicaset, namespace, owner_name) (kube_replicaset_owner{job="kube-state-metrics"})), "workload", "$1", "owner_name", "(.*)")) labels: workload_type: deployment ok 22.24s ago 2.716ms
record: namespace_workload_pod:kube_pod_owner:relabel expr: max by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: daemonset ok 22.237s ago 1.232ms
record: namespace_workload_pod:kube_pod_owner:relabel expr: max by(cluster, namespace, workload, pod) (label_replace(kube_pod_owner{job="kube-state-metrics",owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)")) labels: workload_type: statefulset ok 22.236s ago 167.2us
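
Every entry in k8s.rules is a recording rule: the expression is evaluated on the group's schedule and the result is stored under the name in the record field, so dashboards and alerts can read the cheap precomputed series instead of re-running the aggregation. An illustrative comparison (the namespace filter is just an example):

  # Read the precomputed per-namespace CPU usage...
  namespace:container_cpu_usage_seconds_total:sum_rate{namespace="monitoring"}

  # ...rather than recomputing the same aggregation on every dashboard refresh:
  sum by(namespace) (rate(container_cpu_usage_seconds_total{container!="POD",image!="",job="kubelet",metrics_path="/metrics/cadvisor"}[5m]))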

kube-apiserver-availability.rules (last evaluation 1m33.824s ago, evaluation time 342.4ms)

Rule | State | Error | Last Evaluation | Evaluation Time
record: apiserver_request:availability30d expr: 1 - ((sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - sum(increase(apiserver_request_duration_seconds_bucket{le="1",verb=~"POST|PUT|PATCH|DELETE"}[30d]))) + (sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) - ((sum(increase(apiserver_request_duration_seconds_bucket{le="0.1",scope=~"resource|",verb=~"LIST|GET"}[30d])) or vector(0)) + sum(increase(apiserver_request_duration_seconds_bucket{le="0.5",scope="namespace",verb=~"LIST|GET"}[30d])) + sum(increase(apiserver_request_duration_seconds_bucket{le="5",scope="cluster",verb=~"LIST|GET"}[30d])))) + sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))) / sum(code:apiserver_request_total:increase30d) labels: verb: all ok 1m33.825s ago 101.2ms
record: apiserver_request:availability30d expr: 1 - (sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) - ((sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[30d])) or vector(0)) + sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[30d])) + sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[30d]))) + sum(code:apiserver_request_total:increase30d{code=~"5..",verb="read"} or vector(0))) / sum(code:apiserver_request_total:increase30d{verb="read"}) labels: verb: read ok 1m33.724s ago 64.41ms
record: apiserver_request:availability30d expr: 1 - ((sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - sum(increase(apiserver_request_duration_seconds_bucket{le="1",verb=~"POST|PUT|PATCH|DELETE"}[30d]))) + sum(code:apiserver_request_total:increase30d{code=~"5..",verb="write"} or vector(0))) / sum(code:apiserver_request_total:increase30d{verb="write"}) labels: verb: write ok 1m33.66s ago 35.13ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"2..",job="apiserver",verb="LIST"}[30d])) ok 1m33.624s ago 56.47ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"2..",job="apiserver",verb="GET"}[30d])) ok 1m33.568s ago 24.78ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"2..",job="apiserver",verb="POST"}[30d])) ok 1m33.543s ago 11.49ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"2..",job="apiserver",verb="PUT"}[30d])) ok 1m33.532s ago 9.154ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"2..",job="apiserver",verb="PATCH"}[30d])) ok 1m33.523s ago 6.067ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"2..",job="apiserver",verb="DELETE"}[30d])) ok 1m33.517s ago 6.873ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"3..",job="apiserver",verb="LIST"}[30d])) ok 1m33.51s ago 205.6us
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"3..",job="apiserver",verb="GET"}[30d])) ok 1m33.51s ago 130.7us
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"3..",job="apiserver",verb="POST"}[30d])) ok 1m33.51s ago 118.2us
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"3..",job="apiserver",verb="PUT"}[30d])) ok 1m33.51s ago 119.9us
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"3..",job="apiserver",verb="PATCH"}[30d])) ok 1m33.51s ago 125.8us
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"3..",job="apiserver",verb="DELETE"}[30d])) ok 1m33.51s ago 160.2us
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"4..",job="apiserver",verb="LIST"}[30d])) ok 1m33.509s ago 3.05ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"4..",job="apiserver",verb="GET"}[30d])) ok 1m33.506s ago 3.28ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"4..",job="apiserver",verb="POST"}[30d])) ok 1m33.503s ago 1.48ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"4..",job="apiserver",verb="PUT"}[30d])) ok 1m33.502s ago 2.706ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"4..",job="apiserver",verb="PATCH"}[30d])) ok 1m33.499s ago 2.945ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"4..",job="apiserver",verb="DELETE"}[30d])) ok 1m33.496s ago 1.181ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"5..",job="apiserver",verb="LIST"}[30d])) ok 1m33.495s ago 1.762ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"5..",job="apiserver",verb="GET"}[30d])) ok 1m33.493s ago 2.932ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"5..",job="apiserver",verb="POST"}[30d])) ok 1m33.491s ago 2.112ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"5..",job="apiserver",verb="PUT"}[30d])) ok 1m33.488s ago 2.091ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"5..",job="apiserver",verb="PATCH"}[30d])) ok 1m33.486s ago 1.791ms
record: code_verb:apiserver_request_total:increase30d expr: sum by(code, verb) (increase(apiserver_request_total{code=~"5..",job="apiserver",verb="DELETE"}[30d])) ok 1m33.485s ago 181.1us
record: code:apiserver_request_total:increase30d expr: sum by(code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) labels: verb: read ok 1m33.485s ago 134.7us
record: code:apiserver_request_total:increase30d expr: sum by(code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) labels: verb: write ok 1m33.484s ago 191.2us
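
This group is layered: the code_verb:apiserver_request_total:increase30d rules capture 30-day request increases per response code and verb, the code:apiserver_request_total:increase30d rules collapse them into read and write totals, and apiserver_request:availability30d combines those with the latency buckets into a 30-day availability per verb. Assuming the 99% target implied by the 0.01 used in the kube-apiserver-slos group below, the remaining error budget could be read with a query along these lines:

  # Fraction of the 30-day error budget still unspent, assuming a 99% availability target.
  (apiserver_request:availability30d{verb="all"} - 0.99) / (1 - 0.99)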

kube-apiserver-slos (last evaluation 4.368s ago, evaluation time 1.32ms)

Rule | State | Error | Last Evaluation | Evaluation Time
alert: KubeAPIErrorBudgetBurn expr: sum(apiserver_request:burnrate1h) > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01) for: 2m labels: long: 1h severity: critical short: 5m annotations: description: The API server is burning too much error budget. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn summary: The API server is burning too much error budget. ok 4.368s ago 371.3us
alert: KubeAPIErrorBudgetBurn expr: sum(apiserver_request:burnrate6h) > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01) for: 15m labels: long: 6h severity: critical short: 30m annotations: description: The API server is burning too much error budget. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn summary: The API server is burning too much error budget. ok 4.368s ago 158us
alert: KubeAPIErrorBudgetBurn expr: sum(apiserver_request:burnrate1d) > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01) for: 1h labels: long: 1d severity: warning short: 2h annotations: description: The API server is burning too much error budget. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn summary: The API server is burning too much error budget. ok 4.368s ago 159.7us
alert: KubeAPIErrorBudgetBurn expr: sum(apiserver_request:burnrate3d) > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01) for: 3h labels: long: 3d severity: warning short: 6h annotations: description: The API server is burning too much error budget. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn summary: The API server is burning too much error budget. ok 4.368s ago 620us
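
These four alerts are a multi-window, multi-burn-rate setup against a 1% (0.01) error budget over 30 days. The factors read as burn rates: at 14.4x the budget would be exhausted in 30d / 14.4, roughly 2.1 days, so the 1h/5m pair pages as critical; 6x corresponds to 5 days (6h/30m, critical), 3x to 10 days and 1x to 30 days (warning). The short window in each pair confirms the burn is still happening, which keeps the alert from firing long after a transient spike. The first pairing, restated with its threshold spelled out:

  # Critical page: more than 14.4% of requests are counted as errors (14.4 * 1% budget)
  # over both the 1h and the 5m window, sustained for at least 2m.
  sum(apiserver_request:burnrate1h) > (14.4 * 0.01)
    and
  sum(apiserver_request:burnrate5m) > (14.4 * 0.01)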

kube-apiserver.rules (last evaluation 15.88s ago, evaluation time 794.5ms)

Rule | State | Error | Last Evaluation | Evaluation Time
record: apiserver_request:burnrate1d expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[1d])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[1d])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[1d])))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[1d]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) labels: verb: read ok 15.88s ago 165.3ms
record: apiserver_request:burnrate1h expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[1h])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[1h])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[1h])))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[1h]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) labels: verb: read ok 15.715s ago 9.586ms
record: apiserver_request:burnrate2h expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[2h])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[2h])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[2h])))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[2h]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) labels: verb: read ok 15.706s ago 19.61ms
record: apiserver_request:burnrate30m expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[30m])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[30m])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[30m])))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[30m]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) labels: verb: read ok 15.686s ago 6.157ms
record: apiserver_request:burnrate3d expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[3d])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[3d])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[3d])))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[3d]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) labels: verb: read ok 15.68s ago 158.8ms
record: apiserver_request:burnrate5m expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[5m])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[5m])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[5m])))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[5m]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read ok 15.522s ago 5.655ms
record: apiserver_request:burnrate6h expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) - ((sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[6h])) or vector(0)) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[6h])) + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[6h])))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"LIST|GET"}[6h]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) labels: verb: read ok 15.516s ago 47.79ms
record: apiserver_request:burnrate1d expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[1d]))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) labels: verb: write ok 15.469s ago 73.49ms
record: apiserver_request:burnrate1h expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[1h]))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) labels: verb: write ok 15.395s ago 5.332ms
record: apiserver_request:burnrate2h expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[2h]))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) labels: verb: write ok 15.39s ago 10.36ms
record: apiserver_request:burnrate30m expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[30m]))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) labels: verb: write ok 15.38s ago 3.109ms
record: apiserver_request:burnrate3d expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[3d]))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) labels: verb: write ok 15.377s ago 83.71ms
record: apiserver_request:burnrate5m expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write ok 15.293s ago 3.079ms
record: apiserver_request:burnrate6h expr: ((sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="1",verb=~"POST|PUT|PATCH|DELETE"}[6h]))) + sum(rate(apiserver_request_total{code=~"5..",job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))) / sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) labels: verb: write ok 15.29s ago 25.03ms
record: code_resource:apiserver_request_total:rate5m expr: sum by(code, resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read ok 15.265s ago 3.157ms
record: code_resource:apiserver_request_total:rate5m expr: sum by(code, resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write ok 15.262s ago 1.789ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum by(le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 labels: quantile: "0.99" verb: read ok 15.261s ago 35.22ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum by(le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 labels: quantile: "0.99" verb: write ok 15.225s ago 19.28ms
record: cluster:apiserver_request_duration_seconds:mean5m expr: sum without(instance, pod) (rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) / sum without(instance, pod) (rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) ok 15.206s ago 3.242ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m]))) labels: quantile: "0.99" ok 15.203s ago 37.41ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m]))) labels: quantile: "0.9" ok 15.166s ago 38.46ms
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m]))) labels: quantile: "0.5" ok 15.127s ago 38.91ms
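
The read burnrates in this group count slow requests as errors: requests that completed under the per-scope latency thresholds (0.1s for resource-scoped, 0.5s for namespace-scoped, 5s for cluster-scoped LIST/GET) are subtracted from the total, 5xx responses are added, and the sum is divided by all read requests. Simplified to just the latency part, and leaving out the or vector(0) guard used above, the idea looks roughly like this:

  # Share of LIST/GET requests slower than their scope's latency threshold (5m window).
  (
    sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
    -
    (
        sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",scope=~"resource|",verb=~"LIST|GET"}[5m]))
      + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.5",scope="namespace",verb=~"LIST|GET"}[5m]))
      + sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",le="5",scope="cluster",verb=~"LIST|GET"}[5m]))
    )
  )
  /
  sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))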

kube-prometheus-general.rules (last evaluation 12.61s ago, evaluation time 1.76ms)

Rule | State | Error | Last Evaluation | Evaluation Time
record: count:up1 expr: count without(instance, pod, node) (up == 1) ok 12.61s ago 1.034ms
record: count:up0 expr: count without(instance, pod, node) (up == 0) ok 12.609s ago 716.7us

kube-prometheus-node-recording.rules (last evaluation 4.06s ago, evaluation time 16.06ms)

Rule | State | Error | Last Evaluation | Evaluation Time
record: instance:node_cpu:rate:sum expr: sum by(instance) (rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) ok 4.06s ago 2.708ms
record: instance:node_network_receive_bytes:rate:sum expr: sum by(instance) (rate(node_network_receive_bytes_total[3m])) ok 4.057s ago 1.909ms
record: instance:node_network_transmit_bytes:rate:sum expr: sum by(instance) (rate(node_network_transmit_bytes_total[3m])) ok 4.055s ago 1.738ms
record: instance:node_cpu:ratio expr: sum without(cpu, mode) (rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) / on(instance) group_left() count by(instance) (sum by(instance, cpu) (node_cpu_seconds_total)) ok 4.054s ago 5.079ms
record: cluster:node_cpu:sum_rate5m expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) ok 4.049s ago 2.103ms
record: cluster:node_cpu:ratio expr: cluster:node_cpu_seconds_total:rate5m / count(sum by(instance, cpu) (node_cpu_seconds_total)) ok 4.047s ago 2.505ms

kube-scheduler.rules (last evaluation 24.092s ago, evaluation time 2.901ms)

Rule | State | Error | Last Evaluation | Evaluation Time
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 24.092s ago 541.2us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 24.092s ago 333.5us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.99" ok 24.091s ago 280.4us
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 24.091s ago 324.5us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 24.091s ago 270.4us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.9" ok 24.091s ago 250.9us
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 24.091s ago 244.7us
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 24.09s ago 363.7us
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum without(instance, pod) (rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))) labels: quantile: "0.5" ok 24.09s ago 274.3us

kube-state-metrics (last evaluation 10.305s ago, evaluation time 348.8us)

Rule | State | Error | Last Evaluation | Evaluation Time
alert: KubeStateMetricsListErrors expr: (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) / sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) > 0.01 for: 15m labels: severity: critical annotations: description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors summary: kube-state-metrics is experiencing errors in list operations. ok 10.306s ago 239.5us
alert: KubeStateMetricsWatchErrors expr: (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) / sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) > 0.01 for: 15m labels: severity: critical annotations: description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors summary: kube-state-metrics is experiencing errors in watch operations. ok 10.305s ago 100.5us

kubelet.rules (last evaluation 26.091s ago, evaluation time 6.186ms)

Rule | State | Error | Last Evaluation | Evaluation Time
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile expr: histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(instance) group_left(node) kubelet_node_name{job="kubelet",metrics_path="/metrics"}) labels: quantile: "0.99" ok 26.091s ago 2.343ms
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile expr: histogram_quantile(0.9, sum by(instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(instance) group_left(node) kubelet_node_name{job="kubelet",metrics_path="/metrics"}) labels: quantile: "0.9" ok 26.089s ago 2ms
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile expr: histogram_quantile(0.5, sum by(instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) * on(instance) group_left(node) kubelet_node_name{job="kubelet",metrics_path="/metrics"}) labels: quantile: "0.5" ok 26.087s ago 1.83ms
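
These three rules use a common kube-prometheus join idiom: the PLEG histogram only carries an instance label, so the expression multiplies by kubelet_node_name (an info-style series whose value is 1 and whose node label names the node) with on(instance) group_left(node), which attaches the node label without changing any values. The join in isolation:

  # Copy the node label from kubelet_node_name onto the instance-scoped PLEG series.
  sum by(instance, le) (rate(kubelet_pleg_relist_duration_seconds_bucket[5m]))
    * on(instance) group_left(node)
  kubelet_node_name{job="kubelet",metrics_path="/metrics"}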

kubernetes-apps (last evaluation 18.597s ago, evaluation time 57.54ms)

Rule | State | Error | Last Evaluation | Evaluation Time
alert: KubePodCrashLooping expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics",namespace=~".*"}[5m]) * 60 * 5 > 0 for: 15m labels: severity: warning annotations: description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping summary: Pod is crash looping. ok 18.597s ago 4.785ms
alert: KubePodNotReady expr: sum by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",namespace=~".*",phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}))) > 0 for: 15m labels: severity: warning annotations: description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready summary: Pod has been in a non-ready state for more than 15 minutes. ok 18.592s ago 7.043ms
alert: KubeDeploymentGenerationMismatch expr: kube_deployment_status_observed_generation{job="kube-state-metrics",namespace=~".*"} != kube_deployment_metadata_generation{job="kube-state-metrics",namespace=~".*"} for: 15m labels: severity: warning annotations: description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch summary: Deployment generation mismatch due to possible roll-back ok 18.585s ago 734.2us
alert: KubeDeploymentReplicasMismatch expr: (kube_deployment_spec_replicas{job="kube-state-metrics",namespace=~".*"} != kube_deployment_status_replicas_available{job="kube-state-metrics",namespace=~".*"}) and (changes(kube_deployment_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m]) == 0) for: 15m labels: severity: warning annotations: description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch summary: Deployment has not matched the expected number of replicas. ok 18.585s ago 1.414ms
alert: KubeStatefulSetReplicasMismatch expr: (kube_statefulset_status_replicas_ready{job="kube-state-metrics",namespace=~".*"} != kube_statefulset_status_replicas{job="kube-state-metrics",namespace=~".*"}) and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m]) == 0) for: 15m labels: severity: warning annotations: description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch summary: StatefulSet has not matched the expected number of replicas. ok 18.584s ago 279.7us
alert: KubeStatefulSetGenerationMismatch expr: kube_statefulset_status_observed_generation{job="kube-state-metrics",namespace=~".*"} != kube_statefulset_metadata_generation{job="kube-state-metrics",namespace=~".*"} for: 15m labels: severity: warning annotations: description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch summary: StatefulSet generation mismatch due to possible roll-back ok 18.583s ago 142.9us
alert: KubeStatefulSetUpdateNotRolledOut expr: (max without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics",namespace=~".*"} unless kube_statefulset_status_update_revision{job="kube-state-metrics",namespace=~".*"}) * (kube_statefulset_replicas{job="kube-state-metrics",namespace=~".*"} != kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"})) and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m]) == 0) for: 15m labels: severity: warning annotations: description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout summary: StatefulSet update has not been rolled out. ok 18.583s ago 367.8us
alert: KubeDaemonSetRolloutStuck expr: ((kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace=~".*"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"}) or (kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace=~".*"} != 0) or (kube_daemonset_updated_number_scheduled{job="kube-state-metrics",namespace=~".*"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"}) or (kube_daemonset_status_number_available{job="kube-state-metrics",namespace=~".*"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"})) and (changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics",namespace=~".*"}[5m]) == 0) for: 15m labels: severity: warning annotations: description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck summary: DaemonSet rollout is stuck. ok 18.583s ago 1.786ms
alert: KubeContainerWaiting expr: sum by(namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",namespace=~".*"}) > 0 for: 1h labels: severity: warning annotations: description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting summary: Pod container waiting longer than 1 hour ok 18.582s ago 26.85ms
alert: KubeDaemonSetNotScheduled expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace=~".*"} > 0 for: 10m labels: severity: warning annotations: description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled summary: DaemonSet pods are not scheduled. ok 18.555s ago 313.9us
alert: KubeDaemonSetMisScheduled expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace=~".*"} > 0 for: 15m labels: severity: warning annotations: description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled summary: DaemonSet pods are misscheduled. ok 18.555s ago 338us
alert: KubeJobCompletion expr: kube_job_spec_completions{job="kube-state-metrics",namespace=~".*"} - kube_job_status_succeeded{job="kube-state-metrics",namespace=~".*"} > 0 for: 12h labels: severity: warning annotations: description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion summary: Job did not complete in time ok 18.555s ago 12.11ms
alert: KubeJobFailed expr: kube_job_failed{job="kube-state-metrics",namespace=~".*"} > 0 for: 15m labels: severity: warning annotations: description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed summary: Job failed to complete. ok 18.543s ago 1.111ms
alert: KubeHpaReplicasMismatch expr: (kube_hpa_status_desired_replicas{job="kube-state-metrics",namespace=~".*"} != kube_hpa_status_current_replicas{job="kube-state-metrics",namespace=~".*"}) and changes(kube_hpa_status_current_replicas[15m]) == 0 for: 15m labels: severity: warning annotations: description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch summary: HPA has not matched desired number of replicas. ok 18.542s ago 157.1us
alert: KubeHpaMaxedOut expr: kube_hpa_status_current_replicas{job="kube-state-metrics",namespace=~".*"} == kube_hpa_spec_max_replicas{job="kube-state-metrics",namespace=~".*"} for: 15m labels: severity: warning annotations: description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout summary: HPA is running at max replicas ok 18.541s ago 74.64us
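
KubePodCrashLooping converts a per-second restart rate into "restarts per five minutes": rate(...[5m]) averages restarts per second over the window and the * 60 * 5 rescales it, so any restart inside the window pushes the value above zero, and the alert fires only if that keeps happening for 15 minutes. A rough worked example:

  # One container restart within the last 5 minutes:
  #   rate(kube_pod_container_status_restarts_total[5m]) ≈ 1 / 300 ≈ 0.0033 restarts/s
  #   0.0033 * 60 * 5 ≈ 1 restart per 5 minutes  ->  > 0, so the alert goes pending
  rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5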

kubernetes-resources (last evaluation 16.351s ago, evaluation time 3.4ms)

Rule | State | Error | Last Evaluation | Evaluation Time
alert: KubeCPUOvercommit expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum) / sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores) for: 5m labels: severity: warning annotations: description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit summary: Cluster has overcommitted CPU resource requests. ok 16.352s ago 602.9us
alert: KubeMemoryOvercommit expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum) / sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes) - 1) / count(kube_node_status_allocatable_memory_bytes) for: 5m labels: severity: warning annotations: description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit summary: Cluster has overcommitted memory resource requests. ok 16.351s ago 379.3us
alert: KubeCPUQuotaOvercommit expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"}) / sum(kube_node_status_allocatable_cpu_cores) > 1.5 for: 5m labels: severity: warning annotations: description: Cluster has overcommitted CPU resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit summary: Cluster has overcommitted CPU resource requests. ok 16.351s ago 189.2us
alert: KubeMemoryQuotaOvercommit expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"}) / sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) > 1.5 for: 5m labels: severity: warning annotations: description: Cluster has overcommitted memory resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit summary: Cluster has overcommitted memory resource requests. ok 16.351s ago 74.83us
alert: KubeQuotaAlmostFull expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 0.9 < 1 for: 15m labels: severity: info annotations: description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull summary: Namespace quota is going to be full. ok 16.351s ago 88.05us
alert: KubeQuotaFullyUsed expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) == 1 for: 15m labels: severity: info annotations: description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused summary: Namespace quota is fully used. ok 16.351s ago 65.61us
alert: KubeQuotaExceeded expr: kube_resourcequota{job="kube-state-metrics",type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0) > 1 for: 15m labels: severity: warning annotations: description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded summary: Namespace quota has exceeded the limits. ok 16.351s ago 75.63us
alert: CPUThrottlingHigh expr: sum by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m])) > (25 / 100) for: 15m labels: severity: info annotations: description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh summary: Processes experience elevated CPU throttling. ok 16.351s ago 1.91ms
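
The CPU and memory overcommit alerts compare the sum of Pod resource requests against what the cluster could still schedule after losing one node: (count(nodes) - 1) / count(nodes) is the share of allocatable capacity that would remain. A worked example on a hypothetical three-node cluster with 4 allocatable cores per node:

  # Hypothetical cluster: 3 nodes x 4 cores = 12 allocatable cores.
  #   threshold = (3 - 1) / 3 = 0.667
  #   the alert fires once total CPU requests exceed 0.667 * 12 = 8 cores,
  #   because a single node failure would leave only 8 cores to reschedule onto.
  sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
    / sum(kube_node_status_allocatable_cpu_cores)
    > (count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores)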

kubernetes-storage (last evaluation 8.844s ago, evaluation time 1.04ms)

Rule | State | Error | Last Evaluation | Evaluation Time
alert: KubePersistentVolumeFillingUp expr: kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"} < 0.03 for: 1m labels: severity: critical annotations: description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup summary: PersistentVolume is filling up. ok 8.844s ago 226.5us
alert: KubePersistentVolumeFillingUp expr: (kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"}) < 0.15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"}[6h], 4 * 24 * 3600) < 0 for: 1h labels: severity: warning annotations: description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup summary: PersistentVolume is filling up. ok 8.844s ago 682.8us
alert: KubePersistentVolumeErrors expr: kube_persistentvolume_status_phase{job="kube-state-metrics",phase=~"Failed|Pending"} > 0 for: 5m labels: severity: critical annotations: description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors summary: PersistentVolume is having issues with provisioning. ok 8.843s ago 121.3us
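
The second KubePersistentVolumeFillingUp rule pairs a static 15% threshold with predict_linear, which fits a linear trend to the last 6h of available bytes and extrapolates it 4 * 24 * 3600 seconds (four days) ahead; a negative prediction means the volume is on course to fill up. The extrapolation on its own, for checking a single claim ad hoc ("my-claim" is a placeholder PVC name, not one from this cluster):

  # Predicted bytes available four days from now, based on the last 6h trend.
  predict_linear(
    kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",persistentvolumeclaim="my-claim"}[6h],
    4 * 24 * 3600
  )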

kubernetes-system-apiserver (last evaluation 22.551s ago, evaluation time 1.082ms)

Rule | State | Error | Last Evaluation | Evaluation Time
alert: KubeClientCertificateExpiration expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 labels: severity: warning annotations: description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration summary: Client certificate is about to expire. ok 22.551s ago 542.5us
alert: KubeClientCertificateExpiration expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 labels: severity: critical annotations: description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration summary: Client certificate is about to expire. ok 22.551s ago 342us
alert: AggregatedAPIErrors expr: sum by(name, namespace) (increase(aggregator_unavailable_apiservice_count[5m])) > 2 labels: severity: warning annotations: description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors summary: An aggregated API has reported errors. ok 22.551s ago 60.75us
alert: AggregatedAPIDown expr: (1 - max by(name, namespace) (avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 for: 5m labels: severity: warning annotations: description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown summary: An aggregated API is down. ok 22.551s ago 71.48us
alert: KubeAPIDown expr: absent(up{job="apiserver"} == 1) for: 15m labels: severity: critical annotations: description: KubeAPI has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown summary: Target disappeared from Prometheus target discovery. ok 22.551s ago 53.55us
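
The KubeClientCertificateExpiration pair above shares one shape: the rule only evaluates when the apiserver has actually observed client certificates (the ..._count series is above zero), and it fires when the 1st percentile of their remaining lifetime drops below a cutoff expressed in seconds: 604800 s (7 * 24 * 3600, i.e. 7 days) for the warning and 86400 s (24 * 3600, i.e. 24 hours) for the critical. A sketch of the same pair with the arithmetic spelled out; group and alert names here are illustrative.

groups:
  - name: client-cert-expiry-example
    rules:
      - alert: ExampleClientCertExpiringSoon
        # 1st percentile of remaining client-certificate lifetime below 7 days
        # (7 * 24 * 3600 = 604800 seconds).
        expr: |
          apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
          and on(job)
          histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
        labels:
          severity: warning
      - alert: ExampleClientCertExpiringVerySoon
        # Same quantile below 24 hours (24 * 3600 = 86400 seconds).
        expr: |
          apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
          and on(job)
          histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
        labels:
          severity: critical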

kubernetes-system-controller-manager

15.854s ago

160.7us

Rule State Error Last Evaluation Evaluation Time
alert: KubeControllerManagerDown expr: absent(up{job="kube-controller-manager"} == 1) for: 15m labels: severity: critical annotations: description: KubeControllerManager has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown summary: Target disappeared from Prometheus target discovery. ok 15.854s ago 156.8us
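
KubeControllerManagerDown uses the same down-detection idiom as KubeAPIDown above and as KubeSchedulerDown and KubeletDown further below: absent(up{job="..."} == 1) returns nothing while at least one target of the job reports up == 1, and a single series with value 1 once none does, so the alert fires after that condition has held for 15 minutes. A generic sketch of the idiom with an example job label:

groups:
  - name: target-down-example
    rules:
      - alert: ExampleComponentDown
        # absent() cannot tell "job not scraped" from "job scraped but down"; either way
        # no series currently satisfies up == 1, which is exactly what we want to catch.
        # The for: 15m holds the alert back over short scrape gaps.
        expr: absent(up{job="example-component"} == 1)
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: Target disappeared from Prometheus target discovery.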

kubernetes-system-kubelet

22.661s ago

10.58ms

Rule State Error Last Evaluation Evaluation Time
alert: KubeNodeNotReady expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} == 0 for: 15m labels: severity: warning annotations: description: '{{ $labels.node }} has been unready for more than 15 minutes.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready summary: Node is not ready. ok 22.661s ago 865.5us
alert: KubeNodeUnreachable expr: (kube_node_spec_taint{effect="NoSchedule",job="kube-state-metrics",key="node.kubernetes.io/unreachable"} unless ignoring(key, value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 for: 15m labels: severity: warning annotations: description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable summary: Node is unreachable. ok 22.661s ago 866.7us
alert: KubeletTooManyPods expr: count by(node) ((kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance, pod, namespace, cluster) group_left(node) topk by(instance, pod, namespace, cluster) (1, kube_pod_info{job="kube-state-metrics"})) / max by(node) (kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) > 0.95 for: 15m labels: severity: warning annotations: description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods summary: Kubelet is running at capacity. ok 22.66s ago 6.124ms
alert: KubeNodeReadinessFlapping expr: sum by(node) (changes(kube_node_status_condition{condition="Ready",status="true"}[15m])) > 2 for: 15m labels: severity: warning annotations: description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping summary: Node readiness status is flapping. ok 22.654s ago 240.1us
alert: KubeletPlegDurationHigh expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 for: 5m labels: severity: warning annotations: description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. ok 22.654s ago 153.3us
alert: KubeletPodStartUpLatencyHigh expr: histogram_quantile(0.99, sum by(instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet",metrics_path="/metrics"}[5m]))) * on(instance) group_left(node) kubelet_node_name{job="kubelet",metrics_path="/metrics"} > 60 for: 15m labels: severity: warning annotations: description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh summary: Kubelet Pod startup latency is too high. ok 22.654s ago 1.935ms
alert: KubeletClientCertificateExpiration expr: kubelet_certificate_manager_client_ttl_seconds < 604800 labels: severity: warning annotations: description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration summary: Kubelet client certificate is about to expire. ok 22.652s ago 49.41us
alert: KubeletClientCertificateExpiration expr: kubelet_certificate_manager_client_ttl_seconds < 86400 labels: severity: critical annotations: description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration summary: Kubelet client certificate is about to expire. ok 22.652s ago 29.33us
alert: KubeletServerCertificateExpiration expr: kubelet_certificate_manager_server_ttl_seconds < 604800 labels: severity: warning annotations: description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration summary: Kubelet server certificate is about to expire. ok 22.652s ago 25.82us
alert: KubeletServerCertificateExpiration expr: kubelet_certificate_manager_server_ttl_seconds < 86400 labels: severity: critical annotations: description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration summary: Kubelet server certificate is about to expire. ok 22.652s ago 25.7us
alert: KubeletClientCertificateRenewalErrors expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 for: 15m labels: severity: warning annotations: description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors summary: Kubelet has failed to renew its client certificate. ok 22.652s ago 45.84us
alert: KubeletServerCertificateRenewalErrors expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0 for: 15m labels: severity: warning annotations: description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors summary: Kubelet has failed to renew its server certificate. ok 22.652s ago 36.25us
alert: KubeletDown expr: absent(up{job="kubelet",metrics_path="/metrics"} == 1) for: 15m labels: severity: critical annotations: description: Kubelet has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown summary: Target disappeared from Prometheus target discovery. ok 22.652s ago 162.3us
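
KubeletTooManyPods is the densest expression in this group because kube_pod_status_phase carries no node label: the pod-to-node mapping has to be borrowed from kube_pod_info with a group_left(node) join before running pods can be counted per node and compared against capacity. A simplified sketch that splits the expression into two hypothetical recording rules; the rule names are illustrative and the capacity filter of the original is omitted.

groups:
  - name: pods-per-node-example
    rules:
      # Running pods per node. topk(1, ...) deduplicates kube_pod_info in case a pod
      # briefly appears twice, and group_left(node) copies its node label onto the
      # phase series being counted.
      - record: node:running_pods:count
        expr: |
          count by(node) (
            (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1)
            * on(instance, pod, namespace, cluster) group_left(node)
            topk by(instance, pod, namespace, cluster) (1, kube_pod_info{job="kube-state-metrics"})
          )
      # Fraction of the kubelet's pod capacity in use; KubeletTooManyPods alerts when
      # the equivalent ratio stays above 0.95 for 15 minutes.
      - record: node:pod_capacity_used:ratio
        expr: |
          node:running_pods:count
            /
          max by(node) (kube_node_status_capacity_pods{job="kube-state-metrics"})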

kubernetes-system-scheduler

1.065s ago

321.6us

Rule State Error Last Evaluation Evaluation Time
alert: KubeSchedulerDown expr: absent(up{job="kube-scheduler"} == 1) for: 15m labels: severity: critical annotations: description: KubeScheduler has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown summary: Target disappeared from Prometheus target discovery. ok 1.065s ago 313.2us

kubernetes-system

13.811s ago

2.22ms

Rule State Error Last Evaluation Evaluation Time
alert: KubeVersionMismatch expr: count(count by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"}, "gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*).*"))) > 1 for: 15m labels: severity: warning annotations: description: There are {{ $value }} different semantic versions of Kubernetes components running. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch summary: Different semantic versions of Kubernetes components running. ok 13.811s ago 576.1us
alert: KubeClientErrors expr: (sum by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) / sum by(instance, job) (rate(rest_client_requests_total[5m]))) > 0.01 for: 15m labels: severity: warning annotations: description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors summary: Kubernetes API server client is experiencing errors. ok 13.811s ago 1.633ms
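
KubeVersionMismatch relies on label_replace to truncate gitVersion from kubernetes_build_info to its vMAJOR.MINOR prefix before counting distinct values; more than one distinct value means mixed component versions in the cluster. The same extraction written as a recording rule with a hypothetical name:

groups:
  - name: version-skew-example
    rules:
      # label_replace keeps only the leading "vMAJOR.MINOR" of gitVersion, so the outer
      # count() is the number of distinct minor versions currently running;
      # KubeVersionMismatch above alerts when this exceeds 1 for 15 minutes.
      - record: cluster:kubernetes_minor_versions:count
        expr: |
          count(
            count by(gitVersion) (
              label_replace(kubernetes_build_info{job!~"kube-dns|coredns"}, "gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*).*")
            )
          )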

node-exporter.rules

2.234s ago

19.85ms

Rule State Error Last Evaluation Evaluation Time
record: instance:node_num_cpu:sum expr: count without(cpu) (count without(mode) (node_cpu_seconds_total{job="node-exporter"})) ok 2.234s ago 3.228ms
record: instance:node_cpu_utilisation:rate1m expr: 1 - avg without(cpu, mode) (rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) ok 2.23s ago 806.3us
record: instance:node_load1_per_cpu:ratio expr: (node_load1{job="node-exporter"} / instance:node_num_cpu:sum{job="node-exporter"}) ok 2.23s ago 399.1us
record: instance:node_memory_utilisation:ratio expr: 1 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}) ok 2.229s ago 442.9us
record: instance:node_vmstat_pgmajfault:rate1m expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) ok 2.229s ago 229.6us
record: instance_device:node_disk_io_time_seconds:rate1m expr: rate(node_disk_io_time_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+",job="node-exporter"}[1m]) ok 2.229s ago 649.1us
record: instance_device:node_disk_io_time_weighted_seconds:rate1m expr: rate(node_disk_io_time_weighted_seconds_total{device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+",job="node-exporter"}[1m]) ok 2.228s ago 671.7us
record: instance:node_network_receive_bytes_excluding_lo:rate1m expr: sum without(device) (rate(node_network_receive_bytes_total{device!="lo",job="node-exporter"}[1m])) ok 2.228s ago 7.514ms
record: instance:node_network_transmit_bytes_excluding_lo:rate1m expr: sum without(device) (rate(node_network_transmit_bytes_total{device!="lo",job="node-exporter"}[1m])) ok 2.22s ago 1.807ms
record: instance:node_network_receive_drop_excluding_lo:rate1m expr: sum without(device) (rate(node_network_receive_drop_total{device!="lo",job="node-exporter"}[1m])) ok 2.218s ago 2.787ms
record: instance:node_network_transmit_drop_excluding_lo:rate1m expr: sum without(device) (rate(node_network_transmit_drop_total{device!="lo",job="node-exporter"}[1m])) ok 2.216s ago 1.289ms
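
These recording rules precompute per-instance utilisation figures so that dashboards and other rules can read a single series instead of re-evaluating the raw node-exporter metrics. They also build on each other: instance:node_load1_per_cpu:ratio divides by instance:node_num_cpu:sum, so the CPU-count rule is listed first in the group. The pair would look like this as a plain rule file; the expressions are copied verbatim from the entries above and only the group name is illustrative.

groups:
  - name: node-exporter-recording-example
    rules:
      # One series per instance: count the distinct cpu labels (collapsing mode first).
      - record: instance:node_num_cpu:sum
        expr: count without(cpu) (count without(mode) (node_cpu_seconds_total{job="node-exporter"}))
      # 1-minute load average normalised by the CPU count recorded just above.
      - record: instance:node_load1_per_cpu:ratio
        expr: (node_load1{job="node-exporter"} / instance:node_num_cpu:sum{job="node-exporter"})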

node-exporter

9.957s ago

13.54ms

Rule State Error Last Evaluation Evaluation Time
alert: NodeFilesystemSpaceFillingUp expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 40 and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) for: 1h labels: severity: warning annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup summary: Filesystem is predicted to run out of space within the next 24 hours. ok 9.957s ago 2.623ms
alert: NodeFilesystemSpaceFillingUp expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 15 and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) for: 1h labels: severity: critical annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup summary: Filesystem is predicted to run out of space within the next 4 hours. ok 9.955s ago 2.276ms
alert: NodeFilesystemAlmostOutOfSpace expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 5 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) for: 1h labels: severity: warning annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace summary: Filesystem has less than 5% space left. ok 9.952s ago 444.5us
alert: NodeFilesystemAlmostOutOfSpace expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"} / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100 < 3 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) for: 1h labels: severity: critical annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace summary: Filesystem has less than 3% space left. ok 9.952s ago 364.6us
alert: NodeFilesystemFilesFillingUp expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 40 and predict_linear(node_filesystem_files_free{fstype!="",job="node-exporter"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) for: 1h labels: severity: warning annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup summary: Filesystem is predicted to run out of inodes within the next 24 hours. ok 9.952s ago 1.903ms
alert: NodeFilesystemFilesFillingUp expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 20 and predict_linear(node_filesystem_files_free{fstype!="",job="node-exporter"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) for: 1h labels: severity: critical annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup summary: Filesystem is predicted to run out of inodes within the next 4 hours. ok 9.95s ago 1.866ms
alert: NodeFilesystemAlmostOutOfFiles expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 5 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) for: 1h labels: severity: warning annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles summary: Filesystem has less than 5% inodes left. ok 9.949s ago 363us
alert: NodeFilesystemAlmostOutOfFiles expr: (node_filesystem_files_free{fstype!="",job="node-exporter"} / node_filesystem_files{fstype!="",job="node-exporter"} * 100 < 3 and node_filesystem_readonly{fstype!="",job="node-exporter"} == 0) for: 1h labels: severity: critical annotations: description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles summary: Filesystem has less than 3% inodes left. ok 9.948s ago 322.3us
alert: NodeNetworkReceiveErrs expr: increase(node_network_receive_errs_total[2m]) > 10 for: 1h labels: severity: warning annotations: description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs summary: Network interface is reporting many receive errors. ok 9.948s ago 1.054ms
alert: NodeNetworkTransmitErrs expr: increase(node_network_transmit_errs_total[2m]) > 10 for: 1h labels: severity: warning annotations: description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs summary: Network interface is reporting many transmit errors. ok 9.947s ago 1.14ms
alert: NodeHighNumberConntrackEntriesUsed expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 labels: severity: warning annotations: description: '{{ $value | humanizePercentage }} of conntrack entries are used.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused summary: Number of conntrack entries is getting close to the limit. ok 9.946s ago 235.6us
alert: NodeTextFileCollectorScrapeError expr: node_textfile_scrape_error{job="node-exporter"} == 1 labels: severity: warning annotations: description: Node Exporter text file collector failed to scrape. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror summary: Node Exporter text file collector failed to scrape. ok 9.946s ago 112.7us
alert: NodeClockSkewDetected expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) for: 10m labels: severity: warning annotations: message: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected summary: Clock skew detected. ok 9.946s ago 630.7us
alert: NodeClockNotSynchronising expr: min_over_time(node_timex_sync_status[5m]) == 0 for: 10m labels: severity: warning annotations: message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising summary: Clock not synchronising. ok 9.945s ago 106.7us
alert: NodeRAIDDegraded expr: node_md_disks_required - ignoring(state) (node_md_disks{state="active"}) > 0 for: 15m labels: severity: critical annotations: description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded summary: RAID Array is degraded. ok 9.945s ago 48.33us
alert: NodeRAIDDiskFailure expr: node_md_disks{state="fail"} > 0 labels: severity: warning annotations: description: At least one device in the RAID array on {{ $labels.instance }} has failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure summary: Failed device in RAID array. ok 9.945s ago 27.16us
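
NodeClockSkewDetected is worth a second look: it does not alert on the offset alone but also requires the 5-minute derivative to point away from zero, so a clock that is already drifting back towards zero does not fire the alert. A minimal sketch of that pattern; the expression is copied from the rule above and the alert name is illustrative.

groups:
  - name: clock-skew-example
    rules:
      - alert: ExampleClockSkew
        # Offset beyond +/-0.05s AND deriv() shows it is not shrinking in that direction.
        expr: |
          (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0)
          or
          (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Clock skew detected.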

node-network

10.732s ago

1.347ms

Rule State Error Last Evaluation Evaluation Time
alert: NodeNetworkInterfaceFlapping expr: changes(node_network_up{device!~"veth.+",job="node-exporter"}[2m]) > 2 for: 2m labels: severity: warning annotations: message: Network interface "{{ $labels.device }}" is changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. ok 10.732s ago 1.339ms
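
The flapping check counts state transitions rather than looking at the current state: changes() over a 2-minute window returns how many times node_network_up changed value, and veth* devices are excluded because pod interfaces are created and torn down by design. The same pattern as a standalone sketch, with the threshold and window copied from the rule above:

groups:
  - name: interface-flapping-example
    rules:
      - alert: ExampleInterfaceFlapping
        # More than two up/down transitions within 2 minutes, sustained for 2 minutes.
        expr: changes(node_network_up{device!~"veth.+",job="node-exporter"}[2m]) > 2
        for: 2m
        labels:
          severity: warning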

node.rules

3.066s ago

14.42ms

Rule State Error Last Evaluation Evaluation Time
record: :kube_pod_info_node_count: expr: sum(min by(cluster, node) (kube_pod_info{node!=""})) ok 3.066s ago 3.029ms
record: node_namespace_pod:kube_pod_info: expr: topk by(namespace, pod) (1, max by(node, namespace, pod) (label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)"))) ok 3.063s ago 5.507ms
record: node:node_num_cpu:sum expr: count by(cluster, node) (sum by(node, cpu) (node_cpu_seconds_total{job="node-exporter"} * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:)) ok 3.057s ago 5.134ms
record: :node_memory_MemAvailable_bytes:sum expr: sum by(cluster) (node_memory_MemAvailable_bytes{job="node-exporter"} or (node_memory_Buffers_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Slab_bytes{job="node-exporter"})) ok 3.052s ago 732.3us
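
node_namespace_pod:kube_pod_info: is the glue for the rest of this group: in a typical kube-prometheus setup node-exporter metrics carry namespace and pod labels from service discovery but no node label, so node:node_num_cpu:sum attaches one via a group_left(node) join against that mapping. Written out as a rule file, with the expressions copied verbatim from the entries above and only the group name illustrative:

groups:
  - name: node-join-example
    rules:
      # Deduplicated pod -> node mapping; topk(1, ...) keeps a single row per
      # (namespace, pod) even if kube_pod_info briefly reports duplicates.
      - record: node_namespace_pod:kube_pod_info:
        expr: |
          topk by(namespace, pod) (1,
            max by(node, namespace, pod) (
              label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
            )
          )
      # CPUs per node: the join copies the node label from the mapping above onto the
      # node-exporter series before counting distinct (node, cpu) pairs.
      - record: node:node_num_cpu:sum
        expr: |
          count by(cluster, node) (
            sum by(node, cpu) (
              node_cpu_seconds_total{job="node-exporter"}
              * on(namespace, pod) group_left(node)
              node_namespace_pod:kube_pod_info:
            )
          )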

prometheus-operator

18.871s ago

873.8us

Rule State Error Last Evaluation Evaluation Time
alert: PrometheusOperatorListErrors expr: (sum by(controller, namespace) (rate(prometheus_operator_list_operations_failed_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_list_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m]))) > 0.4 for: 15m labels: severity: warning annotations: description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorlisterrors summary: Errors while performing list operations in controller. ok 18.871s ago 430.5us
alert: PrometheusOperatorWatchErrors expr: (sum by(controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m])) / sum by(controller, namespace) (rate(prometheus_operator_watch_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m]))) > 0.4 for: 15m labels: severity: warning annotations: description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorwatcherrors summary: Errors while performing watch operations in controller. ok 18.87s ago 220.4us
alert: PrometheusOperatorReconcileErrors expr: (sum by(controller, namespace) (rate(prometheus_operator_reconcile_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]))) / (sum by(controller, namespace) (rate(prometheus_operator_reconcile_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning annotations: description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorreconcileerrors summary: Errors while reconciling controller. ok 18.87s ago 144.8us
alert: PrometheusOperatorNodeLookupErrors expr: rate(prometheus_operator_node_address_lookup_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning annotations: description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornodelookuperrors summary: Errors while reconciling Prometheus. ok 18.87s ago 64.52us
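
The groups on this page reach Prometheus because the operator watches PrometheusRule objects and renders them into rule files, and the four alerts above cover exactly that list/watch/reconcile machinery. A hedged sketch of such an object, assuming a kube-prometheus-stack install whose ruleSelector matches a release label; the actual label the chart selects on depends on the Helm values, and everything below is an example rather than part of the rules listed here.

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: example-custom-rules
  namespace: monitoring
  labels:
    release: kube-prometheus-stack   # example selector label
spec:
  groups:
    - name: example.rules
      rules:
        - alert: ExampleRuleLoaded
          # Deliberately trivial expression: if this alert shows up on the Rules page,
          # the operator has picked up the object and reloaded Prometheus.
          expr: vector(1)
          labels:
            severity: none
          annotations:
            summary: Example rule to verify that the operator loads this object.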

prometheus

20.559s ago

2.291ms

Rule State Error Last Evaluation Evaluation Time
alert: PrometheusBadConfig expr: max_over_time(prometheus_config_last_reload_successful{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) == 0 for: 10m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. summary: Failed Prometheus configuration reload. ok 20.559s ago 230.5us
alert: PrometheusNotificationQueueRunningFull expr: (predict_linear(prometheus_notifications_queue_length{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])) for: 15m labels: severity: warning annotations: description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. summary: Prometheus alert notification queue predicted to run full in less than 30m. ok 20.559s ago 270.7us
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers expr: (rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])) * 100 > 1 for: 15m labels: severity: warning annotations: description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. ok 20.559s ago 162.1us
alert: PrometheusErrorSendingAlertsToAnyAlertmanager expr: min without(alertmanager) (rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])) * 100 > 3 for: 15m labels: severity: critical annotations: description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. ok 20.559s ago 147.4us
alert: PrometheusNotConnectedToAlertmanagers expr: max_over_time(prometheus_notifications_alertmanagers_discovered{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) < 1 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. summary: Prometheus is not connected to any Alertmanagers. ok 20.559s ago 86.92us
alert: PrometheusTSDBReloadsFailing expr: increase(prometheus_tsdb_reloads_failures_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. summary: Prometheus has issues reloading blocks from disk. ok 20.559s ago 194.1us
alert: PrometheusTSDBCompactionsFailing expr: increase(prometheus_tsdb_compactions_failed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. summary: Prometheus has issues compacting blocks. ok 20.559s ago 174.3us
alert: PrometheusNotIngestingSamples expr: rate(prometheus_tsdb_head_samples_appended_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) <= 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. summary: Prometheus is not ingesting samples. ok 20.559s ago 95.56us
alert: PrometheusDuplicateTimestamps expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. summary: Prometheus is dropping samples with duplicate timestamps. ok 20.559s ago 75.05us
alert: PrometheusOutOfOrderTimestamps expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. summary: Prometheus drops samples with out-of-order timestamps. ok 20.559s ago 60.19us
alert: PrometheusRemoteStorageFailures expr: (rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) / (rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) + rate(prometheus_remote_storage_succeeded_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))) * 100 > 1 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}. summary: Prometheus fails to send samples to remote storage. ok 20.559s ago 173.6us
alert: PrometheusRemoteWriteBehind expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) - on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])) > 120 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. summary: Prometheus remote write is behind. ok 20.559s ago 127.9us
alert: PrometheusRemoteWriteDesiredShards expr: (max_over_time(prometheus_remote_storage_shards_desired{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > max_over_time(prometheus_remote_storage_shards_max{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])) for: 15m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="kube-prometheus-stack-prometheus",namespace="monitoring"}` $labels.instance | query | first | value }}. summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. ok 20.559s ago 79.82us
alert: PrometheusRuleFailures expr: increase(prometheus_rule_evaluation_failures_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: critical annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. summary: Prometheus is failing rule evaluations. ok 20.559s ago 317.6us
alert: PrometheusMissingRuleEvaluations expr: increase(prometheus_rule_group_iterations_missed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: warning annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. summary: Prometheus is missing rule evaluations due to slow rule group evaluation. ok 20.558s ago 73.05us
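
The two alert-delivery rules above are a good example of choosing the aggregation to match the blast radius: the per-Alertmanager ratio warns when any single endpoint rejects more than 1% of notifications, while min without(alertmanager) keeps only the lowest per-endpoint error ratio, so the critical rule fires only when even the best-performing Alertmanager is above 3% and delivery as a whole is at risk. A simplified sketch of the pair; the job/namespace selectors of the originals are dropped and the alert names are illustrative.

groups:
  - name: alert-delivery-example
    rules:
      - alert: ExampleAlertDeliveryDegraded
        # Error ratio per Alertmanager endpoint; any single endpoint above 1% warns.
        expr: |
          (rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])) * 100 > 1
        for: 15m
        labels:
          severity: warning
      - alert: ExampleAlertDeliveryBroken
        # min without(alertmanager) drops the alertmanager label and keeps the lowest
        # ratio, so this only exceeds 3% when every configured Alertmanager is failing.
        expr: |
          min without(alertmanager) (
            rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          ) * 100 > 3
        for: 15m
        labels:
          severity: critical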