Alerts


/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-alertmanager.rules.yaml > alertmanager.rules
AlertmanagerConfigInconsistent (0 active)
alert: AlertmanagerConfigInconsistent
expr: count
  by(namespace, service) (count_values by(namespace, service) ("config_hash",
  alertmanager_config_hash{job="kube-prometheus-stack-alertmanager",namespace="monitoring"}))
  != 1
for: 5m
labels:
  severity: critical
annotations:
  message: |
    The configurations of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
    {{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
    Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
    {{ end }}
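
To see which pod disagrees before this fires, the inner aggregation from the expression can be run on its own; each resulting series carries the config_hash of one group of pods (a sketch reusing the rule's own selector):

  count_values by(namespace, service) ("config_hash",
    alertmanager_config_hash{job="kube-prometheus-stack-alertmanager",namespace="monitoring"})

A healthy cluster returns exactly one series per namespace/service pair.
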
AlertmanagerFailedReload (0 active)
alert: AlertmanagerFailedReload
expr: alertmanager_config_last_reload_successful{job="kube-prometheus-stack-alertmanager",namespace="monitoring"}
  == 0
for: 10m
labels:
  severity: warning
annotations:
  message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
    }}/{{ $labels.pod}}.
AlertmanagerMembersInconsistent (0 active)
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-etcd.yaml > etcd
etcdGRPCRequestsSlow (0 active)
alert: etcdGRPCRequestsSlow
expr: histogram_quantile(0.99,
  sum by(job, instance, grpc_service, grpc_method, le) (rate(grpc_server_handling_seconds_bucket{grpc_type="unary",job=~".*etcd.*"}[5m])))
  > 0.15
for: 10m
labels:
  severity: critical
annotations:
  message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
    }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
etcdHTTPRequestsSlow (0 active)
alert: etcdHTTPRequestsSlow
expr: histogram_quantile(0.99,
  rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
for: 10m
labels:
  severity: warning
annotations:
  message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
    }} are slow.
etcdHighCommitDurations (0 active)
alert: etcdHighCommitDurations
expr: histogram_quantile(0.99,
  rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
  > 0.25
for: 10m
labels:
  severity: warning
annotations:
  message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
    {{ $value }}s on etcd instance {{ $labels.instance }}.'
etcdHighFsyncDurations (0 active)
alert: etcdHighFsyncDurations
expr: histogram_quantile(0.99,
  rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
  > 0.5
for: 10m
labels:
  severity: warning
annotations:
  message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
    are {{ $value }}s on etcd instance {{ $labels.instance }}.'
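
The same histogram_quantile pattern these etcd alerts use can be graphed per member to watch latency trends before a threshold is crossed, for example the p99 WAL fsync duration (a sketch following the rule's metric and selector):

  histogram_quantile(0.99, sum by(instance, le)
    (rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])))
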
etcdHighNumberOfFailedGRPCRequests (0 active)
alert: etcdHighNumberOfFailedGRPCRequests
expr: 100
  * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m]))
  / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))
  > 5
for: 5m
labels:
  severity: critical
annotations:
  message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
    {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
etcdHighNumberOfFailedGRPCRequests (0 active)
alert: etcdHighNumberOfFailedGRPCRequests
expr: 100
  * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m]))
  / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))
  > 1
for: 10m
labels:
  severity: warning
annotations:
  message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
    {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
etcdHighNumberOfFailedHTTPRequests (0 active)
alert: etcdHighNumberOfFailedHTTPRequests
expr: sum
  by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m]))
  / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) >
  0.05
for: 10m
labels:
  severity: critical
annotations:
  message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
    {{ $labels.instance }}.'
etcdHighNumberOfFailedHTTPRequests (0 active)
alert: etcdHighNumberOfFailedHTTPRequests
expr: sum
  by(method) (rate(etcd_http_failed_total{code!="404",job=~".*etcd.*"}[5m]))
  / sum by(method) (rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) >
  0.01
for: 10m
labels:
  severity: warning
annotations:
  message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance
    {{ $labels.instance }}'
etcdHighNumberOfFailedProposals (0 active)
alert: etcdHighNumberOfFailedProposals
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m])
  > 5
for: 15m
labels:
  severity: warning
annotations:
  message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
    within the last 15 minutes on etcd instance {{ $labels.instance }}.'
etcdHighNumberOfLeaderChanges (0 active)
alert: etcdHighNumberOfLeaderChanges
expr: increase((max
  by(job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0 *
  absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
  >= 3
for: 5m
labels:
  severity: warning
annotations:
  message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
    within the last 15 minutes. Frequent elections may be a sign of insufficient resources,
    high network latency, or disruptions by other components and should be investigated.'
etcdInsufficientMembers (0 active)
alert: etcdInsufficientMembers
expr: sum
  by(job) (up{job=~".*etcd.*"} == bool 1) < ((count by(job) (up{job=~".*etcd.*"})
  + 1) / 2)
for: 3m
labels:
  severity: critical
annotations:
  message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
    }}).'
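
The right-hand side of the expression is a quorum check: with N members scraped, (N + 1) / 2 is the minimum number that must be up. For a three-member cluster, for example, (3 + 1) / 2 = 2, so the alert fires once fewer than two members report up for 3 minutes, i.e. quorum is lost.
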
etcdMemberCommunicationSlow (0 active)
alert: etcdMemberCommunicationSlow
expr: histogram_quantile(0.99,
  rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
  > 0.15
for: 10m
labels:
  severity: warning
annotations:
  message: 'etcd cluster "{{ $labels.job }}": member communication with {{
    $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
etcdMembersDown (0 active)
alert: etcdMembersDown
expr: max
  by(job) (sum by(job) (up{job=~".*etcd.*"} == bool 0) or count by(job, endpoint)
  (sum by(job, endpoint, To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m]))
  > 0.01)) > 0
for: 3m
labels:
  severity: critical
annotations:
  message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
    }}).'
etcdNoLeader (0 active)
alert: etcdNoLeader
expr: etcd_server_has_leader{job=~".*etcd.*"}
  == 0
for: 1m
labels:
  severity: critical
annotations:
  message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
    has no leader.'
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-general.rules.yaml > general.rules
TargetDown (6 active)
alert: TargetDown
expr: 100
  * (count by(job, namespace, service) (up == 0) / count by(job, namespace, service)
  (up)) > 10
for: 10m
labels:
  severity: warning
annotations:
  message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
    }} targets in {{ $labels.namespace }} namespace are down.'
Labels State Active Since Value
alertname="TargetDown" job="kube-proxy" namespace="kube-system" service="kube-prometheus-stack-kube-proxy" severity="warning" firing 2025-01-08 01:00:52.549584131 +0000 UTC 100
alertname="TargetDown" job="kube-controller-manager" namespace="kube-system" service="kube-prometheus-stack-kube-controller-manager" severity="warning" firing 2025-01-08 01:00:52.549584131 +0000 UTC 66.66666666666666
alertname="TargetDown" job="kubelet" namespace="kube-system" service="kube-prometheus-stack-kubelet" severity="warning" firing 2025-01-08 01:00:52.549584131 +0000 UTC 18.181818181818183
alertname="TargetDown" job="pmc-partner-api" namespace="pmc-production" service="pmc-partner-api" severity="warning" firing 2025-01-08 01:00:52.549584131 +0000 UTC 100
alertname="TargetDown" job="node-exporter" namespace="monitoring" service="kube-prometheus-stack-prometheus-node-exporter" severity="warning" firing 2025-01-08 01:00:52.549584131 +0000 UTC 18.181818181818183
alertname="TargetDown" job="kube-scheduler" namespace="kube-system" service="kube-prometheus-stack-kube-scheduler" severity="warning" firing 2025-01-08 01:00:52.549584131 +0000 UTC 66.66666666666666
Watchdog (1 active)
alert: Watchdog
expr: vector(1)
labels:
  severity: none
annotations:
  message: |
    This is an alert meant to ensure that the entire alerting pipeline is functional.
    This alert is always firing, therefore it should always be firing in Alertmanager
    and always fire against a receiver. There are integrations with various notification
    mechanisms that send a notification when this alert is not firing. For example the
    "DeadMansSnitch" integration in PagerDuty.
Labels State Active Since Value
alertname="Watchdog" severity="none" firing 2025-01-08 01:00:22.549584131 +0000 UTC 1
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-kube-apiserver-slos.yaml > kube-apiserver-slos
KubeAPIErrorBudgetBurn (1 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate3d)
  > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01)
for: 3h
labels:
  long: 3d
  severity: warning
  short: 6h
annotations:
  description: The API server is burning too much error budget.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
  summary: The API server is burning too much error budget.
Labels State Active Since Value
alertname="KubeAPIErrorBudgetBurn" long="3d" severity="warning" short="6h" firing 2025-01-08 19:50:11.563519891 +0000 UTC 0.012205295777970604
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate1h)
  > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01)
for: 2m
labels:
  long: 1h
  severity: critical
  short: 5m
annotations:
  description: The API server is burning too much error budget.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
  summary: The API server is burning too much error budget.
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate6h)
  > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01)
for: 15m
labels:
  long: 6h
  severity: critical
  short: 30m
annotations:
  description: The API server is burning too much error budget.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
  summary: The API server is burning too much error budget.
KubeAPIErrorBudgetBurn (0 active)
alert: KubeAPIErrorBudgetBurn
expr: sum(apiserver_request:burnrate1d)
  > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01)
for: 1h
labels:
  long: 1d
  severity: warning
  short: 2h
annotations:
  description: The API server is burning too much error budget.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
  summary: The API server is burning too much error budget.
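
Together these four rules form the standard multi-window, multi-burn-rate check. The factor in each expression (14.4, 6, 3, 1) is a burn rate against the 1% error budget implied by the `* 0.01` (a 99% availability SLO); assuming the usual 30-day (720 h) SLO window, each pair corresponds to a fixed slice of budget:

  14.4x over 1h  ->  14.4 * 1  / 720 =  2% of the budget
   6.0x over 6h  ->   6   * 6  / 720 =  5% of the budget
   3.0x over 1d  ->   3   * 24 / 720 = 10% of the budget
   1.0x over 3d  ->   1   * 72 / 720 = 10% of the budget

The instance firing above (value ~0.0122 against a 0.01 threshold) is the slowest pair, the 3d/6h window at a 1x burn rate, hence only a warning.
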
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-kube-state-metrics.yaml > kube-state-metrics
KubeStateMetricsListErrors (0 active)
alert: KubeStateMetricsListErrors
expr: (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
  / sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
  > 0.01
for: 15m
labels:
  severity: critical
annotations:
  description: kube-state-metrics is experiencing errors at an elevated rate in list
    operations. This likely prevents it from exposing metrics about Kubernetes
    objects correctly or at all.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors
  summary: kube-state-metrics is experiencing errors in list operations.
KubeStateMetricsWatchErrors (0 active)
alert: KubeStateMetricsWatchErrors
expr: (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
  / sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
  > 0.01
for: 15m
labels:
  severity: critical
annotations:
  description: kube-state-metrics is experiencing errors at an elevated rate in watch
    operations. This likely prevents it from exposing metrics about Kubernetes
    objects correctly or at all.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors
  summary: kube-state-metrics is experiencing errors in watch operations.
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-kubernetes-apps.yaml > kubernetes-apps
KubeContainerWaiting (44 active)
alert: KubeContainerWaiting
expr: sum
  by(namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",namespace=~".*"})
  > 0
for: 1h
labels:
  severity: warning
annotations:
  description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
    has been in waiting state for longer than 1 hour.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
  summary: Pod container waiting longer than 1 hour
Labels State Active Since Value
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1696485600-86sht" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1699855200-78td2" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1699336800-gl9qz" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1699596000-s4kbq" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1734674400-prv85" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1699941600-frmpz" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="cron-reference" namespace="pmc-production" pod="pmc-statistic-cron-reference-1711929600-hvj47" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1698818400-smdrj" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1699509600-fl7kf" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1697436000-wq782" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1696312800-nllf2" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1695880800-vxwh2" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1698127200-gx96s" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1700114400-76j75" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1696917600-25d7s" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1695967200-szbf9" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1733724000-cvkkw" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-remove-old-files" namespace="pmc-production" pod="pmc-bank-reports-cron-remove-old-files-1696150800-ccg9x" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1700028000-569s6" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1696572000-rpjp8" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1696831200-bkdpm" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="cron-reference" namespace="pmc-production" pod="pmc-statistic-cron-reference-1709251200-j8wc5" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1698300000-dxspz" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1735020000-9j7f6" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1697090400-ws452" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1700460000-svtcn" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="check-received-ibmmq-consumer" namespace="pmc-production" pod="pmc-bank-reports-759d9c5c76-zpgmn" severity="warning" pending 2025-01-09 05:01:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1698213600-bndv5" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1696226400-k2ww7" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1698040800-wc2sg" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1698645600-6xtht" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1699423200-bsn46" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1700546400-c8b8q" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1697004000-7jmvl" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1697781600-vzmpr" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-remove-old-files" namespace="pmc-production" pod="pmc-bank-reports-cron-remove-old-files-1698829200-tndzz" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1695794400-kzqhk" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1700200800-gnrn4" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1697176800-lrdx4" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1698991200-lj5dk" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1698386400-f76zw" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1699250400-j9t6l" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1698904800-9dv89" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeContainerWaiting" container="pmc-bank-reports-cron-check-mail-box" namespace="pmc-production" pod="pmc-bank-reports-cron-check-mail-box-1697608800-rjcsj" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
KubeDaemonSetMisScheduled (1 active)
alert: KubeDaemonSetMisScheduled
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace=~".*"}
  > 0
for: 15m
labels:
  severity: warning
annotations:
  description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
    }} are running where they are not supposed to run.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
  summary: DaemonSet pods are misscheduled.
Labels State Active Since Value
alertname="KubeDaemonSetMisScheduled" daemonset="fluent-bit" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" namespace="logging" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 2
KubeDaemonSetRolloutStuck (5 active)
alert: KubeDaemonSetRolloutStuck
expr: ((kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace=~".*"}
  != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"})
  or (kube_daemonset_status_number_misscheduled{job="kube-state-metrics",namespace=~".*"}
  != 0) or (kube_daemonset_updated_number_scheduled{job="kube-state-metrics",namespace=~".*"}
  != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"})
  or (kube_daemonset_status_number_available{job="kube-state-metrics",namespace=~".*"}
  != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"}))
  and (changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics",namespace=~".*"}[5m])
  == 0)
for: 15m
labels:
  severity: warning
annotations:
  description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished
    or progressed for at least 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
  summary: DaemonSet rollout is stuck.
Labels State Active Since Value
alertname="KubeDaemonSetRolloutStuck" daemonset="fluent-bit" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" namespace="logging" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 2
alertname="KubeDaemonSetRolloutStuck" daemonset="calico-node" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" namespace="kube-system" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 8
alertname="KubeDaemonSetRolloutStuck" daemonset="kube-proxy" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" namespace="kube-system" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 9
alertname="KubeDaemonSetRolloutStuck" daemonset="nodelocaldns" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" namespace="kube-system" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 9
alertname="KubeDaemonSetRolloutStuck" daemonset="kube-prometheus-stack-prometheus-node-exporter" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" namespace="monitoring" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 9
KubeDeploymentReplicasMismatch (1 active)
alert: KubeDeploymentReplicasMismatch
expr: (kube_deployment_spec_replicas{job="kube-state-metrics",namespace=~".*"}
  != kube_deployment_status_replicas_available{job="kube-state-metrics",namespace=~".*"})
  and (changes(kube_deployment_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m])
  == 0)
for: 15m
labels:
  severity: warning
annotations:
  description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
    matched the expected number of replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
  summary: Deployment has not matched the expected number of replicas.
Labels State Active Since Value
alertname="KubeDeploymentReplicasMismatch" deployment="pmc-bank-reports" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
KubeJobCompletion (48 active)
alert: KubeJobCompletion
expr: kube_job_spec_completions{job="kube-state-metrics",namespace=~".*"}
  - kube_job_status_succeeded{job="kube-state-metrics",namespace=~".*"}
  > 0
for: 12h
labels:
  severity: warning
annotations:
  description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
    12 hours to complete.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
  summary: Job did not complete in time
Labels State Active Since Value
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1700028000" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1698991200" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1698645600" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1700114400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1699941600" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1697176800" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-statistic-cron-reference-1709251200" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1698300000" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1699423200" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1695880800" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1699509600" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1697781600" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-statistic-cron-reference-1711929600" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1697004000" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1698213600" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1697436000" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1699596000" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1700460000" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1696917600" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-billing-cron-report-generator-service-1683291900" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-billing-cron-report-service-1710893100" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1700200800" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-number-pool-migrated-full-1736396400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" pending 2025-01-09 04:20:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1698127200" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1696226400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1735020000" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1699250400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-remove-old-files-1698829200" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1700546400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1697608800" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1696831200" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1698818400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1696312800" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1697090400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1698386400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1695794400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-remove-old-files-1696150800" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1696572000" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1698040800" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1734674400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-number-pool-migrated-changes-1690932000" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1699336800" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1733724000" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1695967200" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1699855200" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1696485600" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-bank-reports-cron-check-mail-box-1698904800" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobCompletion" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-number-pool-base-full-1736396400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" pending 2025-01-09 04:20:57.339694135 +0000 UTC 1
KubeJobFailed (5 active)
alert: KubeJobFailed
expr: kube_job_failed{job="kube-state-metrics",namespace=~".*"}
  > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
  summary: Job failed to complete.
Labels State Active Since Value
alertname="KubeJobFailed" condition="true" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-number-pool-migrated-changes-1690932000" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobFailed" condition="true" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-number-pool-base-full-1736396400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-09 04:26:27.339694135 +0000 UTC 1
alertname="KubeJobFailed" condition="true" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-billing-cron-report-generator-service-1683291900" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
alertname="KubeJobFailed" condition="true" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-number-pool-migrated-full-1736396400" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-09 04:26:57.339694135 +0000 UTC 1
alertname="KubeJobFailed" condition="true" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" job_name="pmc-billing-cron-report-service-1710893100" namespace="pmc-production" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:57.339694135 +0000 UTC 1
KubePodCrashLooping (1 active)
alert: KubePodCrashLooping
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics",namespace=~".*"}[5m])
  * 60 * 5 > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
    }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
  summary: Pod is crash looping.
Labels State Active Since Value
alertname="KubePodCrashLooping" container="check-received-ibmmq-consumer" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" namespace="pmc-production" pod="pmc-bank-reports-759d9c5c76-zpgmn" service="kube-prometheus-stack-kube-state-metrics" severity="warning" pending 2025-01-09 05:11:57.339694135 +0000 UTC 1.111102880719402
KubeDaemonSetNotScheduled (0 active)
alert: KubeDaemonSetNotScheduled
expr: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics",namespace=~".*"}
  - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics",namespace=~".*"}
  > 0
for: 10m
labels:
  severity: warning
annotations:
  description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
    }} are not scheduled.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
  summary: DaemonSet pods are not scheduled.
KubeDeploymentGenerationMismatch (0 active)
alert: KubeDeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation{job="kube-state-metrics",namespace=~".*"}
  != kube_deployment_metadata_generation{job="kube-state-metrics",namespace=~".*"}
for: 15m
labels:
  severity: warning
annotations:
  description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
    }} does not match; this indicates that the Deployment has failed but has not been
    rolled back.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
  summary: Deployment generation mismatch due to possible roll-back
KubeHpaMaxedOut (0 active)
alert: KubeHpaMaxedOut
expr: kube_hpa_status_current_replicas{job="kube-state-metrics",namespace=~".*"}
  == kube_hpa_spec_max_replicas{job="kube-state-metrics",namespace=~".*"}
for: 15m
labels:
  severity: warning
annotations:
  description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max
    replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
  summary: HPA is running at max replicas
KubeHpaReplicasMismatch (0 active)
alert: KubeHpaReplicasMismatch
expr: (kube_hpa_status_desired_replicas{job="kube-state-metrics",namespace=~".*"}
  != kube_hpa_status_current_replicas{job="kube-state-metrics",namespace=~".*"})
  and changes(kube_hpa_status_current_replicas[15m]) == 0
for: 15m
labels:
  severity: warning
annotations:
  description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired
    number of replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
  summary: HPA has not matched the desired number of replicas.
KubePodNotReady (0 active)
alert: KubePodNotReady
expr: sum
  by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",namespace=~".*",phase=~"Pending|Unknown"})
  * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace,
  pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}))) > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
    state for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
  summary: Pod has been in a non-ready state for more than 15 minutes.
KubeStatefulSetGenerationMismatch (0 active)
alert: KubeStatefulSetGenerationMismatch
expr: kube_statefulset_status_observed_generation{job="kube-state-metrics",namespace=~".*"}
  != kube_statefulset_metadata_generation{job="kube-state-metrics",namespace=~".*"}
for: 15m
labels:
  severity: warning
annotations:
  description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
    }} does not match; this indicates that the StatefulSet has failed but has not
    been rolled back.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
  summary: StatefulSet generation mismatch due to possible roll-back
KubeStatefulSetReplicasMismatch (0 active)
alert: KubeStatefulSetReplicasMismatch
expr: (kube_statefulset_status_replicas_ready{job="kube-state-metrics",namespace=~".*"}
  != kube_statefulset_status_replicas{job="kube-state-metrics",namespace=~".*"})
  and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m])
  == 0)
for: 15m
labels:
  severity: warning
annotations:
  description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not
    matched the expected number of replicas for longer than 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
  summary: StatefulSet has not matched the expected number of replicas.
KubeStatefulSetUpdateNotRolledOut (0 active)
alert: KubeStatefulSetUpdateNotRolledOut
expr: (max
  without(revision) (kube_statefulset_status_current_revision{job="kube-state-metrics",namespace=~".*"}
  unless kube_statefulset_status_update_revision{job="kube-state-metrics",namespace=~".*"})
  * (kube_statefulset_replicas{job="kube-state-metrics",namespace=~".*"}
  != kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}))
  and (changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace=~".*"}[5m])
  == 0)
for: 15m
labels:
  severity: warning
annotations:
  description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
    has not been rolled out.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
  summary: StatefulSet update has not been rolled out.
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-kubernetes-resources.yaml > kubernetes-resources
CPUThrottlingHigh (1 active)
alert: CPUThrottlingHigh
expr: sum
  by(container, pod, namespace) (increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m]))
  / sum by(container, pod, namespace) (increase(container_cpu_cfs_periods_total[5m]))
  > (25 / 100)
for: 15m
labels:
  severity: info
annotations:
  description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{
    $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
    }}.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
  summary: Processes experience elevated CPU throttling.
Labels State Active Since Value
alertname="CPUThrottlingHigh" container="calico-node" namespace="kube-system" pod="calico-node-prj5v" severity="info" firing 2025-01-08 01:01:29.587197825 +0000 UTC 0.9560685033507075
KubeCPUOvercommit (0 active)
alert: KubeCPUOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
  / sum(kube_node_status_allocatable_cpu_cores) > (count(kube_node_status_allocatable_cpu_cores)
  - 1) / count(kube_node_status_allocatable_cpu_cores)
for: 5m
labels:
  severity: warning
annotations:
  description: Cluster has overcommitted CPU resource requests for Pods and cannot
    tolerate node failure.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
  summary: Cluster has overcommitted CPU resource requests.
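The right-hand side of the expression is (N - 1) / N for N nodes, i.e. the requested CPU must still fit if one node is lost (assuming roughly equally sized nodes); with five nodes the threshold is 4/5 = 0.8. Both sides can be evaluated separately, as an ad-hoc sketch reusing the recording rule and metrics above (run each expression on its own):
# requested CPU as a fraction of total allocatable CPU
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
  / sum(kube_node_status_allocatable_cpu_cores)
# capacity fraction that remains after losing one node
(count(kube_node_status_allocatable_cpu_cores) - 1) / count(kube_node_status_allocatable_cpu_cores)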
KubeCPUQuotaOvercommit (0 active)
alert: KubeCPUQuotaOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="cpu",type="hard"})
  / sum(kube_node_status_allocatable_cpu_cores) > 1.5
for: 5m
labels:
  severity: warning
annotations:
  description: Cluster has overcommitted CPU resource requests for Namespaces.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
  summary: Cluster has overcommitted CPU resource requests.
KubeMemoryOvercommit (0 active)
alert: KubeMemoryOvercommit
expr: sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)
  / sum(kube_node_status_allocatable_memory_bytes) > (count(kube_node_status_allocatable_memory_bytes)
  - 1) / count(kube_node_status_allocatable_memory_bytes)
for: 5m
labels:
  severity: warning
annotations:
  description: Cluster has overcommitted memory resource requests for Pods and cannot
    tolerate node failure.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
  summary: Cluster has overcommitted memory resource requests.
KubeMemoryQuotaOvercommit (0 active)
alert: KubeMemoryQuotaOvercommit
expr: sum(kube_resourcequota{job="kube-state-metrics",resource="memory",type="hard"})
  / sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"}) >
  1.5
for: 5m
labels:
  severity: warning
annotations:
  description: Cluster has overcommitted memory resource requests for Namespaces.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
  summary: Cluster has overcommitted memory resource requests.
KubeQuotaAlmostFull (0 active)
alert: KubeQuotaAlmostFull
expr: kube_resourcequota{job="kube-state-metrics",type="used"}
  / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"}
  > 0) > 0.9 < 1
for: 15m
labels:
  severity: info
annotations:
  description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
    }} of its {{ $labels.resource }} quota.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull
  summary: Namespace quota is going to be full.
KubeQuotaExceeded (0 active)
alert: KubeQuotaExceeded
expr: kube_resourcequota{job="kube-state-metrics",type="used"}
  / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"}
  > 0) > 1
for: 15m
labels:
  severity: warning
annotations:
  description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
    }} of its {{ $labels.resource }} quota.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
  summary: Namespace quota has exceeded the limits.
KubeQuotaFullyUsed (0 active)
alert: KubeQuotaFullyUsed
expr: kube_resourcequota{job="kube-state-metrics",type="used"}
  / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard"}
  > 0) == 1
for: 15m
labels:
  severity: info
annotations:
  description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
    }} of its {{ $labels.resource }} quota.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
  summary: Namespace quota is fully used.
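All three quota alerts share the same used/hard ratio and differ only in the threshold band (> 0.9 and < 1, > 1, == 1). The ratio itself can be listed per namespace and resource with an ad-hoc sketch taken from the expressions above:
# current quota utilisation per namespace and resource
kube_resourcequota{job="kube-state-metrics",type="used"}
  / ignoring(instance, job, type)
    (kube_resourcequota{job="kube-state-metrics",type="hard"} > 0)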
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-kubernetes-storage.yaml > kubernetes-storage
KubePersistentVolumeErrors (0 active)
alert: KubePersistentVolumeErrors
expr: kube_persistentvolume_status_phase{job="kube-state-metrics",phase=~"Failed|Pending"}
  > 0
for: 5m
labels:
  severity: critical
annotations:
  description: The persistent volume {{ $labels.persistentvolume }} has status {{
    $labels.phase }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
  summary: PersistentVolume is having issues with provisioning.
KubePersistentVolumeFillingUp (0 active)
alert: KubePersistentVolumeFillingUp
expr: kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"}
  / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"}
  < 0.03
for: 1m
labels:
  severity: critical
annotations:
  description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }}
    in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }}
    free.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
  summary: PersistentVolume is filling up.
KubePersistentVolumeFillingUp (0 active)
alert: KubePersistentVolumeFillingUp
expr: (kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"}
  / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"})
  < 0.15 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics",namespace=~".*"}[6h],
  4 * 24 * 3600) < 0
for: 1h
labels:
  severity: warning
annotations:
  description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
    }} in Namespace {{ $labels.namespace }} is expected to fill up within four days.
    Currently {{ $value | humanizePercentage }} is available.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
  summary: PersistentVolume is filling up.
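The second rule combines a static 15% threshold with a linear extrapolation: predict_linear fits the last 6h of available bytes and projects 4 days (4 * 24 * 3600 s) ahead, firing only if the projection goes negative. The projection can be run on its own as an ad-hoc sketch:
# projected available bytes per claim four days from now;
# negative values mean the volume is expected to fill up within that window
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics"}[6h], 4 * 24 * 3600)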
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-kubernetes-system-apiserver.yaml > kubernetes-system-apiserver
AggregatedAPIDown (0 active)
alert: AggregatedAPIDown
expr: (1
  - max by(name, namespace) (avg_over_time(aggregator_unavailable_apiservice[10m])))
  * 100 < 85
for: 5m
labels:
  severity: warning
annotations:
  description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been
    only {{ $value | humanize }}% available over the last 10m.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
  summary: An aggregated API is down.
AggregatedAPIErrors (0 active)
alert: AggregatedAPIErrors
expr: sum
  by(name, namespace) (increase(aggregator_unavailable_apiservice_count[5m])) >
  2
labels:
  severity: warning
annotations:
  description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported
    errors. The number of errors for it has increased in the past five minutes. High
    values indicate that the availability of the service changes too often.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
  summary: An aggregated API has reported errors.
KubeAPIDown (0 active)
alert: KubeAPIDown
expr: absent(up{job="apiserver"}
  == 1)
for: 15m
labels:
  severity: critical
annotations:
  description: KubeAPI has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
  summary: Target disappeared from Prometheus target discovery.
KubeClientCertificateExpiration (0 active)
alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"}
  > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
  < 604800
labels:
  severity: warning
annotations:
  description: A client certificate used to authenticate to the apiserver is expiring
    in less than 7.0 days.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
  summary: Client certificate is about to expire.
KubeClientCertificateExpiration (0 active)
alert: KubeClientCertificateExpiration
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"}
  > 0 and on(job) histogram_quantile(0.01, sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))
  < 86400
labels:
  severity: critical
annotations:
  description: A client certificate used to authenticate to the apiserver is expiring
    in less than 24.0 hours.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
  summary: Client certificate is about to expire.
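The two rules differ only in the cut-off: 604800 s is 7 days, 86400 s is 24 hours. The remaining lifetime at the 1st percentile can be queried directly, as an ad-hoc sketch using the same histogram:
# 1st-percentile remaining client certificate lifetime, in seconds
histogram_quantile(0.01,
  sum by(job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])))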
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-kubernetes-system-controller-manager.yaml > kubernetes-system-controller-manager
KubeControllerManagerDown (0 active)
alert: KubeControllerManagerDown
expr: absent(up{job="kube-controller-manager"}
  == 1)
for: 15m
labels:
  severity: critical
annotations:
  description: KubeControllerManager has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
  summary: Target disappeared from Prometheus target discovery.
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-kubernetes-system-kubelet.yaml > kubernetes-system-kubelet
KubeNodeNotReady (2 active)
alert: KubeNodeNotReady
expr: kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"}
  == 0
for: 15m
labels:
  severity: warning
annotations:
  description: '{{ $labels.node }} has been unready for more than 15 minutes.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
  summary: Node is not ready.
Labels State Active Since Value
alertname="KubeNodeNotReady" condition="Ready" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" namespace="monitoring" node="preprod-master-3.preprod.pmc.vas-stream.ru" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" status="true" firing 2025-01-08 01:00:53.279564141 +0000 UTC 0
alertname="KubeNodeNotReady" condition="Ready" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" namespace="monitoring" node="preprod-master-2.preprod.pmc.vas-stream.ru" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" status="true" firing 2025-01-08 01:00:53.279564141 +0000 UTC 0
KubeNodeUnreachable (2 active)
alert: KubeNodeUnreachable
expr: (kube_node_spec_taint{effect="NoSchedule",job="kube-state-metrics",key="node.kubernetes.io/unreachable"}
  unless ignoring(key, value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"})
  == 1
for: 15m
labels:
  severity: warning
annotations:
  description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
  summary: Node is unreachable.
Labels State Active Since Value
alertname="KubeNodeUnreachable" effect="NoSchedule" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" key="node.kubernetes.io/unreachable" namespace="monitoring" node="preprod-master-2.preprod.pmc.vas-stream.ru" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:53.279564141 +0000 UTC 1
alertname="KubeNodeUnreachable" effect="NoSchedule" endpoint="http" instance="10.233.79.108:8080" job="kube-state-metrics" key="node.kubernetes.io/unreachable" namespace="monitoring" node="preprod-master-3.preprod.pmc.vas-stream.ru" pod="kube-prometheus-stack-kube-state-metrics-857d997b65-wcgkq" service="kube-prometheus-stack-kube-state-metrics" severity="warning" firing 2025-01-08 01:00:53.279564141 +0000 UTC 1
KubeNodeReadinessFlapping (0 active)
alert: KubeNodeReadinessFlapping
expr: sum
  by(node) (changes(kube_node_status_condition{condition="Ready",status="true"}[15m]))
  > 2
for: 15m
labels:
  severity: warning
annotations:
  description: The readiness status of node {{ $labels.node }} has changed {{ $value
    }} times in the last 15 minutes.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
  summary: Node readiness status is flapping.
KubeletClientCertificateExpiration (0 active)
alert: KubeletClientCertificateExpiration
expr: kubelet_certificate_manager_client_ttl_seconds
  < 86400
labels:
  severity: critical
annotations:
  description: Client certificate for Kubelet on node {{ $labels.node }} expires in
    {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
  summary: Kubelet client certificate is about to expire.
KubeletClientCertificateExpiration (0 active)
alert: KubeletClientCertificateExpiration
expr: kubelet_certificate_manager_client_ttl_seconds
  < 604800
labels:
  severity: warning
annotations:
  description: Client certificate for Kubelet on node {{ $labels.node }} expires in
    {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
  summary: Kubelet client certificate is about to expire.
KubeletClientCertificateRenewalErrors (0 active)
alert: KubeletClientCertificateRenewalErrors
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m])
  > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate
    ({{ $value | humanize }} errors in the last 5 minutes).
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors
  summary: Kubelet has failed to renew its client certificate.
KubeletDown (0 active)
alert: KubeletDown
expr: absent(up{job="kubelet",metrics_path="/metrics"}
  == 1)
for: 15m
labels:
  severity: critical
annotations:
  description: Kubelet has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
  summary: Target disappeared from Prometheus target discovery.
KubeletPlegDurationHigh (0 active)
alert: KubeletPlegDurationHigh
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"}
  >= 10
for: 5m
labels:
  severity: warning
annotations:
  description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
    of {{ $value }} seconds on node {{ $labels.node }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
  summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
KubeletPodStartUpLatencyHigh (0 active)
alert: KubeletPodStartUpLatencyHigh
expr: histogram_quantile(0.99,
  sum by(instance, le) (rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet",metrics_path="/metrics"}[5m])))
  * on(instance) group_left(node) kubelet_node_name{job="kubelet",metrics_path="/metrics"}
  > 60
for: 15m
labels:
  severity: warning
annotations:
  description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
    on node {{ $labels.node }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
  summary: Kubelet Pod startup latency is too high.
KubeletServerCertificateExpiration (0 active)
alert: KubeletServerCertificateExpiration
expr: kubelet_certificate_manager_server_ttl_seconds
  < 604800
labels:
  severity: warning
annotations:
  description: Server certificate for Kubelet on node {{ $labels.node }} expires in
    {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
  summary: Kubelet server certificate is about to expire.
KubeletServerCertificateExpiration (0 active)
alert: KubeletServerCertificateExpiration
expr: kubelet_certificate_manager_server_ttl_seconds
  < 86400
labels:
  severity: critical
annotations:
  description: Server certificate for Kubelet on node {{ $labels.node }} expires in
    {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
  summary: Kubelet server certificate is about to expire.
KubeletServerCertificateRenewalErrors (0 active)
alert: KubeletServerCertificateRenewalErrors
expr: increase(kubelet_server_expiration_renew_errors[5m])
  > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate
    ({{ $value | humanize }} errors in the last 5 minutes).
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors
  summary: Kubelet has failed to renew its server certificate.
KubeletTooManyPods (0 active)
alert: KubeletTooManyPods
expr: count
  by(node) ((kube_pod_status_phase{job="kube-state-metrics",phase="Running"}
  == 1) * on(instance, pod, namespace, cluster) group_left(node) topk by(instance,
  pod, namespace, cluster) (1, kube_pod_info{job="kube-state-metrics"})) /
  max by(node) (kube_node_status_capacity_pods{job="kube-state-metrics"} !=
  1) > 0.95
for: 15m
labels:
  severity: warning
annotations:
  description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
    }} of its Pod capacity.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
  summary: Kubelet is running at capacity.
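The expression counts Running pods per node and divides by the node's pod capacity (commonly 110 by default). The two halves can be checked separately, as an ad-hoc sketch with the metrics above (run each expression on its own):
# running pods per node
count by(node) ((kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1)
  * on(instance, pod, namespace, cluster) group_left(node)
    topk by(instance, pod, namespace, cluster) (1, kube_pod_info{job="kube-state-metrics"}))
# pod capacity per node
max by(node) (kube_node_status_capacity_pods{job="kube-state-metrics"})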
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-kubernetes-system-scheduler.yaml > kubernetes-system-scheduler
KubeSchedulerDown (0 active)
alert: KubeSchedulerDown
expr: absent(up{job="kube-scheduler"}
  == 1)
for: 15m
labels:
  severity: critical
annotations:
  description: KubeScheduler has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
  summary: Target disappeared from Prometheus target discovery.
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-kubernetes-system.yaml > kubernetes-system
KubeClientErrors (0 active)
alert: KubeClientErrors
expr: (sum
  by(instance, job) (rate(rest_client_requests_total{code=~"5.."}[5m])) /
  sum by(instance, job) (rate(rest_client_requests_total[5m]))) > 0.01
for: 15m
labels:
  severity: warning
annotations:
  description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
    }}' is experiencing {{ $value | humanizePercentage }} errors.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
  summary: Kubernetes API server client is experiencing errors.
KubeVersionMismatch (0 active)
alert: KubeVersionMismatch
expr: count(count
  by(gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},
  "gitVersion", "$1", "gitVersion", "(v[0-9]*.[0-9]*).*")))
  > 1
for: 15m
labels:
  severity: warning
annotations:
  description: There are {{ $value }} different semantic versions of Kubernetes components
    running.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
  summary: Different semantic versions of Kubernetes components running.
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-node-exporter.yaml > node-exporter
NodeClockNotSynchronising (0 active)
alert: NodeClockNotSynchronising
expr: min_over_time(node_timex_sync_status[5m])
  == 0
for: 10m
labels:
  severity: warning
annotations:
  message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured
    on this host.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising
  summary: Clock not synchronising.
NodeClockSkewDetected (0 active)
alert: NodeClockSkewDetected
expr: (node_timex_offset_seconds
  > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds
  < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 10m
labels:
  severity: warning
annotations:
  message: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure
    NTP is configured correctly on this host.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclockskewdetected
  summary: Clock skew detected.
NodeFilesystemAlmostOutOfFiles (0 active)
alert: NodeFilesystemAlmostOutOfFiles
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"}
  / node_filesystem_files{fstype!="",job="node-exporter"} * 100 <
  3 and node_filesystem_readonly{fstype!="",job="node-exporter"} ==
  0)
for: 1h
labels:
  severity: critical
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available inodes left.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
  summary: Filesystem has less than 3% inodes left.
NodeFilesystemAlmostOutOfFiles (0 active)
alert: NodeFilesystemAlmostOutOfFiles
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"}
  / node_filesystem_files{fstype!="",job="node-exporter"} * 100 <
  5 and node_filesystem_readonly{fstype!="",job="node-exporter"} ==
  0)
for: 1h
labels:
  severity: warning
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available inodes left.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
  summary: Filesystem has less than 5% inodes left.
NodeFilesystemAlmostOutOfSpace (0 active)
alert: NodeFilesystemAlmostOutOfSpace
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"}
  / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100
  < 5 and node_filesystem_readonly{fstype!="",job="node-exporter"}
  == 0)
for: 1h
labels:
  severity: warning
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available space left.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
  summary: Filesystem has less than 5% space left.
NodeFilesystemAlmostOutOfSpace (0 active)
alert: NodeFilesystemAlmostOutOfSpace
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"}
  / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100
  < 3 and node_filesystem_readonly{fstype!="",job="node-exporter"}
  == 0)
for: 1h
labels:
  severity: critical
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available space left.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
  summary: Filesystem has less than 3% space left.
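Current free space per filesystem, as the percentage the two space rules threshold on (5% warning, 3% critical), can be listed with an ad-hoc sketch taken from the expression above:
# percentage of space still available per device and mountpoint
node_filesystem_avail_bytes{fstype!="",job="node-exporter"}
  / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100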
NodeFilesystemFilesFillingUp (0 active)
alert: NodeFilesystemFilesFillingUp
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"}
  / node_filesystem_files{fstype!="",job="node-exporter"} * 100 <
  40 and predict_linear(node_filesystem_files_free{fstype!="",job="node-exporter"}[6h],
  24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"}
  == 0)
for: 1h
labels:
  severity: warning
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available inodes left and is filling up.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
  summary: Filesystem is predicted to run out of inodes within the next 24 hours.
NodeFilesystemFilesFillingUp (0 active)
alert: NodeFilesystemFilesFillingUp
expr: (node_filesystem_files_free{fstype!="",job="node-exporter"}
  / node_filesystem_files{fstype!="",job="node-exporter"} * 100 <
  20 and predict_linear(node_filesystem_files_free{fstype!="",job="node-exporter"}[6h],
  4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"}
  == 0)
for: 1h
labels:
  severity: critical
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
  summary: Filesystem is predicted to run out of inodes within the next 4 hours.
NodeFilesystemSpaceFillingUp (0 active)
alert: NodeFilesystemSpaceFillingUp
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"}
  / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100
  < 15 and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h],
  4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"}
  == 0)
for: 1h
labels:
  severity: critical
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available space left and is filling up fast.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
  summary: Filesystem is predicted to run out of space within the next 4 hours.
NodeFilesystemSpaceFillingUp (0 active)
alert: NodeFilesystemSpaceFillingUp
expr: (node_filesystem_avail_bytes{fstype!="",job="node-exporter"}
  / node_filesystem_size_bytes{fstype!="",job="node-exporter"} * 100
  < 40 and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h],
  24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!="",job="node-exporter"}
  == 0)
for: 1h
labels:
  severity: warning
annotations:
  description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available space left and is filling up.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
  summary: Filesystem is predicted to run out of space within the next 24 hours.
NodeHighNumberConntrackEntriesUsed (0 active)
alert: NodeHighNumberConntrackEntriesUsed
expr: (node_nf_conntrack_entries
  / node_nf_conntrack_entries_limit) > 0.75
labels:
  severity: warning
annotations:
  description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
  summary: Number of conntrack entries is getting close to the limit.
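The raw usage ratio behind the 0.75 threshold can be graphed per node with an ad-hoc sketch; sustained exhaustion usually means the net.netfilter.nf_conntrack_max sysctl needs raising, which is environment-specific:
# conntrack table utilisation per node
node_nf_conntrack_entries / node_nf_conntrack_entries_limit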
NodeNetworkReceiveErrs (0 active)
alert: NodeNetworkReceiveErrs
expr: increase(node_network_receive_errs_total[2m])
  > 10
for: 1h
labels:
  severity: warning
annotations:
  description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
    {{ printf "%.0f" $value }} receive errors in the last two minutes.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
  summary: Network interface is reporting many receive errors.
NodeNetworkTransmitErrs (0 active)
alert: NodeNetworkTransmitErrs
expr: increase(node_network_transmit_errs_total[2m])
  > 10
for: 1h
labels:
  severity: warning
annotations:
  description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
    {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
  summary: Network interface is reporting many transmit errors.
NodeRAIDDegraded (0 active)
alert: NodeRAIDDegraded
expr: node_md_disks_required
  - ignoring(state) (node_md_disks{state="active"}) > 0
for: 15m
labels:
  severity: critical
annotations:
  description: RAID array '{{ $labels.device }}' on {{ $labels.instance }}
    is in a degraded state due to one or more disk failures. The number of spare
    drives is insufficient to fix the issue automatically.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddegraded
  summary: RAID Array is degraded
NodeRAIDDiskFailure (0 active)
alert: NodeRAIDDiskFailure
expr: node_md_disks{state="fail"}
  > 0
labels:
  severity: warning
annotations:
  description: At least one device in RAID array on {{ $labels.instance }} failed.
    Array '{{ $labels.device }}' needs attention and possibly a disk swap.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-noderaiddiskfailure
  summary: Failed device in RAID array
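Both RAID rules read the per-state disk counts from node_md_disks; listing the metrics without a state filter shows the full breakdown per array (ad-hoc sketch; the exact state values, e.g. active and fail, come from node-exporter's mdadm collector):
# disk counts per md array and state
node_md_disks
# required disks per array, for comparison with the active count
node_md_disks_required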
NodeTextFileCollectorScrapeError (0 active)
alert: NodeTextFileCollectorScrapeError
expr: node_textfile_scrape_error{job="node-exporter"}
  == 1
labels:
  severity: warning
annotations:
  description: Node Exporter text file collector failed to scrape.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror
  summary: Node Exporter text file collector failed to scrape.
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-node-network.yaml > node-network
NodeNetworkInterfaceFlapping (0 active)
alert: NodeNetworkInterfaceFlapping
expr: changes(node_network_up{device!~"veth.+",job="node-exporter"}[2m])
  > 2
for: 2m
labels:
  severity: warning
annotations:
  message: Network interface "{{ $labels.device }}" is changing its up status
    often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-prometheus-operator.yaml > prometheus-operator
PrometheusOperatorListErrors (0 active)
alert: PrometheusOperatorListErrors
expr: (sum
  by(controller, namespace) (rate(prometheus_operator_list_operations_failed_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m]))
  / sum by(controller, namespace) (rate(prometheus_operator_list_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m])))
  > 0.4
for: 15m
labels:
  severity: warning
annotations:
  description: Errors while performing List operations in controller {{$labels.controller}}
    in {{$labels.namespace}} namespace.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorlisterrors
  summary: Errors while performing list operations in controller.
PrometheusOperatorNodeLookupErrors (0 active)
alert: PrometheusOperatorNodeLookupErrors
expr: rate(prometheus_operator_node_address_lookup_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])
  > 0.1
for: 10m
labels:
  severity: warning
annotations:
  description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornodelookuperrors
  summary: Errors while reconciling Prometheus.
PrometheusOperatorReconcileErrors (0 active)
alert: PrometheusOperatorReconcileErrors
expr: (sum
  by(controller, namespace) (rate(prometheus_operator_reconcile_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])))
  / (sum by(controller, namespace) (rate(prometheus_operator_reconcile_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])))
  > 0.1
for: 10m
labels:
  severity: warning
annotations:
  description: '{{ $value | humanizePercentage }} of reconciling operations failed
    for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorreconcileerrors
  summary: Errors while reconciling controller.
PrometheusOperatorWatchErrors (0 active)
alert: PrometheusOperatorWatchErrors
expr: (sum
  by(controller, namespace) (rate(prometheus_operator_watch_operations_failed_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m]))
  / sum by(controller, namespace) (rate(prometheus_operator_watch_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m])))
  > 0.4
for: 15m
labels:
  severity: warning
annotations:
  description: Errors while performing watch operations in controller {{$labels.controller}}
    in {{$labels.namespace}} namespace.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorwatcherrors
  summary: Errors while performing watch operations in controller.
/etc/prometheus/rules/prometheus-kube-prometheus-stack-prometheus-rulefiles-0/monitoring-kube-prometheus-stack-prometheus.yaml > prometheus
PrometheusBadConfig (0 active)
alert: PrometheusBadConfig
expr: max_over_time(prometheus_config_last_reload_successful{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  == 0
for: 10m
labels:
  severity: critical
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload
    its configuration.
  summary: Failed Prometheus configuration reload.
PrometheusDuplicateTimestamps (0 active)
alert: PrometheusDuplicateTimestamps
expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  > 0
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf
    "%.4g" $value  }} samples/s with different values but duplicated timestamp.
  summary: Prometheus is dropping samples with duplicate timestamps.
PrometheusErrorSendingAlertsToAnyAlertmanager (0 active)
alert: PrometheusErrorSendingAlertsToAnyAlertmanager
expr: min
  without(alertmanager) (rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  / rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
  * 100 > 3
for: 15m
labels:
  severity: critical
annotations:
  description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
    from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
  summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
PrometheusErrorSendingAlertsToSomeAlertmanagers (0 active)
alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
expr: (rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  / rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
  * 100 > 1
for: 15m
labels:
  severity: warning
annotations:
  description: '{{ printf "%.1f" $value }}% errors while sending alerts from
    Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
  summary: Prometheus has encountered more than 1% errors sending alerts to a specific
    Alertmanager.
PrometheusMissingRuleEvaluations (0 active)
alert: PrometheusMissingRuleEvaluations
expr: increase(prometheus_rule_group_iterations_missed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  > 0
for: 15m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf
    "%.0f" $value }} rule group evaluations in the last 5m.
  summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
PrometheusNotConnectedToAlertmanagers (0 active)
alert: PrometheusNotConnectedToAlertmanagers
expr: max_over_time(prometheus_notifications_alertmanagers_discovered{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  < 1
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to
    any Alertmanagers.
  summary: Prometheus is not connected to any Alertmanagers.
PrometheusNotIngestingSamples (0 active)
alert: PrometheusNotIngestingSamples
expr: rate(prometheus_tsdb_head_samples_appended_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  <= 0
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
  summary: Prometheus is not ingesting samples.
PrometheusNotificationQueueRunningFull (0 active)
alert: PrometheusNotificationQueueRunningFull
expr: (predict_linear(prometheus_notifications_queue_length{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m],
  60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
for: 15m
labels:
  severity: warning
annotations:
  description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}}
    is running full.
  summary: Prometheus alert notification queue predicted to run full in less than
    30m.
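The rule extrapolates the notification queue length 30 minutes ahead and compares it with the configured queue capacity; the two sides can be plotted separately as an ad-hoc sketch (run each expression on its own):
# projected notification queue length 30 minutes from now
predict_linear(prometheus_notifications_queue_length{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m], 60 * 30)
# configured queue capacity
min_over_time(prometheus_notifications_queue_capacity{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])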
PrometheusOutOfOrderTimestamps (0 active)
alert: PrometheusOutOfOrderTimestamps
expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  > 0
for: 10m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf
    "%.4g" $value  }} samples/s with timestamps arriving out of order.
  summary: Prometheus drops samples with out-of-order timestamps.
PrometheusRemoteStorageFailures (0 active)
alert: PrometheusRemoteStorageFailures
expr: (rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  / (rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  + rate(prometheus_remote_storage_succeeded_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])))
  * 100 > 1
for: 15m
labels:
  severity: critical
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{
    printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{
    $labels.url }}.
  summary: Prometheus fails to send samples to remote storage.
PrometheusRemoteWriteBehind (0 active)
alert: PrometheusRemoteWriteBehind
expr: (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  - on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
  > 120
for: 15m
labels:
  severity: critical
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{
    printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url
    }}.
  summary: Prometheus remote write is behind.
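The lag is the difference between the newest sample timestamp Prometheus has seen and the newest timestamp each remote queue has successfully sent; it can be watched directly in seconds with an ad-hoc sketch taken from the expression above:
# remote write lag in seconds per remote_name/url
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  - on(job, instance) group_right() max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])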
PrometheusRemoteWriteDesiredShards (0 active)
alert: PrometheusRemoteWriteDesiredShards
expr: (max_over_time(prometheus_remote_storage_shards_desired{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  > max_over_time(prometheus_remote_storage_shards_max{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
for: 15m
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired
    shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{
    $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="kube-prometheus-stack-prometheus",namespace="monitoring"}`
    $labels.instance | query | first | value }}.
  summary: Prometheus remote write desired shards calculation wants to run more than
    configured max shards.
PrometheusRuleFailures (0 active)
alert: PrometheusRuleFailures
expr: increase(prometheus_rule_evaluation_failures_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
  > 0
for: 15m
labels:
  severity: critical
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate
    {{ printf "%.0f" $value }} rules in the last 5m.
  summary: Prometheus is failing rule evaluations.
PrometheusTSDBCompactionsFailing (0 active)
alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[3h])
  > 0
for: 4h
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
    | humanize}} compaction failures over the last 3h.
  summary: Prometheus has issues compacting blocks.
PrometheusTSDBReloadsFailing (0 active)
alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[3h])
  > 0
for: 4h
labels:
  severity: warning
annotations:
  description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
    | humanize}} reload failures over the last 3h.
  summary: Prometheus has issues reloading blocks from disk.