# PrometheusRule custom resource `prometheus-k8s-rules` in the `monitoring`
# namespace. `spec.groups` holds the recording- and alerting-rule groups.
# NOTE(review): the metadata labels are presumably what the Prometheus
# resource's rule selector matches on; confirm against that resource.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-rules
  namespace: monitoring
spec:
  groups:
# Recording rules that pre-aggregate node-exporter series per instance:
# CPU count and utilisation, load per CPU, memory utilisation, major page
# faults, per-device disk I/O time, and network throughput/drops with the
# loopback device excluded.
- name: node-exporter.rules
  rules:
  # CPU count: collapse the `mode` label first, then count distinct `cpu`s.
  - expr: |
      count without (cpu) (
        count without (mode) (
          node_cpu_seconds_total{job="node-exporter"}
        )
      )
    record: instance:node_num_cpu:sum
  # Utilisation is 1 minus the average idle rate across CPUs.
  - expr: |
      1 - avg without (cpu, mode) (
        rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
      )
    record: instance:node_cpu_utilisation:rate1m
  # 1-minute load average normalised by the CPU count recorded above.
  - expr: |
      (
        node_load1{job="node-exporter"}
      /
        instance:node_num_cpu:sum{job="node-exporter"}
      )
    record: instance:node_load1_per_cpu:ratio
  - expr: |
      1 - (
        node_memory_MemAvailable_bytes{job="node-exporter"}
      /
        node_memory_MemTotal_bytes{job="node-exporter"}
      )
    record: instance:node_memory_utilisation:ratio
  - expr: |
      rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
    record: instance:node_vmstat_pgmajfault:rate1m
  # Disk rules only consider devices whose names match the listed prefixes.
  - expr: |
      rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
    record: instance_device:node_disk_io_time_seconds:rate1m
  - expr: |
      rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
    record: instance_device:node_disk_io_time_weighted_seconds:rate1m
  # Network rules sum over all devices except `lo`.
  - expr: |
      sum without (device) (
        rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
      )
    record: instance:node_network_receive_bytes_excluding_lo:rate1m
  - expr: |
      sum without (device) (
        rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
      )
    record: instance:node_network_transmit_bytes_excluding_lo:rate1m
  - expr: |
      sum without (device) (
        rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
      )
    record: instance:node_network_receive_drop_excluding_lo:rate1m
  - expr: |
      sum without (device) (
        rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
      )
    record: instance:node_network_transmit_drop_excluding_lo:rate1m
# API server request-duration quantiles (0.99, 0.9, 0.5) over a 5m rate,
# aggregated across `instance` and `pod`. All three rules record the same
# series name and are told apart by the `quantile` label.
- name: kube-apiserver.rules
  rules:
  - expr: |
      histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
    labels:
      quantile: "0.99"
    record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
  - expr: |
      histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
    labels:
      quantile: "0.9"
    record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
  - expr: |
      histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
    labels:
      quantile: "0.5"
    record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
# Recording rules over kubelet container metrics and kube-state-metrics:
# per-namespace CPU/memory usage, per-container series joined with the pod's
# `node` label, per-namespace resource requests, and pod -> workload mapping.
- name: k8s.rules
  rules:
  - expr: |
      sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])) by (namespace)
    record: namespace:container_cpu_usage_seconds_total:sum_rate
  # The `* on (namespace, pod) group_left(node)` join in the next five rules
  # copies the `node` label from kube_pod_info onto each container series.
  - expr: |
      sum by (namespace, pod, container) (
        rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])
      ) * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
    record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
  - expr: |
      container_memory_working_set_bytes{job="kubelet", image!=""}
      * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
    record: node_namespace_pod_container:container_memory_working_set_bytes
  - expr: |
      container_memory_rss{job="kubelet", image!=""}
      * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
    record: node_namespace_pod_container:container_memory_rss
  - expr: |
      container_memory_cache{job="kubelet", image!=""}
      * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
    record: node_namespace_pod_container:container_memory_cache
  - expr: |
      container_memory_swap{job="kubelet", image!=""}
      * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
    record: node_namespace_pod_container:container_memory_swap
  - expr: |
      sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace)
    record: namespace:container_memory_usage_bytes:sum
  # Requests are only counted for pods whose phase is Pending or Running.
  - expr: |
      sum by (namespace, label_name) (
        sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)) by (namespace, pod)
        * on (namespace, pod)
        group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
      )
    record: namespace:kube_pod_container_resource_requests_memory_bytes:sum
  - expr: |
      sum by (namespace, label_name) (
        sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)) by (namespace, pod)
        * on (namespace, pod)
        group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
      )
    record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
  # Pods owned by a ReplicaSet are mapped to the ReplicaSet's own owner:
  # copy owner_name into `replicaset`, join kube_replicaset_owner to pull in
  # its owner_name, then expose that as `workload`.
  - expr: |
      sum(
        label_replace(
          label_replace(
            kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
            "replicaset", "$1", "owner_name", "(.*)"
          ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
          "workload", "$1", "owner_name", "(.*)"
        )
      ) by (namespace, workload, pod)
    labels:
      workload_type: deployment
    record: mixin_pod_workload
  - expr: |
      sum(
        label_replace(
          kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
          "workload", "$1", "owner_name", "(.*)"
        )
      ) by (namespace, workload, pod)
    labels:
      workload_type: daemonset
    record: mixin_pod_workload
  - expr: |
      sum(
        label_replace(
          kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
          "workload", "$1", "owner_name", "(.*)"
        )
      ) by (namespace, workload, pod)
    labels:
      workload_type: statefulset
    record: mixin_pod_workload
# Scheduler latency quantiles (0.99, 0.9, 0.5) over a 5m rate for three
# histograms: end-to-end scheduling, scheduling algorithm, and binding.
# Rules sharing a record name are distinguished by the `quantile` label.
- name: kube-scheduler.rules
  rules:
  - expr: |
      histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    labels:
      quantile: "0.99"
    record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
  - expr: |
      histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    labels:
      quantile: "0.99"
    record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
  - expr: |
      histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    labels:
      quantile: "0.99"
    record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
  - expr: |
      histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    labels:
      quantile: "0.9"
    record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
  - expr: |
      histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    labels:
      quantile: "0.9"
    record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
  - expr: |
      histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    labels:
      quantile: "0.9"
    record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
  - expr: |
      histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    labels:
      quantile: "0.5"
    record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
  - expr: |
      histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    labels:
      quantile: "0.5"
    record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
  - expr: |
      histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
    labels:
      quantile: "0.5"
    record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
# Node-level recording rules: node count, a node/namespace/pod info series,
# CPU count per node, and cluster-wide available memory.
# Record names that begin or end with `:` are quoted so YAML reads them as
# plain strings rather than as mapping syntax.
- name: node.rules
  rules:
  - expr: sum(min(kube_pod_info) by (node))
    record: ':kube_pod_info_node_count:'
  - expr: |
      max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
    record: 'node_namespace_pod:kube_pod_info:'
  # The right-hand side is the series recorded by the previous rule; its name
  # ends in `:` and must be written without a space before that colon.
  - expr: |
      count by (node) (sum by (node, cpu) (
        node_cpu_seconds_total{job="node-exporter"}
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      ))
    record: node:node_num_cpu:sum
  # Uses MemAvailable where present; `or` falls back to the sum of
  # Buffers + Cached + MemFree + Slab for series that lack it.
  - expr: |
      sum(
        node_memory_MemAvailable_bytes{job="node-exporter"} or
        (
          node_memory_Buffers_bytes{job="node-exporter"} +
          node_memory_Cached_bytes{job="node-exporter"} +
          node_memory_MemFree_bytes{job="node-exporter"} +
          node_memory_Slab_bytes{job="node-exporter"}
        )
      )
    record: ':node_memory_MemAvailable_bytes:sum'
# Per-instance and cluster-wide node aggregates: non-idle/non-iowait CPU
# rate, root filesystem usage, network byte rates, and CPU ratios.
- name: kube-prometheus-node-recording.rules
  rules:
  - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)
    record: instance:node_cpu:rate:sum
  # Used bytes (size - free) on the filesystem mounted at "/".
  - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
      BY (instance)
    record: instance:node_filesystem_usage:sum
  - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
    record: instance:node_network_receive_bytes:rate:sum
  - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
    record: instance:node_network_transmit_bytes:rate:sum
  # Busy CPU rate per instance divided by that instance's CPU count.
  - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT
      (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
      BY (instance, cpu)) BY (instance)
    record: instance:node_cpu:ratio
  - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
    record: cluster:node_cpu:sum_rate5m
  # The numerator must be the series recorded by the rule directly above
  # (cluster:node_cpu:sum_rate5m). The previous numerator named a series that
  # no rule here records, so this ratio never produced a result.
  - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total)
      BY (instance, cpu))
    record: cluster:node_cpu:ratio
# Alerts on node-exporter filesystem and network metrics. Filesystem alerts
# come in warning/critical pairs that share an alert name and differ in
# threshold and/or prediction horizon. All of them skip read-only
# filesystems and series with an empty `fstype`.
- name: node-exporter
  rules:
  # Space: below 40% available and predicted (from the last 6h) to hit zero
  # within 24 hours.
  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
        has only {{ printf "%.2f" $value }}% available space left and is filling
        up.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
      summary: Filesystem is predicted to run out of space within the next 24 hours.
    expr: |
      (
        node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
      and
        predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  # Space: below 20% available and predicted to hit zero within 4 hours.
  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
        has only {{ printf "%.2f" $value }}% available space left and is filling
        up fast.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
      summary: Filesystem is predicted to run out of space within the next 4 hours.
    expr: |
      (
        node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20
      and
        predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
        has only {{ printf "%.2f" $value }}% available space left.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
      summary: Filesystem has less than 5% space left.
    expr: |
      (
        node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
        has only {{ printf "%.2f" $value }}% available space left.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
      summary: Filesystem has less than 3% space left.
    expr: |
      (
        node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  # The inode ("files") alerts below mirror the space alerts above.
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
        has only {{ printf "%.2f" $value }}% available inodes left and is filling
        up.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
      summary: Filesystem is predicted to run out of inodes within the next 24 hours.
    expr: |
      (
        node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
      and
        predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
        has only {{ printf "%.2f" $value }}% available inodes left and is filling
        up fast.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
      summary: Filesystem is predicted to run out of inodes within the next 4 hours.
    expr: |
      (
        node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
      and
        predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
        has only {{ printf "%.2f" $value }}% available inodes left.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
      summary: Filesystem has less than 5% inodes left.
    expr: |
      (
        node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
        has only {{ printf "%.2f" $value }}% available inodes left.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
      summary: Filesystem has less than 3% inodes left.
    expr: |
      (
        node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  # Network: more than 10 errors in a 2m window, sustained for 1h.
  - alert: NodeNetworkReceiveErrs
    annotations:
      description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
        {{ printf "%.0f" $value }} receive errors in the last two minutes.'
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
      summary: Network interface is reporting many receive errors.
    expr: |
      increase(node_network_receive_errs_total[2m]) > 10
    for: 1h
    labels:
      severity: warning
  - alert: NodeNetworkTransmitErrs
    annotations:
      description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
        {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
      summary: Network interface is reporting many transmit errors.
    expr: |
      increase(node_network_transmit_errs_total[2m]) > 10
    for: 1h
    labels:
      severity: warning
# Workload-health alerts built on kube-state-metrics: crash-looping or
# non-ready pods, Deployment/StatefulSet/DaemonSet rollout problems, waiting
# containers, slow or failed Jobs and CronJobs, and HPA mismatches.
- name: kubernetes-apps
  rules:
  # The 15m restart rate is scaled by 60 * 5 to give restarts per 5 minutes.
  - alert: KubePodCrashLooping
    annotations:
      message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
        }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
    expr: |
      rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
    for: 15m
    labels:
      severity: critical
  # The join with kube_pod_owner{owner_kind!="Job"} leaves out Job-owned pods.
  - alert: KubePodNotReady
    annotations:
      message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
        state for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
    expr: |
      sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
    for: 15m
    labels:
      severity: critical
  - alert: KubeDeploymentGenerationMismatch
    annotations:
      message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
        }} does not match, this indicates that the Deployment has failed but has
        not been rolled back.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
    expr: |
      kube_deployment_status_observed_generation{job="kube-state-metrics"}
        !=
      kube_deployment_metadata_generation{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: critical
  - alert: KubeDeploymentReplicasMismatch
    annotations:
      message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
        matched the expected number of replicas for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
    expr: |
      kube_deployment_spec_replicas{job="kube-state-metrics"}
        !=
      kube_deployment_status_replicas_available{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: critical
  - alert: KubeStatefulSetReplicasMismatch
    annotations:
      message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
        not matched the expected number of replicas for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
    expr: |
      kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
        !=
      kube_statefulset_status_replicas{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: critical
  - alert: KubeStatefulSetGenerationMismatch
    annotations:
      message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
        }} does not match, this indicates that the StatefulSet has failed but has
        not been rolled back.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
    expr: |
      kube_statefulset_status_observed_generation{job="kube-state-metrics"}
        !=
      kube_statefulset_metadata_generation{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: critical
  # Fires when the current revision has no matching update revision
  # (`unless`) and the updated replica count differs from the desired count.
  - alert: KubeStatefulSetUpdateNotRolledOut
    annotations:
      message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
        has not been rolled out.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
    expr: |
      max without (revision) (
        kube_statefulset_status_current_revision{job="kube-state-metrics"}
          unless
        kube_statefulset_status_update_revision{job="kube-state-metrics"}
      )
        *
      (
        kube_statefulset_replicas{job="kube-state-metrics"}
          !=
        kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
      )
    for: 15m
    labels:
      severity: critical
  # $value is the ready/desired ratio, rendered with humanizePercentage.
  - alert: KubeDaemonSetRolloutStuck
    annotations:
      message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet
        {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
    expr: |
      kube_daemonset_status_number_ready{job="kube-state-metrics"}
        /
      kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00
    for: 15m
    labels:
      severity: critical
  - alert: KubeContainerWaiting
    annotations:
      message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
        has been in waiting state for longer than 1 hour.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
    expr: |
      sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
    for: 1h
    labels:
      severity: warning
  - alert: KubeDaemonSetNotScheduled
    annotations:
      message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
        }} are not scheduled.'
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
    expr: |
      kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
        -
      kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
    for: 10m
    labels:
      severity: warning
  - alert: KubeDaemonSetMisScheduled
    annotations:
      message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
        }} are running where they are not supposed to run.'
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
    expr: |
      kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
    for: 10m
    labels:
      severity: warning
  - alert: KubeCronJobRunning
    annotations:
      message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more
        than 1h to complete.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
    expr: |
      time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
    for: 1h
    labels:
      severity: warning
  - alert: KubeJobCompletion
    annotations:
      message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
        than one hour to complete.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
    expr: |
      kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
    for: 1h
    labels:
      severity: warning
  - alert: KubeJobFailed
    annotations:
      message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
    expr: |
      kube_job_failed{job="kube-state-metrics"} > 0
    for: 15m
    labels:
      severity: warning
  # Only fires while the current replica count has not changed in the last
  # 15 minutes.
  - alert: KubeHpaReplicasMismatch
    annotations:
      message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the
        desired number of replicas for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
    expr: |
      (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
        !=
      kube_hpa_status_current_replicas{job="kube-state-metrics"})
        and
      changes(kube_hpa_status_current_replicas[15m]) == 0
    for: 15m
    labels:
      severity: warning
  - alert: KubeHpaMaxedOut
    annotations:
      message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at
        max replicas for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
    expr: |
      kube_hpa_status_current_replicas{job="kube-state-metrics"}
        ==
      kube_hpa_spec_max_replicas{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: warning
# Capacity alerts: pod requests vs. allocatable node resources, namespace
# quotas vs. allocatable node resources, quota usage, and CPU throttling.
- name: kubernetes-resources
  rules:
  # Fires when requested CPU exceeds the (N-1)/N share of allocatable CPU,
  # where N is the number of nodes reporting allocatable CPU.
  - alert: KubeCPUOvercommit
    annotations:
      message: Cluster has overcommitted CPU resource requests for Pods and cannot
        tolerate node failure.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
    expr: |
      sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
        /
      sum(kube_node_status_allocatable_cpu_cores)
        >
      (count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores)
    for: 5m
    labels:
      severity: warning
  - alert: KubeMemOvercommit
    annotations:
      message: Cluster has overcommitted memory resource requests for Pods and cannot
        tolerate node failure.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
    expr: |
      sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)
        /
      sum(kube_node_status_allocatable_memory_bytes)
        >
      (count(kube_node_status_allocatable_memory_bytes)-1)
        /
      count(kube_node_status_allocatable_memory_bytes)
    for: 5m
    labels:
      severity: warning
  # Namespace-level variants: sum of hard quotas above 1.5x allocatable.
  - alert: KubeCPUOvercommit
    annotations:
      message: Cluster has overcommitted CPU resource requests for Namespaces.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
    expr: |
      sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
        /
      sum(kube_node_status_allocatable_cpu_cores)
        > 1.5
    for: 5m
    labels:
      severity: warning
  # The denominator is deliberately left without a job selector, matching
  # every other use of the allocatable metrics in this group. Filtering on
  # job="node-exporter" selected no series, so the alert could never fire.
  - alert: KubeMemOvercommit
    annotations:
      message: Cluster has overcommitted memory resource requests for Namespaces.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
    expr: |
      sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
        /
      sum(kube_node_status_allocatable_memory_bytes)
        > 1.5
    for: 5m
    labels:
      severity: warning
  # used / hard per quota; `> 0` on the hard side drops zero-valued limits
  # before the division.
  - alert: KubeQuotaExceeded
    annotations:
      message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
        }} of its {{ $labels.resource }} quota.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
    expr: |
      kube_resourcequota{job="kube-state-metrics", type="used"}
        / ignoring(instance, job, type)
      (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
        > 0.90
    for: 15m
    labels:
      severity: warning
  # Share of CFS periods that were throttled over 5m, above 25%.
  - alert: CPUThrottlingHigh
    annotations:
      message: '{{ $value | humanizePercentage }} throttling of CPU in namespace
        {{ $labels.namespace }} for container {{ $labels.container }} in pod {{
        $labels.pod }}.'
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
    expr: |
      sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
        /
      sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)
        > ( 25 / 100 )
    for: 15m
    labels:
      severity: warning
# PersistentVolume alerts: low free space reported by the kubelet, predicted
# exhaustion, and volumes in a Failed or Pending phase.
- name: kubernetes-storage
  rules:
  # Fires when the available/capacity ratio is below 3%.
  - alert: KubePersistentVolumeUsageCritical
    annotations:
      message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
        }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
        }} free.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
    expr: |
      kubelet_volume_stats_available_bytes{job="kubelet"}
        /
      kubelet_volume_stats_capacity_bytes{job="kubelet"}
        < 0.03
    for: 1m
    labels:
      severity: critical
  # Below 15% available AND a linear fit over the last 6h predicts zero
  # available bytes within 4 days (4 * 24 * 3600 seconds).
  - alert: KubePersistentVolumeFullInFourDays
    annotations:
      message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
        }} in Namespace {{ $labels.namespace }} is expected to fill up within four
        days. Currently {{ $value | humanizePercentage }} is available.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
    expr: |
      (
        kubelet_volume_stats_available_bytes{job="kubelet"}
          /
        kubelet_volume_stats_capacity_bytes{job="kubelet"}
      ) < 0.15
      and
      predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
    for: 1h
    labels:
      severity: critical
  - alert: KubePersistentVolumeErrors
    annotations:
      message: The persistent volume {{ $labels.persistentvolume }} has status {{
        $labels.phase }}.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
    expr: |
      kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
    for: 5m
    labels:
      severity: critical
# Cluster-wide component alerts: version skew and API client error ratio.
- name: kubernetes-system
  rules:
  # Counts distinct gitVersion values (reduced by label_replace to the
  # leading "vN.N.N" part) over kubernetes_build_info, excluding the
  # kube-dns/coredns jobs; warns when more than one exists for 15m.
  - alert: KubeVersionMismatch
    annotations:
      message: >-
        There are {{ $value }} different semantic versions of Kubernetes
        components running.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
    expr: |
      count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
    for: 15m
    labels:
      severity: warning
  # Warns when more than 1% of a client's REST requests (per instance/job)
  # returned a 5xx code over 5m, sustained for 15m.
  # The message previously ended with an unbalanced trailing single quote.
  - alert: KubeClientErrors
    annotations:
      message: >-
        Kubernetes API server client
        '{{ $labels.job }}/{{ $labels.instance }}' is experiencing
        {{ $value | humanizePercentage }} errors.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
    expr: |
      (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
        /
      sum(rate(rest_client_requests_total[5m])) by (instance, job))
      > 0.01
    for: 15m
    labels:
      severity: warning
2019-11-17 15:05:27 +00:00
# API server alerts: request latency, 5xx error ratios (overall and per
# resource/subresource/verb), client certificate expiry and target absence.
# Several alert names appear more than once with different thresholds and
# severities.
- name: kubernetes-system-apiserver
  rules:
  # Warning tier: recorded 0.99 quantile of request duration above 1s for
  # 10m, ignoring the "log" subresource and LIST/WATCH/WATCHLIST/PROXY/CONNECT.
  - alert: KubeAPILatencyHigh
    annotations:
      message: >-
        The API server has a 99th percentile latency of {{ $value }} seconds
        for {{ $labels.verb }} {{ $labels.resource }}.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
    expr: |
      cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 1
    for: 10m
    labels:
      severity: warning
  # Critical tier: same selector as above, threshold 4s.
  - alert: KubeAPILatencyHigh
    annotations:
      message: >-
        The API server has a 99th percentile latency of {{ $value }} seconds
        for {{ $labels.verb }} {{ $labels.resource }}.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
    expr: |
      cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 4
    for: 10m
    labels:
      severity: critical
  # Critical tier: overall share of 5xx responses above 3% for 10m.
  - alert: KubeAPIErrorsHigh
    annotations:
      message: >-
        API server is returning errors for {{ $value | humanizePercentage }}
        of requests.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
    expr: |
      sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m]))
        /
      sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.03
    for: 10m
    labels:
      severity: critical
  # Warning tier: overall share of 5xx responses above 1% for 10m.
  - alert: KubeAPIErrorsHigh
    annotations:
      message: >-
        API server is returning errors for {{ $value | humanizePercentage }}
        of requests.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
    expr: |
      sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m]))
        /
      sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.01
    for: 10m
    labels:
      severity: warning
  # Critical tier: 5xx share per (resource, subresource, verb) above 10%
  # for 10m.
  - alert: KubeAPIErrorsHigh
    annotations:
      message: >-
        API server is returning errors for {{ $value | humanizePercentage }}
        of requests for {{ $labels.verb }} {{ $labels.resource }}
        {{ $labels.subresource }}.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
    expr: |
      sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) by (resource,subresource,verb)
        /
      sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.10
    for: 10m
    labels:
      severity: critical
  # Warning tier: 5xx share per (resource, subresource, verb) above 5%
  # for 10m.
  - alert: KubeAPIErrorsHigh
    annotations:
      message: >-
        API server is returning errors for {{ $value | humanizePercentage }}
        of requests for {{ $labels.verb }} {{ $labels.resource }}
        {{ $labels.subresource }}.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
    expr: |
      sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) by (resource,subresource,verb)
        /
      sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.05
    for: 10m
    labels:
      severity: warning
  # Warning tier: the 0.01 quantile of the client certificate expiration
  # histogram is below 604800 seconds (7 days). No `for` hold period is set.
  - alert: KubeClientCertificateExpiration
    annotations:
      message: >-
        A client certificate used to authenticate to the apiserver is
        expiring in less than 7.0 days.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
    expr: |
      apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
    labels:
      severity: warning
  # Critical tier: same expression with an 86400 second (24 hour) threshold.
  - alert: KubeClientCertificateExpiration
    annotations:
      message: >-
        A client certificate used to authenticate to the apiserver is
        expiring in less than 24.0 hours.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
    expr: |
      apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
    labels:
      severity: critical
  # Critical when no `up{job="apiserver"}` series equals 1 for 15m.
  - alert: KubeAPIDown
    annotations:
      message: KubeAPI has disappeared from Prometheus target discovery.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
    expr: |
      absent(up{job="apiserver"} == 1)
    for: 15m
    labels:
      severity: critical
# Node and kubelet alerts: readiness, reachability taint, pod capacity and
# kubelet target absence.
- name: kubernetes-system-kubelet
  rules:
  # Warns when the Ready condition's status="true" series is 0 for 15m.
  - alert: KubeNodeNotReady
    annotations:
      message: >-
        {{ $labels.node }} has been unready for more than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
    expr: |
      kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
    for: 15m
    labels:
      severity: warning
  # Warns when a node carries the node.kubernetes.io/unreachable NoSchedule
  # taint. No `for` hold period is set.
  - alert: KubeNodeUnreachable
    annotations:
      message: >-
        {{ $labels.node }} is unreachable and some workloads may be
        rescheduled.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
    expr: |
      kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} == 1
    labels:
      severity: warning
  # Warns when running pods divided by the node's pod capacity exceeds 0.95
  # for 15m. The kubelet count is joined to a node label via
  # kubelet_node_name.
  - alert: KubeletTooManyPods
    annotations:
      message: >-
        Kubelet '{{ $labels.node }}' is running at
        {{ $value | humanizePercentage }} of its Pod capacity.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
    expr: |
      max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
    for: 15m
    labels:
      severity: warning
  # Critical when no `up{job="kubelet"}` series equals 1 for 15m.
  - alert: KubeletDown
    annotations:
      message: Kubelet has disappeared from Prometheus target discovery.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
    expr: |
      absent(up{job="kubelet"} == 1)
    for: 15m
    labels:
      severity: critical
# Scheduler availability.
- name: kubernetes-system-scheduler
  rules:
  # Critical when no `up{job="kube-scheduler"}` series equals 1 for 15m.
  - alert: KubeSchedulerDown
    annotations:
      message: >-
        KubeScheduler has disappeared from Prometheus target discovery.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
    expr: |
      absent(up{job="kube-scheduler"} == 1)
    for: 15m
    labels:
      severity: critical
# Controller manager availability.
- name: kubernetes-system-controller-manager
  rules:
  # Critical when no `up{job="kube-controller-manager"}` series equals 1
  # for 15m.
  - alert: KubeControllerManagerDown
    annotations:
      message: >-
        KubeControllerManager has disappeared from Prometheus target
        discovery.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
    expr: |
      absent(up{job="kube-controller-manager"} == 1)
    for: 15m
    labels:
      severity: critical
2019-08-08 20:09:53 +00:00
# Self-monitoring alerts for the Prometheus server itself. Every selector is
# pinned to job="prometheus-k8s", namespace="monitoring".
- name: prometheus
  rules:
  # Critical when the last config reload was unsuccessful throughout the
  # 5m window, for 10m.
  - alert: PrometheusBadConfig
    annotations:
      description: >-
        Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
        reload its configuration.
      summary: Failed Prometheus configuration reload.
    expr: |
      # Without max_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0
    for: 10m
    labels:
      severity: critical
  # Warns when the queue length extrapolated 60 * 30 seconds ahead exceeds
  # the minimum queue capacity seen in the last 5m, for 15m.
  - alert: PrometheusNotificationQueueRunningFull
    annotations:
      description: >-
        Alert notification queue of Prometheus
        {{$labels.namespace}}/{{$labels.pod}} is running full.
      summary: >-
        Prometheus alert notification queue predicted to run full in less
        than 30m.
    expr: |
      # Without min_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      (
        predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30)
      >
        min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m])
      )
    for: 15m
    labels:
      severity: warning
  # Warns when errors/sent notifications, as a percentage, exceeds 1 for a
  # series, for 15m.
  - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
    annotations:
      description: >-
        {{ printf "%.1f" $value }}% errors while sending alerts from
        Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager
        {{$labels.alertmanager}}.
      summary: >-
        Prometheus has encountered more than 1% errors sending alerts to
        a specific Alertmanager.
    expr: |
      (
        rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
      /
        rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
      )
      * 100
      > 1
    for: 15m
    labels:
      severity: warning
  # Critical when the minimum error percentage across the `alertmanager`
  # label exceeds 3, for 15m.
  - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
    annotations:
      description: >-
        {{ printf "%.1f" $value }}% minimum errors while sending alerts
        from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any
        Alertmanager.
      summary: >-
        Prometheus encounters more than 3% errors sending alerts to any
        Alertmanager.
    expr: |
      min without(alertmanager) (
        rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
      /
        rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
      )
      * 100
      > 3
    for: 15m
    labels:
      severity: critical
  # Warns when fewer than 1 Alertmanager was discovered throughout the 5m
  # window, for 10m.
  - alert: PrometheusNotConnectedToAlertmanagers
    annotations:
      description: >-
        Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected
        to any Alertmanagers.
      summary: Prometheus is not connected to any Alertmanagers.
    expr: |
      # Without max_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1
    for: 10m
    labels:
      severity: warning
  # Warns when the TSDB reload failure counter increased within 3h, for 4h.
  - alert: PrometheusTSDBReloadsFailing
    annotations:
      description: >-
        Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
        {{$value | humanize}} reload failures over the last 3h.
      summary: Prometheus has issues reloading blocks from disk.
    expr: |
      increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
    for: 4h
    labels:
      severity: warning
  # Warns when the failed-compaction counter increased within 3h, for 4h.
  - alert: PrometheusTSDBCompactionsFailing
    annotations:
      description: >-
        Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
        {{$value | humanize}} compaction failures over the last 3h.
      summary: Prometheus has issues compacting blocks.
    expr: |
      increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
    for: 4h
    labels:
      severity: warning
  # Warns when the rate of appended head samples is <= 0 for 10m.
  - alert: PrometheusNotIngestingSamples
    annotations:
      description: >-
        Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting
        samples.
      summary: Prometheus is not ingesting samples.
    expr: |
      rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
    for: 10m
    labels:
      severity: warning
  # Warns when the duplicate-timestamp sample counter has a positive rate
  # for 10m.
  - alert: PrometheusDuplicateTimestamps
    annotations:
      description: >-
        Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
        {{ printf "%.4g" $value }} samples/s with different values but
        duplicated timestamp.
      summary: Prometheus is dropping samples with duplicate timestamps.
    expr: |
      rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
    for: 10m
    labels:
      severity: warning
  # Warns when the out-of-order sample counter has a positive rate for 10m.
  - alert: PrometheusOutOfOrderTimestamps
    annotations:
      description: >-
        Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
        {{ printf "%.4g" $value }} samples/s with timestamps arriving out
        of order.
      summary: Prometheus drops samples with out-of-order timestamps.
    expr: |
      rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
    for: 10m
    labels:
      severity: warning
  # Critical when failed / (failed + succeeded) remote-storage samples, as a
  # percentage, exceeds 1 for 15m.
  - alert: PrometheusRemoteStorageFailures
    annotations:
      description: >-
        Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send
        {{ printf "%.1f" $value }}% of the samples to queue
        {{$labels.queue}}.
      summary: Prometheus fails to send samples to remote storage.
    expr: |
      (
        rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
      /
        (
          rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        +
          rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
      )
      * 100
      > 1
    for: 15m
    labels:
      severity: critical
  # Critical when the highest ingested timestamp is more than 120 ahead of
  # the highest timestamp sent by a queue (matched on job, instance), for 15m.
  - alert: PrometheusRemoteWriteBehind
    annotations:
      description: >-
        Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
        is {{ printf "%.1f" $value }}s behind for queue {{$labels.queue}}.
      summary: Prometheus remote write is behind.
    expr: |
      # Without max_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      (
        max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
      - on (job, instance) group_right
        max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
      )
      > 120
    for: 15m
    labels:
      severity: critical
  # Warns when the desired shard count exceeds the configured maximum
  # (both taken as 5m maxima), for 15m. The description runs an embedded
  # query to print the current maximum for the firing instance.
  - alert: PrometheusRemoteWriteDesiredShards
    annotations:
      description: >-
        Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
        desired shards calculation wants to run {{ $value }} shards, which is more
        than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}`
        $labels.instance | query | first | value }}.
      summary: >-
        Prometheus remote write desired shards calculation wants to run more
        than configured max shards.
    expr: |
      # Without max_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      (
        max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m])
      >
        max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m])
      )
    for: 15m
    labels:
      severity: warning
  # Critical when rule evaluation failures increased within 5m, for 15m.
  - alert: PrometheusRuleFailures
    annotations:
      description: >-
        Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
        evaluate {{ printf "%.0f" $value }} rules in the last 5m.
      summary: Prometheus is failing rule evaluations.
    expr: |
      increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
    for: 15m
    labels:
      severity: critical
  # Warns when missed rule group iterations increased within 5m, for 15m.
  - alert: PrometheusMissingRuleEvaluations
    annotations:
      description: >-
        Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed
        {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
      summary: >-
        Prometheus is missing rule evaluations due to slow rule group
        evaluation.
    expr: |
      increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
    for: 15m
    labels:
      severity: warning
2018-12-12 17:18:00 +00:00
# Alertmanager cluster alerts; selectors are pinned to
# job="alertmanager-main", namespace="monitoring".
- name: alertmanager.rules
  rules:
  # Critical when, per service, the number of instances sharing a config
  # hash divided by the operator's replica count is not 1, for 5m.
  # label_replace builds the "service" label as "alertmanager-<name>" so
  # the two sides can be matched.
  - alert: AlertmanagerConfigInconsistent
    annotations:
      message: >-
        The configuration of the instances of the Alertmanager cluster
        `{{$labels.service}}` are out of sync.
    expr: |
      count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
    for: 5m
    labels:
      severity: critical
  # Warns when the last configuration reload was unsuccessful for 10m.
  - alert: AlertmanagerFailedReload
    annotations:
      message: >-
        Reloading Alertmanager's configuration has failed for
        {{ $labels.namespace }}/{{ $labels.pod}}.
    expr: |
      alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
    for: 10m
    labels:
      severity: warning
  # Critical when an instance's reported member count differs from the
  # number of instances counted per service, for 5m.
  - alert: AlertmanagerMembersInconsistent
    annotations:
      message: Alertmanager has not found all other members of the cluster.
    expr: |
      alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}
        != on (service) GROUP_LEFT()
      count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"})
    for: 5m
    labels:
      severity: critical
# Generic alerts: scrape target availability and an always-firing
# pipeline check.
- name: general.rules
  rules:
  # Warns when more than 10% of the targets of a (job, namespace, service)
  # have up == 0, for 10m.
  - alert: TargetDown
    annotations:
      message: >-
        {{ printf "%.4g" $value }}% of the {{ $labels.job }} targets in
        {{ $labels.namespace }} namespace are down.
    expr: >-
      100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job,
      namespace, service)) > 10
    for: 10m
    labels:
      severity: warning
  # vector(1) always returns a sample, so this alert is permanently firing;
  # its purpose is described in the message below.
  - alert: Watchdog
    annotations:
      message: |
        This is an alert meant to ensure that the entire alerting pipeline is functional.
        This alert is always firing, therefore it should always be firing in Alertmanager
        and always fire against a receiver. There are integrations with various notification
        mechanisms that send a notification when this alert is not firing. For example the
        "DeadMansSnitch" integration in PagerDuty.
    expr: vector(1)
    labels:
      severity: none
2019-04-22 18:17:53 +00:00
# Node clock alerts.
- name: node-time
  rules:
  # Warns when the absolute value of node_timex_offset_seconds exceeds 0.05
  # for 2m.
  - alert: ClockSkewDetected
    annotations:
      message: >-
        Clock skew detected on node-exporter
        {{ $labels.namespace }}/{{ $labels.pod }}. Ensure NTP is configured
        correctly on this host.
    expr: |
      abs(node_timex_offset_seconds{job="node-exporter"}) > 0.05
    for: 2m
    labels:
      severity: warning
# Node network alerts.
- name: node-network
  rules:
  # Warns when node_network_up changed more than twice within 2m for a
  # device not matching "veth.+", for 2m.
  # Message corrected: possessive "its" (was "it's") and the unbalanced
  # trailing double quote removed.
  - alert: NodeNetworkInterfaceFlapping
    annotations:
      message: >-
        Network interface "{{ $labels.device }}" changing its up status
        often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
    expr: |
      changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
    for: 2m
    labels:
      severity: warning
2018-12-12 17:18:00 +00:00
- name : prometheus-operator
rules :
# Warns when the operator's reconcile error counter grows faster than
# 0.1/s (5m rate) for 10m.
- alert: PrometheusOperatorReconcileErrors
  annotations:
    message: >-
      Errors while reconciling {{ $labels.controller }} in
      {{ $labels.namespace }} Namespace.
  expr: |
    rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
  for: 10m
  labels:
    severity: warning
# Warns when the operator's node address lookup error counter grows faster
# than 0.1/s (5m rate) for 10m.
# The message used to repeat the reconcile alert's wording; it now names the
# failure this expression actually measures.
- alert: PrometheusOperatorNodeLookupErrors
  annotations:
    message: >-
      Errors while looking up node addresses in {{ $labels.namespace }}
      Namespace.
  expr: |
    rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
  for: 10m
  labels:
    severity: warning