Bump libraries and use kube-rbac-proxy in K3s
parent
10b82768c2
commit
cdb23a0bcc
4
Makefile
4
Makefile
|
@ -50,8 +50,8 @@ ifeq (, $(shell which jsonnet))
|
|||
endif
|
||||
|
||||
change_suffix:
|
||||
@perl -p -i -e 's/^(\s*)\-\ host:.*/\1- host: alertmanager.${IP}.nip.io/g' manifests/ingress-alertmanager-main.yaml manifests/ingress-prometheus-k8s.yaml manifests/ingress-grafana.yaml
|
||||
@perl -p -i -e 's/^(\s*)\-\ host:.*/\1- host: alertmanager.${IP}.nip.io/g' manifests/ingress-alertmanager.yaml manifests/ingress-prometheus.yaml manifests/ingress-grafana.yaml
|
||||
@echo "Ingress IPs changed to [service].${IP}.nip.io"
|
||||
${K3S} kubectl apply -f manifests/ingress-alertmanager-main.yaml
|
||||
${K3S} kubectl apply -f manifests/ingress-alertmanager.yaml
|
||||
${K3S} kubectl apply -f manifests/ingress-grafana.yaml
|
||||
${K3S} kubectl apply -f manifests/ingress-prometheus-k8s.yaml
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
||||
local vars = import 'vars.jsonnet';
|
||||
local utils = import 'utils.libsonnet';
|
||||
local vars = import 'vars.jsonnet';
|
||||
|
||||
{
|
||||
_config+:: {
|
||||
|
@ -65,7 +65,7 @@ local utils = import 'utils.libsonnet';
|
|||
//---------------------------------------
|
||||
|
||||
prometheus+:: {
|
||||
# Add option (from vars.yaml) to enable persistence
|
||||
// Add option (from vars.yaml) to enable persistence
|
||||
local pvc = k.core.v1.persistentVolumeClaim,
|
||||
prometheus+: {
|
||||
spec+: {
|
||||
|
@ -177,7 +177,7 @@ local utils = import 'utils.libsonnet';
|
|||
// secret.new('basic-auth', { auth: std.base64(importstr 'auth') }) +
|
||||
// secret.mixin.metadata.withNamespace($._config.namespace),
|
||||
} + if vars.UseProvidedCerts then {
|
||||
secret:
|
||||
utils.newTLSSecret('ingress-TLS-secret', $._config.namespace, vars.TLSCertificate, vars.TLSKey)
|
||||
} else {},
|
||||
secret:
|
||||
utils.newTLSSecret('ingress-TLS-secret', $._config.namespace, vars.TLSCertificate, vars.TLSKey),
|
||||
} else {},
|
||||
}
|
||||
|
|
|
@ -8,8 +8,8 @@
|
|||
"subdir": "Documentation/etcd-mixin"
|
||||
}
|
||||
},
|
||||
"version": "63dd73c1869f1784f907b922f61571176a2802e8",
|
||||
"sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw="
|
||||
"version": "07a74d61cb6c07965c5b594748dc999d1644862b",
|
||||
"sum": "Ko3qhNfC2vN/houLh6C0Ryacjv70gl0DVPGU/PQ4OD0="
|
||||
},
|
||||
{
|
||||
"name": "grafana",
|
||||
|
@ -19,8 +19,8 @@
|
|||
"subdir": "grafana"
|
||||
}
|
||||
},
|
||||
"version": "539a90dbf63c812ad0194d8078dd776868a11c81",
|
||||
"sum": "b8faWX1qqLGyN67sA36oRqYZ5HX+tHBRMPtrWRqIysE="
|
||||
"version": "57b4365eacda291b82e0d55ba7eec573a8198dda",
|
||||
"sum": "92DWADwGjnCfpZaL7Q07C0GZayxBziGla/O03qWea34="
|
||||
},
|
||||
{
|
||||
"name": "grafana-builder",
|
||||
|
@ -30,8 +30,8 @@
|
|||
"subdir": "grafana-builder"
|
||||
}
|
||||
},
|
||||
"version": "250bf5499d81e5e77e1e5ed2242c89ad27485aec",
|
||||
"sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE="
|
||||
"version": "c19a92e586a6752f11745b47f309b13f02ef7147",
|
||||
"sum": "slxrtftVDiTlQK22ertdfrg4Epnq97gdrLI63ftUfaE="
|
||||
},
|
||||
{
|
||||
"name": "grafonnet",
|
||||
|
@ -41,8 +41,8 @@
|
|||
"subdir": "grafonnet"
|
||||
}
|
||||
},
|
||||
"version": "cb9e43f59558ff6338a76ae1806a0fb4b70b1b16",
|
||||
"sum": "YIo2bziNlqzZtlnpLtoi9qa1aztcAX7j1UuKRjjEzVY="
|
||||
"version": "7a932c9cfc6ccdb1efca9535f165e055949be42a",
|
||||
"sum": "HbCbHRvgA9a6K5FlOAYOUnErDHnNPWOCYPvDFU++bQE="
|
||||
},
|
||||
{
|
||||
"name": "ksonnet",
|
||||
|
@ -63,8 +63,30 @@
|
|||
"subdir": "jsonnet/kube-prometheus"
|
||||
}
|
||||
},
|
||||
"version": "ce5fe790ee9f1772ad52d935b320d545c8f88722",
|
||||
"sum": "AerKgmCkb6FsMAjPHqExxYSr6C/uYYq5qV9pAZULaxY="
|
||||
"version": "502f81b235a84484b55493af5cf96623ae37ef80",
|
||||
"sum": "weorIzfuzEqgRWW5mtt/p8cXMRhmilW20ppYruOpSZs="
|
||||
},
|
||||
{
|
||||
"name": "kube-state-metrics",
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/kubernetes/kube-state-metrics",
|
||||
"subdir": "jsonnet/kube-state-metrics"
|
||||
}
|
||||
},
|
||||
"version": "fdd2ef120e5d9b56a29e7c3eeeda153acfb446ce",
|
||||
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
|
||||
},
|
||||
{
|
||||
"name": "kube-state-metrics-mixin",
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/kubernetes/kube-state-metrics",
|
||||
"subdir": "jsonnet/kube-state-metrics-mixin"
|
||||
}
|
||||
},
|
||||
"version": "fdd2ef120e5d9b56a29e7c3eeeda153acfb446ce",
|
||||
"sum": "E1GGavnf9PCWBm4WVrxWnc0FIj72UcbcweqGioWrOdU="
|
||||
},
|
||||
{
|
||||
"name": "kubernetes-mixin",
|
||||
|
@ -74,8 +96,8 @@
|
|||
"subdir": ""
|
||||
}
|
||||
},
|
||||
"version": "23416282b86f75bff14bf94732c397764d7ba9db",
|
||||
"sum": "NIE+/dQ5RelahRoLKLfxqAZEel8OTpDbDr1y05glb0E="
|
||||
"version": "16ff3841fea16a0f2151479ab67d8d34893759f3",
|
||||
"sum": "UdI7A4jYc5PxmUHZBIGymx9Hk3eStqYSzXuUHot4oTQ="
|
||||
},
|
||||
{
|
||||
"name": "node-mixin",
|
||||
|
@ -85,8 +107,8 @@
|
|||
"subdir": "docs/node-mixin"
|
||||
}
|
||||
},
|
||||
"version": "c4c5f1f062b141d0111b4068a52eb5917048b3ea",
|
||||
"sum": "7vEamDTP9AApeiF4Zu9ZyXzDIs3rYHzwf9k7g8X+wsg="
|
||||
"version": "0107bc794204f50d887898da60032da890637471",
|
||||
"sum": "VKdF0zPMSCiuIuXWblSz2VOeBaXzQ7fp40vz9sxj+Bo="
|
||||
},
|
||||
{
|
||||
"name": "prometheus",
|
||||
|
@ -96,8 +118,8 @@
|
|||
"subdir": "documentation/prometheus-mixin"
|
||||
}
|
||||
},
|
||||
"version": "23c0299d85bfeb5d9b59e994861553a25ca578e5",
|
||||
"sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM="
|
||||
"version": "012161d90d6a8a6bb930b90601fb89ff6cc3ae60",
|
||||
"sum": "5EUgr6Spr1zNR8Y2/NevjvEkGV9WMvKo6nEScNER1Lc="
|
||||
},
|
||||
{
|
||||
"name": "prometheus-operator",
|
||||
|
@ -107,8 +129,8 @@
|
|||
"subdir": "jsonnet/prometheus-operator"
|
||||
}
|
||||
},
|
||||
"version": "8d44e0990230144177f97cf62ae4f43b1c4e3168",
|
||||
"sum": "5U7/8MD3pF9O0YDTtUhg4vctkUBRVFxZxWUyhtNiBM8="
|
||||
"version": "59bdf55453ba08b4ed7c271cb3c6627058945ed5",
|
||||
"sum": "qwMbUQkdPhAn9Sl4OVLgzmNOuOTnRLUmvv14I0unsa8="
|
||||
},
|
||||
{
|
||||
"name": "promgrafonnet",
|
||||
|
@ -118,8 +140,19 @@
|
|||
"subdir": "lib/promgrafonnet"
|
||||
}
|
||||
},
|
||||
"version": "23416282b86f75bff14bf94732c397764d7ba9db",
|
||||
"version": "16ff3841fea16a0f2151479ab67d8d34893759f3",
|
||||
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
|
||||
},
|
||||
{
|
||||
"name": "slo-libsonnet",
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/metalmatze/slo-libsonnet",
|
||||
"subdir": "slo-libsonnet"
|
||||
}
|
||||
},
|
||||
"version": "5ddd7ffc39e7a54c9aca997c2c389a8046fab0ff",
|
||||
"sum": "S7/+tnAkzVh8Li7sg7Hu4aeIQAWHCtxhRQ+k1OKjoQk="
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
||||
local vars = import 'vars.jsonnet';
|
||||
local utils = import 'utils.libsonnet';
|
||||
local vars = import 'vars.jsonnet';
|
||||
|
||||
{
|
||||
prometheus+:: {
|
||||
|
@ -9,143 +9,20 @@ local utils = import 'utils.libsonnet';
|
|||
|
||||
kubeSchedulerPrometheusDiscoveryEndpoints:
|
||||
utils.newEndpoint('kube-scheduler-prometheus-discovery', 'kube-system', vars.k3s.master_ip, 'http-metrics', 10251),
|
||||
|
||||
serviceMonitorKubelet+:
|
||||
{
|
||||
spec+: {
|
||||
endpoints: [
|
||||
{
|
||||
port: 'https-metrics',
|
||||
scheme: 'https',
|
||||
interval: '30s',
|
||||
honorLabels: true,
|
||||
tlsConfig: {
|
||||
insecureSkipVerify: true,
|
||||
},
|
||||
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
||||
},
|
||||
{
|
||||
port: 'https-metrics',
|
||||
scheme: 'https',
|
||||
path: '/metrics/cadvisor',
|
||||
interval: '30s',
|
||||
honorLabels: true,
|
||||
tlsConfig: {
|
||||
insecureSkipVerify: true,
|
||||
},
|
||||
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
||||
metricRelabelings: [
|
||||
// Drop a bunch of metrics which are disabled but still sent, see
|
||||
// https://github.com/google/cadvisor/issues/1925.
|
||||
{
|
||||
sourceLabels: ['__name__'],
|
||||
regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)',
|
||||
action: 'drop',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
nodeExporter+:: {
|
||||
daemonset+: {
|
||||
spec+: {
|
||||
template+: {
|
||||
spec+: {
|
||||
containers:
|
||||
std.filterMap(
|
||||
function(c) std.startsWith(c.name, 'kube-rbac') != true,
|
||||
function(c)
|
||||
if std.startsWith(c.name, 'node-exporter') then
|
||||
c {
|
||||
args: [
|
||||
'--web.listen-address=:' + $._config.nodeExporter.port,
|
||||
'--path.procfs=/host/proc',
|
||||
'--path.sysfs=/host/sys',
|
||||
'--path.rootfs=/host/root',
|
||||
// The following settings have been taken from
|
||||
// https://github.com/prometheus/node_exporter/blob/0662673/collector/filesystem_linux.go#L30-L31
|
||||
// Once node exporter is being released with those settings, this can be removed.
|
||||
'--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)',
|
||||
'--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$',
|
||||
],
|
||||
ports: [
|
||||
{
|
||||
containerPort: 9100,
|
||||
name: 'http'
|
||||
}],
|
||||
|
||||
}
|
||||
else
|
||||
c,
|
||||
super.containers,
|
||||
),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
service+:
|
||||
{
|
||||
spec+: {
|
||||
ports: [{
|
||||
name: 'http',
|
||||
port: 9100,
|
||||
targetPort: 'http'
|
||||
}]
|
||||
}
|
||||
},
|
||||
|
||||
serviceMonitor+:
|
||||
{
|
||||
spec+: {
|
||||
endpoints: [
|
||||
{
|
||||
port: 'http',
|
||||
scheme: 'http',
|
||||
interval: '30s',
|
||||
relabelings: [
|
||||
{
|
||||
action: 'replace',
|
||||
regex: '(.*)',
|
||||
replacment: '$1',
|
||||
sourceLabels: ['__meta_kubernetes_pod_node_name'],
|
||||
targetLabel: 'instance',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
|
||||
// Temporary workaround until merge of https://github.com/coreos/kube-prometheus/pull/456
|
||||
kubeStateMetrics+:: {
|
||||
deployment+: {
|
||||
spec+: {
|
||||
template+: {
|
||||
spec+: {
|
||||
containers:
|
||||
std.filterMap(
|
||||
function(c) std.startsWith(c.name, 'kube-rbac') != true,
|
||||
std.map(
|
||||
function(c)
|
||||
if std.startsWith(c.name, 'kube-state-metrics') then
|
||||
c {
|
||||
args: [
|
||||
'--port=8080',
|
||||
'--telemetry-port=8081',
|
||||
],
|
||||
ports: [
|
||||
{
|
||||
containerPort: 8080,
|
||||
name: 'http-main'
|
||||
},
|
||||
{
|
||||
containerPort: 8081,
|
||||
name: 'http-self'
|
||||
}],
|
||||
image: $._config.imageRepos.kubeStateMetrics + ':' + $._config.versions.kubeStateMetrics,
|
||||
}
|
||||
else
|
||||
c,
|
||||
|
@ -155,48 +32,5 @@ local utils = import 'utils.libsonnet';
|
|||
},
|
||||
},
|
||||
},
|
||||
|
||||
service+:
|
||||
{
|
||||
spec+: {
|
||||
ports: [{
|
||||
name: 'http-main',
|
||||
port: 8080,
|
||||
targetPort: 'http-main'
|
||||
},
|
||||
{
|
||||
name: 'http-self',
|
||||
port: 8081,
|
||||
targetPort: 'http-self'
|
||||
}]
|
||||
}
|
||||
},
|
||||
|
||||
serviceMonitor+:
|
||||
{
|
||||
spec+: {
|
||||
endpoints: [
|
||||
{
|
||||
port: 'http-main',
|
||||
scheme: 'http',
|
||||
interval: $._config.kubeStateMetrics.scrapeInterval,
|
||||
scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout,
|
||||
honorLabels: true,
|
||||
tlsConfig: {
|
||||
insecureSkipVerify: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
port: 'http-self',
|
||||
scheme: 'http',
|
||||
interval: '30s',
|
||||
tlsConfig: {
|
||||
insecureSkipVerify: true,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,16 +1,21 @@
|
|||
apiVersion: apiextensions.k8s.io/v1beta1
|
||||
kind: CustomResourceDefinition
|
||||
metadata:
|
||||
annotations:
|
||||
controller-gen.kubebuilder.io/version: v0.2.4
|
||||
creationTimestamp: null
|
||||
name: podmonitors.monitoring.coreos.com
|
||||
spec:
|
||||
group: monitoring.coreos.com
|
||||
names:
|
||||
kind: PodMonitor
|
||||
listKind: PodMonitorList
|
||||
plural: podmonitors
|
||||
singular: podmonitor
|
||||
scope: Namespaced
|
||||
validation:
|
||||
openAPIV3Schema:
|
||||
description: PodMonitor defines monitoring for a set of pods.
|
||||
properties:
|
||||
apiVersion:
|
||||
description: 'APIVersion defines the versioned schema of this representation
|
||||
|
@ -22,15 +27,18 @@ spec:
|
|||
object represents. Servers may infer this from the endpoint the client
|
||||
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
|
||||
type: string
|
||||
metadata:
|
||||
type: object
|
||||
spec:
|
||||
description: PodMonitorSpec contains specification parameters for a PodMonitor.
|
||||
description: Specification of desired Pod selection for target discovery
|
||||
by Prometheus.
|
||||
properties:
|
||||
jobLabel:
|
||||
description: The label to use to retrieve the job name from.
|
||||
type: string
|
||||
namespaceSelector:
|
||||
description: NamespaceSelector is a selector for selecting either all
|
||||
namespaces or a list of namespaces.
|
||||
description: Selector to select which namespaces the Endpoints objects
|
||||
are discovered from.
|
||||
properties:
|
||||
any:
|
||||
description: Boolean describing whether all namespaces are selected
|
||||
|
@ -78,7 +86,7 @@ spec:
|
|||
type: integer
|
||||
regex:
|
||||
description: Regular expression against which the extracted
|
||||
value is matched. defailt is '(.*)'
|
||||
value is matched. Default is '(.*)'
|
||||
type: string
|
||||
replacement:
|
||||
description: Replacement value against which a regex replace
|
||||
|
@ -105,6 +113,10 @@ spec:
|
|||
type: object
|
||||
type: array
|
||||
params:
|
||||
additionalProperties:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
description: Optional HTTP URL parameters
|
||||
type: object
|
||||
path:
|
||||
|
@ -138,7 +150,7 @@ spec:
|
|||
type: integer
|
||||
regex:
|
||||
description: Regular expression against which the extracted
|
||||
value is matched. defailt is '(.*)'
|
||||
value is matched. Default is '(.*)'
|
||||
type: string
|
||||
replacement:
|
||||
description: Replacement value against which a regex replace
|
||||
|
@ -172,8 +184,11 @@ spec:
|
|||
type: string
|
||||
targetPort:
|
||||
anyOf:
|
||||
- type: string
|
||||
- type: integer
|
||||
- type: string
|
||||
description: Name or number of the target port of the endpoint.
|
||||
Mutually exclusive with port.
|
||||
x-kubernetes-int-or-string: true
|
||||
type: object
|
||||
type: array
|
||||
podTargetLabels:
|
||||
|
@ -188,10 +203,7 @@ spec:
|
|||
format: int64
|
||||
type: integer
|
||||
selector:
|
||||
description: A label selector is a label query over a set of resources.
|
||||
The result of matchLabels and matchExpressions are ANDed. An empty
|
||||
label selector matches all objects. A null label selector matches
|
||||
no objects.
|
||||
description: Selector to select Pod objects.
|
||||
properties:
|
||||
matchExpressions:
|
||||
description: matchExpressions is a list of label selector requirements.
|
||||
|
@ -224,6 +236,8 @@ spec:
|
|||
type: object
|
||||
type: array
|
||||
matchLabels:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: matchLabels is a map of {key,value} pairs. A single
|
||||
{key,value} in the matchLabels map is equivalent to an element
|
||||
of matchExpressions, whose key field is "key", the operator is
|
||||
|
@ -235,5 +249,17 @@ spec:
|
|||
- podMetricsEndpoints
|
||||
- selector
|
||||
type: object
|
||||
required:
|
||||
- spec
|
||||
type: object
|
||||
version: v1
|
||||
versions:
|
||||
- name: v1
|
||||
served: true
|
||||
storage: true
|
||||
status:
|
||||
acceptedNames:
|
||||
kind: ""
|
||||
plural: ""
|
||||
conditions: []
|
||||
storedVersions: []
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,16 +1,21 @@
|
|||
apiVersion: apiextensions.k8s.io/v1beta1
|
||||
kind: CustomResourceDefinition
|
||||
metadata:
|
||||
annotations:
|
||||
controller-gen.kubebuilder.io/version: v0.2.4
|
||||
creationTimestamp: null
|
||||
name: prometheusrules.monitoring.coreos.com
|
||||
spec:
|
||||
group: monitoring.coreos.com
|
||||
names:
|
||||
kind: PrometheusRule
|
||||
listKind: PrometheusRuleList
|
||||
plural: prometheusrules
|
||||
singular: prometheusrule
|
||||
scope: Namespaced
|
||||
validation:
|
||||
openAPIV3Schema:
|
||||
description: PrometheusRule defines alerting rules for a Prometheus instance
|
||||
properties:
|
||||
apiVersion:
|
||||
description: 'APIVersion defines the versioned schema of this representation
|
||||
|
@ -23,201 +28,24 @@ spec:
|
|||
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
|
||||
type: string
|
||||
metadata:
|
||||
description: ObjectMeta is metadata that all persisted resources must have,
|
||||
which includes all objects users must create.
|
||||
properties:
|
||||
annotations:
|
||||
description: 'Annotations is an unstructured key value map stored with
|
||||
a resource that may be set by external tools to store and retrieve
|
||||
arbitrary metadata. They are not queryable and should be preserved
|
||||
when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations'
|
||||
type: object
|
||||
clusterName:
|
||||
description: The name of the cluster which the object belongs to. This
|
||||
is used to distinguish resources with same name and namespace in different
|
||||
clusters. This field is not set anywhere right now and apiserver is
|
||||
going to ignore it if set in create or update request.
|
||||
type: string
|
||||
creationTimestamp:
|
||||
description: Time is a wrapper around time.Time which supports correct
|
||||
marshaling to YAML and JSON. Wrappers are provided for many of the
|
||||
factory methods that the time package offers.
|
||||
format: date-time
|
||||
type: string
|
||||
deletionGracePeriodSeconds:
|
||||
description: Number of seconds allowed for this object to gracefully
|
||||
terminate before it will be removed from the system. Only set when
|
||||
deletionTimestamp is also set. May only be shortened. Read-only.
|
||||
format: int64
|
||||
type: integer
|
||||
deletionTimestamp:
|
||||
description: Time is a wrapper around time.Time which supports correct
|
||||
marshaling to YAML and JSON. Wrappers are provided for many of the
|
||||
factory methods that the time package offers.
|
||||
format: date-time
|
||||
type: string
|
||||
finalizers:
|
||||
description: Must be empty before the object is deleted from the registry.
|
||||
Each entry is an identifier for the responsible component that will
|
||||
remove the entry from the list. If the deletionTimestamp of the object
|
||||
is non-nil, entries in this list can only be removed.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
generateName:
|
||||
description: |-
|
||||
GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.
|
||||
|
||||
If this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).
|
||||
|
||||
Applied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#idempotency
|
||||
type: string
|
||||
generation:
|
||||
description: A sequence number representing a specific generation of
|
||||
the desired state. Populated by the system. Read-only.
|
||||
format: int64
|
||||
type: integer
|
||||
labels:
|
||||
description: 'Map of string keys and values that can be used to organize
|
||||
and categorize (scope and select) objects. May match selectors of
|
||||
replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels'
|
||||
type: object
|
||||
managedFields:
|
||||
description: ManagedFields maps workflow-id and version to the set of
|
||||
fields that are managed by that workflow. This is mostly for internal
|
||||
housekeeping, and users typically shouldn't need to set or understand
|
||||
this field. A workflow can be the user's name, a controller's name,
|
||||
or the name of a specific apply path like "ci-cd". The set of fields
|
||||
is always in the version that the workflow used when modifying the
|
||||
object.
|
||||
items:
|
||||
description: ManagedFieldsEntry is a workflow-id, a FieldSet and the
|
||||
group version of the resource that the fieldset applies to.
|
||||
properties:
|
||||
apiVersion:
|
||||
description: APIVersion defines the version of this resource that
|
||||
this field set applies to. The format is "group/version" just
|
||||
like the top-level APIVersion field. It is necessary to track
|
||||
the version of a field set because it cannot be automatically
|
||||
converted.
|
||||
type: string
|
||||
fieldsType:
|
||||
description: 'FieldsType is the discriminator for the different
|
||||
fields format and version. There is currently only one possible
|
||||
value: "FieldsV1"'
|
||||
type: string
|
||||
fieldsV1:
|
||||
description: |-
|
||||
FieldsV1 stores a set of fields in a data structure like a Trie, in JSON format.
|
||||
|
||||
Each key is either a '.' representing the field itself, and will always map to an empty set, or a string representing a sub-field or item. The string will follow one of these four formats: 'f:<name>', where <name> is the name of a field in a struct, or key in a map 'v:<value>', where <value> is the exact json formatted value of a list item 'i:<index>', where <index> is position of a item in a list 'k:<keys>', where <keys> is a map of a list item's key fields to their unique values If a key maps to an empty Fields value, the field that key represents is part of the set.
|
||||
|
||||
The exact format is defined in sigs.k8s.io/structured-merge-diff
|
||||
type: object
|
||||
manager:
|
||||
description: Manager is an identifier of the workflow managing
|
||||
these fields.
|
||||
type: string
|
||||
operation:
|
||||
description: Operation is the type of operation which lead to
|
||||
this ManagedFieldsEntry being created. The only valid values
|
||||
for this field are 'Apply' and 'Update'.
|
||||
type: string
|
||||
time:
|
||||
description: Time is a wrapper around time.Time which supports
|
||||
correct marshaling to YAML and JSON. Wrappers are provided
|
||||
for many of the factory methods that the time package offers.
|
||||
format: date-time
|
||||
type: string
|
||||
type: object
|
||||
type: array
|
||||
name:
|
||||
description: 'Name must be unique within a namespace. Is required when
|
||||
creating resources, although some resources may allow a client to
|
||||
request the generation of an appropriate name automatically. Name
|
||||
is primarily intended for creation idempotence and configuration definition.
|
||||
Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names'
|
||||
type: string
|
||||
namespace:
|
||||
description: |-
|
||||
Namespace defines the space within each name must be unique. An empty namespace is equivalent to the "default" namespace, but "default" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.
|
||||
|
||||
Must be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces
|
||||
type: string
|
||||
ownerReferences:
|
||||
description: List of objects depended by this object. If ALL objects
|
||||
in the list have been deleted, this object will be garbage collected.
|
||||
If this object is managed by a controller, then an entry in this list
|
||||
will point to this controller, with the controller field set to true.
|
||||
There cannot be more than one managing controller.
|
||||
items:
|
||||
description: OwnerReference contains enough information to let you
|
||||
identify an owning object. An owning object must be in the same
|
||||
namespace as the dependent, or be cluster-scoped, so there is no
|
||||
namespace field.
|
||||
properties:
|
||||
apiVersion:
|
||||
description: API version of the referent.
|
||||
type: string
|
||||
blockOwnerDeletion:
|
||||
description: If true, AND if the owner has the "foregroundDeletion"
|
||||
finalizer, then the owner cannot be deleted from the key-value
|
||||
store until this reference is removed. Defaults to false. To
|
||||
set this field, a user needs "delete" permission of the owner,
|
||||
otherwise 422 (Unprocessable Entity) will be returned.
|
||||
type: boolean
|
||||
controller:
|
||||
description: If true, this reference points to the managing controller.
|
||||
type: boolean
|
||||
kind:
|
||||
description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
|
||||
type: string
|
||||
name:
|
||||
description: 'Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names'
|
||||
type: string
|
||||
uid:
|
||||
description: 'UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids'
|
||||
type: string
|
||||
required:
|
||||
- apiVersion
|
||||
- kind
|
||||
- name
|
||||
- uid
|
||||
type: object
|
||||
type: array
|
||||
resourceVersion:
|
||||
description: |-
|
||||
An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.
|
||||
|
||||
Populated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency
|
||||
type: string
|
||||
selfLink:
|
||||
description: |-
|
||||
SelfLink is a URL representing this object. Populated by the system. Read-only.
|
||||
|
||||
DEPRECATED Kubernetes will stop propagating this field in 1.20 release and the field is planned to be removed in 1.21 release.
|
||||
type: string
|
||||
uid:
|
||||
description: |-
|
||||
UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.
|
||||
|
||||
Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids
|
||||
type: string
|
||||
type: object
|
||||
spec:
|
||||
description: PrometheusRuleSpec contains specification parameters for a
|
||||
Rule.
|
||||
description: Specification of desired alerting rule definitions for Prometheus.
|
||||
properties:
|
||||
groups:
|
||||
description: Content of Prometheus rule file
|
||||
items:
|
||||
description: RuleGroup is a list of sequentially evaluated recording
|
||||
and alerting rules.
|
||||
description: 'RuleGroup is a list of sequentially evaluated recording
|
||||
and alerting rules. Note: PartialResponseStrategy is only used by
|
||||
ThanosRuler and will be ignored by Prometheus instances. Valid
|
||||
values for this field are ''warn'' or ''abort''. More info: https://github.com/thanos-io/thanos/blob/master/docs/components/rule.md#partial-response'
|
||||
properties:
|
||||
interval:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
partial_response_strategy:
|
||||
type: string
|
||||
rules:
|
||||
items:
|
||||
description: Rule describes an alerting or recording rule.
|
||||
|
@ -225,14 +53,19 @@ spec:
|
|||
alert:
|
||||
type: string
|
||||
annotations:
|
||||
additionalProperties:
|
||||
type: string
|
||||
type: object
|
||||
expr:
|
||||
anyOf:
|
||||
- type: string
|
||||
- type: integer
|
||||
- type: string
|
||||
x-kubernetes-int-or-string: true
|
||||
for:
|
||||
type: string
|
||||
labels:
|
||||
additionalProperties:
|
||||
type: string
|
||||
type: object
|
||||
record:
|
||||
type: string
|
||||
|
@ -246,5 +79,17 @@ spec:
|
|||
type: object
|
||||
type: array
|
||||
type: object
|
||||
required:
|
||||
- spec
|
||||
type: object
|
||||
version: v1
|
||||
versions:
|
||||
- name: v1
|
||||
served: true
|
||||
storage: true
|
||||
status:
|
||||
acceptedNames:
|
||||
kind: ""
|
||||
plural: ""
|
||||
conditions: []
|
||||
storedVersions: []
|
||||
|
|
|
@ -1,16 +1,21 @@
|
|||
apiVersion: apiextensions.k8s.io/v1beta1
|
||||
kind: CustomResourceDefinition
|
||||
metadata:
|
||||
annotations:
|
||||
controller-gen.kubebuilder.io/version: v0.2.4
|
||||
creationTimestamp: null
|
||||
name: servicemonitors.monitoring.coreos.com
|
||||
spec:
|
||||
group: monitoring.coreos.com
|
||||
names:
|
||||
kind: ServiceMonitor
|
||||
listKind: ServiceMonitorList
|
||||
plural: servicemonitors
|
||||
singular: servicemonitor
|
||||
scope: Namespaced
|
||||
validation:
|
||||
openAPIV3Schema:
|
||||
description: ServiceMonitor defines monitoring for a set of services.
|
||||
properties:
|
||||
apiVersion:
|
||||
description: 'APIVersion defines the versioned schema of this representation
|
||||
|
@ -22,9 +27,11 @@ spec:
|
|||
object represents. Servers may infer this from the endpoint the client
|
||||
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
|
||||
type: string
|
||||
metadata:
|
||||
type: object
|
||||
spec:
|
||||
description: ServiceMonitorSpec contains specification parameters for a
|
||||
ServiceMonitor.
|
||||
description: Specification of desired Service selection for target discovery
|
||||
by Prometheus.
|
||||
properties:
|
||||
endpoints:
|
||||
description: A list of endpoints allowed as part of this ServiceMonitor.
|
||||
|
@ -37,14 +44,16 @@ spec:
|
|||
basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints'
|
||||
properties:
|
||||
password:
|
||||
description: SecretKeySelector selects a key of a Secret.
|
||||
description: The secret in the service monitor namespace that
|
||||
contains the password for authentication.
|
||||
properties:
|
||||
key:
|
||||
description: The key of the secret to select from. Must
|
||||
be a valid secret key.
|
||||
type: string
|
||||
name:
|
||||
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
|
||||
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
|
||||
TODO: Add other useful fields. apiVersion, kind, uid?'
|
||||
type: string
|
||||
optional:
|
||||
description: Specify whether the Secret or its key must
|
||||
|
@ -54,14 +63,16 @@ spec:
|
|||
- key
|
||||
type: object
|
||||
username:
|
||||
description: SecretKeySelector selects a key of a Secret.
|
||||
description: The secret in the service monitor namespace that
|
||||
contains the username for authentication.
|
||||
properties:
|
||||
key:
|
||||
description: The key of the secret to select from. Must
|
||||
be a valid secret key.
|
||||
type: string
|
||||
name:
|
||||
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
|
||||
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
|
||||
TODO: Add other useful fields. apiVersion, kind, uid?'
|
||||
type: string
|
||||
optional:
|
||||
description: Specify whether the Secret or its key must
|
||||
|
@ -75,14 +86,17 @@ spec:
|
|||
description: File to read bearer token for scraping targets.
|
||||
type: string
|
||||
bearerTokenSecret:
|
||||
description: SecretKeySelector selects a key of a Secret.
|
||||
description: Secret to mount to read bearer token for scraping
|
||||
targets. The secret needs to be in the same namespace as the
|
||||
service monitor and accessible by the Prometheus Operator.
|
||||
properties:
|
||||
key:
|
||||
description: The key of the secret to select from. Must be
|
||||
a valid secret key.
|
||||
type: string
|
||||
name:
|
||||
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
|
||||
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
|
||||
TODO: Add other useful fields. apiVersion, kind, uid?'
|
||||
type: string
|
||||
optional:
|
||||
description: Specify whether the Secret or its key must be
|
||||
|
@ -121,7 +135,7 @@ spec:
|
|||
type: integer
|
||||
regex:
|
||||
description: Regular expression against which the extracted
|
||||
value is matched. defailt is '(.*)'
|
||||
value is matched. Default is '(.*)'
|
||||
type: string
|
||||
replacement:
|
||||
description: Replacement value against which a regex replace
|
||||
|
@ -148,6 +162,10 @@ spec:
|
|||
type: object
|
||||
type: array
|
||||
params:
|
||||
additionalProperties:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
description: Optional HTTP URL parameters
|
||||
type: object
|
||||
path:
|
||||
|
@ -181,7 +199,7 @@ spec:
|
|||
type: integer
|
||||
regex:
|
||||
description: Regular expression against which the extracted
|
||||
value is matched. defailt is '(.*)'
|
||||
value is matched. Default is '(.*)'
|
||||
type: string
|
||||
replacement:
|
||||
description: Replacement value against which a regex replace
|
||||
|
@ -215,17 +233,103 @@ spec:
|
|||
type: string
|
||||
targetPort:
|
||||
anyOf:
|
||||
- type: string
|
||||
- type: integer
|
||||
- type: string
|
||||
description: Name or number of the target port of the endpoint.
|
||||
Mutually exclusive with port.
|
||||
x-kubernetes-int-or-string: true
|
||||
tlsConfig:
|
||||
description: TLSConfig specifies TLS configuration parameters.
|
||||
description: TLS configuration to use when scraping the endpoint
|
||||
properties:
|
||||
ca: {}
|
||||
ca:
|
||||
description: Stuct containing the CA cert to use for the targets.
|
||||
properties:
|
||||
configMap:
|
||||
description: ConfigMap containing data to use for the
|
||||
targets.
|
||||
properties:
|
||||
key:
|
||||
description: The key to select.
|
||||
type: string
|
||||
name:
|
||||
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
|
||||
TODO: Add other useful fields. apiVersion, kind,
|
||||
uid?'
|
||||
type: string
|
||||
optional:
|
||||
description: Specify whether the ConfigMap or its
|
||||
key must be defined
|
||||
type: boolean
|
||||
required:
|
||||
- key
|
||||
type: object
|
||||
secret:
|
||||
description: Secret containing data to use for the targets.
|
||||
properties:
|
||||
key:
|
||||
description: The key of the secret to select from. Must
|
||||
be a valid secret key.
|
||||
type: string
|
||||
name:
|
||||
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
|
||||
TODO: Add other useful fields. apiVersion, kind,
|
||||
uid?'
|
||||
type: string
|
||||
optional:
|
||||
description: Specify whether the Secret or its key
|
||||
must be defined
|
||||
type: boolean
|
||||
required:
|
||||
- key
|
||||
type: object
|
||||
type: object
|
||||
caFile:
|
||||
description: Path to the CA cert in the Prometheus container
|
||||
to use for the targets.
|
||||
type: string
|
||||
cert: {}
|
||||
cert:
|
||||
description: Struct containing the client cert file for the
|
||||
targets.
|
||||
properties:
|
||||
configMap:
|
||||
description: ConfigMap containing data to use for the
|
||||
targets.
|
||||
properties:
|
||||
key:
|
||||
description: The key to select.
|
||||
type: string
|
||||
name:
|
||||
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
|
||||
TODO: Add other useful fields. apiVersion, kind,
|
||||
uid?'
|
||||
type: string
|
||||
optional:
|
||||
description: Specify whether the ConfigMap or its
|
||||
key must be defined
|
||||
type: boolean
|
||||
required:
|
||||
- key
|
||||
type: object
|
||||
secret:
|
||||
description: Secret containing data to use for the targets.
|
||||
properties:
|
||||
key:
|
||||
description: The key of the secret to select from. Must
|
||||
be a valid secret key.
|
||||
type: string
|
||||
name:
|
||||
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
|
||||
TODO: Add other useful fields. apiVersion, kind,
|
||||
uid?'
|
||||
type: string
|
||||
optional:
|
||||
description: Specify whether the Secret or its key
|
||||
must be defined
|
||||
type: boolean
|
||||
required:
|
||||
- key
|
||||
type: object
|
||||
type: object
|
||||
certFile:
|
||||
description: Path to the client cert file in the Prometheus
|
||||
container for the targets.
|
||||
|
@ -238,14 +342,16 @@ spec:
|
|||
container for the targets.
|
||||
type: string
|
||||
keySecret:
|
||||
description: SecretKeySelector selects a key of a Secret.
|
||||
description: Secret containing the client key file for the
|
||||
targets.
|
||||
properties:
|
||||
key:
|
||||
description: The key of the secret to select from. Must
|
||||
be a valid secret key.
|
||||
type: string
|
||||
name:
|
||||
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
|
||||
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
|
||||
TODO: Add other useful fields. apiVersion, kind, uid?'
|
||||
type: string
|
||||
optional:
|
||||
description: Specify whether the Secret or its key must
|
||||
|
@ -264,8 +370,8 @@ spec:
|
|||
description: The label to use to retrieve the job name from.
|
||||
type: string
|
||||
namespaceSelector:
|
||||
description: NamespaceSelector is a selector for selecting either all
|
||||
namespaces or a list of namespaces.
|
||||
description: Selector to select which namespaces the Endpoints objects
|
||||
are discovered from.
|
||||
properties:
|
||||
any:
|
||||
description: Boolean describing whether all namespaces are selected
|
||||
|
@ -289,10 +395,7 @@ spec:
|
|||
format: int64
|
||||
type: integer
|
||||
selector:
|
||||
description: A label selector is a label query over a set of resources.
|
||||
The result of matchLabels and matchExpressions are ANDed. An empty
|
||||
label selector matches all objects. A null label selector matches
|
||||
no objects.
|
||||
description: Selector to select Endpoints objects.
|
||||
properties:
|
||||
matchExpressions:
|
||||
description: matchExpressions is a list of label selector requirements.
|
||||
|
@ -325,6 +428,8 @@ spec:
|
|||
type: object
|
||||
type: array
|
||||
matchLabels:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: matchLabels is a map of {key,value} pairs. A single
|
||||
{key,value} in the matchLabels map is equivalent to an element
|
||||
of matchExpressions, whose key field is "key", the operator is
|
||||
|
@ -342,5 +447,17 @@ spec:
|
|||
- endpoints
|
||||
- selector
|
||||
type: object
|
||||
required:
|
||||
- spec
|
||||
type: object
|
||||
version: v1
|
||||
versions:
|
||||
- name: v1
|
||||
served: true
|
||||
storage: true
|
||||
status:
|
||||
acceptedNames:
|
||||
kind: ""
|
||||
plural: ""
|
||||
conditions: []
|
||||
storedVersions: []
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -12,14 +12,30 @@ rules:
|
|||
resources:
|
||||
- customresourcedefinitions
|
||||
verbs:
|
||||
- '*'
|
||||
- create
|
||||
- apiGroups:
|
||||
- apiextensions.k8s.io
|
||||
resourceNames:
|
||||
- alertmanagers.monitoring.coreos.com
|
||||
- podmonitors.monitoring.coreos.com
|
||||
- prometheuses.monitoring.coreos.com
|
||||
- prometheusrules.monitoring.coreos.com
|
||||
- servicemonitors.monitoring.coreos.com
|
||||
- thanosrulers.monitoring.coreos.com
|
||||
resources:
|
||||
- customresourcedefinitions
|
||||
verbs:
|
||||
- get
|
||||
- update
|
||||
- apiGroups:
|
||||
- monitoring.coreos.com
|
||||
resources:
|
||||
- alertmanagers
|
||||
- alertmanagers/finalizers
|
||||
- prometheuses
|
||||
- prometheuses/finalizers
|
||||
- alertmanagers/finalizers
|
||||
- thanosrulers
|
||||
- thanosrulers/finalizers
|
||||
- servicemonitors
|
||||
- podmonitors
|
||||
- prometheusrules
|
||||
|
|
|
@ -20,7 +20,7 @@ spec:
|
|||
- monitoring
|
||||
topologyKey: kubernetes.io/hostname
|
||||
weight: 100
|
||||
baseImage: prom/alertmanager
|
||||
image: prom/alertmanager:v0.20.0
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
replicas: 1
|
||||
|
@ -29,4 +29,4 @@ spec:
|
|||
runAsNonRoot: true
|
||||
runAsUser: 1000
|
||||
serviceAccountName: alertmanager-main
|
||||
version: v0.18.0
|
||||
version: v0.20.0
|
||||
|
|
|
@ -1,8 +1,44 @@
|
|||
apiVersion: v1
|
||||
data:
|
||||
alertmanager.yaml: Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAibnVsbCIKInJvdXRlIjoKICAiZ3JvdXBfYnkiOgogIC0gImpvYiIKICAiZ3JvdXBfaW50ZXJ2YWwiOiAiNW0iCiAgImdyb3VwX3dhaXQiOiAiMzBzIgogICJyZWNlaXZlciI6ICJudWxsIgogICJyZXBlYXRfaW50ZXJ2YWwiOiAiMTJoIgogICJyb3V0ZXMiOgogIC0gIm1hdGNoIjoKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg==
|
||||
data: {}
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: alertmanager-main
|
||||
namespace: monitoring
|
||||
stringData:
|
||||
alertmanager.yaml: |-
|
||||
"global":
|
||||
"resolve_timeout": "5m"
|
||||
"inhibit_rules":
|
||||
- "equal":
|
||||
- "namespace"
|
||||
- "alertname"
|
||||
"source_match":
|
||||
"severity": "critical"
|
||||
"target_match_re":
|
||||
"severity": "warning|info"
|
||||
- "equal":
|
||||
- "namespace"
|
||||
- "alertname"
|
||||
"source_match":
|
||||
"severity": "warning"
|
||||
"target_match_re":
|
||||
"severity": "info"
|
||||
"receivers":
|
||||
- "name": "Default"
|
||||
- "name": "Watchdog"
|
||||
- "name": "Critical"
|
||||
"route":
|
||||
"group_by":
|
||||
- "namespace"
|
||||
"group_interval": "5m"
|
||||
"group_wait": "30s"
|
||||
"receiver": "Default"
|
||||
"repeat_interval": "12h"
|
||||
"routes":
|
||||
- "match":
|
||||
"alertname": "Watchdog"
|
||||
"receiver": "Watchdog"
|
||||
- "match":
|
||||
"severity": "critical"
|
||||
"receiver": "Critical"
|
||||
type: Opaque
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -5,7 +5,7 @@ data:
|
|||
"apiVersion": 1,
|
||||
"providers": [
|
||||
{
|
||||
"folder": "",
|
||||
"folder": "Default",
|
||||
"name": "0",
|
||||
"options": {
|
||||
"path": "/grafana-dashboard-definitions/0"
|
||||
|
|
|
@ -16,7 +16,8 @@ spec:
|
|||
app: grafana
|
||||
spec:
|
||||
containers:
|
||||
- image: grafana/grafana:6.3.2
|
||||
- env: []
|
||||
image: grafana/grafana:6.6.2
|
||||
name: grafana
|
||||
ports:
|
||||
- containerPort: 3000
|
||||
|
@ -99,9 +100,6 @@ spec:
|
|||
- mountPath: /grafana-dashboard-definitions/0/pod-total
|
||||
name: grafana-dashboard-pod-total
|
||||
readOnly: false
|
||||
- mountPath: /grafana-dashboard-definitions/0/pods
|
||||
name: grafana-dashboard-pods
|
||||
readOnly: false
|
||||
- mountPath: /grafana-dashboard-definitions/0/prometheus-dashboard
|
||||
name: grafana-dashboard-prometheus-dashboard
|
||||
readOnly: false
|
||||
|
@ -198,9 +196,6 @@ spec:
|
|||
- configMap:
|
||||
name: grafana-dashboard-pod-total
|
||||
name: grafana-dashboard-pod-total
|
||||
- configMap:
|
||||
name: grafana-dashboard-pods
|
||||
name: grafana-dashboard-pods
|
||||
- configMap:
|
||||
name: grafana-dashboard-prometheus-dashboard
|
||||
name: grafana-dashboard-prometheus-dashboard
|
||||
|
|
|
@ -5,7 +5,7 @@ metadata:
|
|||
namespace: monitoring
|
||||
spec:
|
||||
rules:
|
||||
- host: alertmanager.192.168.99.100.nip.io
|
||||
- host: alertmanager.192.168.15.15.nip.io
|
||||
http:
|
||||
paths:
|
||||
- backend:
|
||||
|
@ -14,4 +14,4 @@ spec:
|
|||
path: /
|
||||
tls:
|
||||
- hosts:
|
||||
- alertmanager.192.168.99.100.nip.io
|
||||
- alertmanager.192.168.15.15.nip.io
|
||||
|
|
|
@ -5,7 +5,7 @@ metadata:
|
|||
namespace: monitoring
|
||||
spec:
|
||||
rules:
|
||||
- host: grafana.192.168.99.100.nip.io
|
||||
- host: grafana.192.168.15.15.nip.io
|
||||
http:
|
||||
paths:
|
||||
- backend:
|
||||
|
@ -14,4 +14,4 @@ spec:
|
|||
path: /
|
||||
tls:
|
||||
- hosts:
|
||||
- grafana.192.168.99.100.nip.io
|
||||
- grafana.192.168.15.15.nip.io
|
||||
|
|
|
@ -5,7 +5,7 @@ metadata:
|
|||
namespace: monitoring
|
||||
spec:
|
||||
rules:
|
||||
- host: prometheus.192.168.99.100.nip.io
|
||||
- host: prometheus.192.168.15.15.nip.io
|
||||
http:
|
||||
paths:
|
||||
- backend:
|
||||
|
@ -14,4 +14,4 @@ spec:
|
|||
path: /
|
||||
tls:
|
||||
- hosts:
|
||||
- prometheus.192.168.99.100.nip.io
|
||||
- prometheus.192.168.15.15.nip.io
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/version: 1.9.5
|
||||
name: kube-state-metrics
|
||||
rules:
|
||||
- apiGroups:
|
||||
|
@ -86,6 +89,29 @@ rules:
|
|||
- storage.k8s.io
|
||||
resources:
|
||||
- storageclasses
|
||||
- volumeattachments
|
||||
verbs:
|
||||
- list
|
||||
- watch
|
||||
- apiGroups:
|
||||
- admissionregistration.k8s.io
|
||||
resources:
|
||||
- mutatingwebhookconfigurations
|
||||
- validatingwebhookconfigurations
|
||||
verbs:
|
||||
- list
|
||||
- watch
|
||||
- apiGroups:
|
||||
- networking.k8s.io
|
||||
resources:
|
||||
- networkpolicies
|
||||
verbs:
|
||||
- list
|
||||
- watch
|
||||
- apiGroups:
|
||||
- coordination.k8s.io
|
||||
resources:
|
||||
- leases
|
||||
verbs:
|
||||
- list
|
||||
- watch
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/version: 1.9.5
|
||||
name: kube-state-metrics
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
|
|
|
@ -2,20 +2,31 @@ apiVersion: apps/v1
|
|||
kind: Deployment
|
||||
metadata:
|
||||
labels:
|
||||
app: kube-state-metrics
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/version: 1.9.5
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: kube-state-metrics
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: kube-state-metrics
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/version: 1.9.5
|
||||
spec:
|
||||
containers:
|
||||
- args:
|
||||
- --host=127.0.0.1
|
||||
- --port=8081
|
||||
- --telemetry-host=127.0.0.1
|
||||
- --telemetry-port=8082
|
||||
image: quay.io/coreos/kube-state-metrics:v1.9.5
|
||||
name: kube-state-metrics
|
||||
securityContext:
|
||||
runAsUser: 65534
|
||||
- args:
|
||||
- --logtostderr
|
||||
- --secure-listen-address=:8443
|
||||
|
@ -26,13 +37,6 @@ spec:
|
|||
ports:
|
||||
- containerPort: 8443
|
||||
name: https-main
|
||||
resources:
|
||||
limits:
|
||||
cpu: 20m
|
||||
memory: 40Mi
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 20Mi
|
||||
- args:
|
||||
- --logtostderr
|
||||
- --secure-listen-address=:9443
|
||||
|
@ -43,30 +47,6 @@ spec:
|
|||
ports:
|
||||
- containerPort: 9443
|
||||
name: https-self
|
||||
resources:
|
||||
limits:
|
||||
cpu: 20m
|
||||
memory: 40Mi
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 20Mi
|
||||
- args:
|
||||
- --host=127.0.0.1
|
||||
- --port=8081
|
||||
- --telemetry-host=127.0.0.1
|
||||
- --telemetry-port=8082
|
||||
image: carlosedp/kube-state-metrics:v1.7.2
|
||||
name: kube-state-metrics
|
||||
resources:
|
||||
limits:
|
||||
cpu: 100m
|
||||
memory: 150Mi
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 150Mi
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65534
|
||||
serviceAccountName: kube-state-metrics
|
||||
|
|
|
@ -1,30 +0,0 @@
|
|||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
rules:
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- pods
|
||||
verbs:
|
||||
- get
|
||||
- apiGroups:
|
||||
- extensions
|
||||
resourceNames:
|
||||
- kube-state-metrics
|
||||
resources:
|
||||
- deployments
|
||||
verbs:
|
||||
- get
|
||||
- update
|
||||
- apiGroups:
|
||||
- apps
|
||||
resourceNames:
|
||||
- kube-state-metrics
|
||||
resources:
|
||||
- deployments
|
||||
verbs:
|
||||
- get
|
||||
- update
|
|
@ -1,12 +0,0 @@
|
|||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: kube-state-metrics
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: kube-state-metrics
|
|
@ -2,7 +2,8 @@ apiVersion: v1
|
|||
kind: Service
|
||||
metadata:
|
||||
labels:
|
||||
k8s-app: kube-state-metrics
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/version: 1.9.5
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
spec:
|
||||
|
@ -15,4 +16,4 @@ spec:
|
|||
port: 9443
|
||||
targetPort: https-self
|
||||
selector:
|
||||
app: kube-state-metrics
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/version: 1.9.5
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
|
|
|
@ -2,7 +2,8 @@ apiVersion: monitoring.coreos.com/v1
|
|||
kind: ServiceMonitor
|
||||
metadata:
|
||||
labels:
|
||||
k8s-app: kube-state-metrics
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/version: 1.9.5
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
spec:
|
||||
|
@ -24,7 +25,7 @@ spec:
|
|||
scheme: https
|
||||
tlsConfig:
|
||||
insecureSkipVerify: true
|
||||
jobLabel: k8s-app
|
||||
jobLabel: app.kubernetes.io/name
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: kube-state-metrics
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
|
|
|
@ -20,6 +20,8 @@ spec:
|
|||
- --path.procfs=/host/proc
|
||||
- --path.sysfs=/host/sys
|
||||
- --path.rootfs=/host/root
|
||||
- --no-collector.wifi
|
||||
- --no-collector.hwmon
|
||||
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
|
||||
- --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
|
||||
image: prom/node-exporter:v0.18.1
|
||||
|
@ -44,7 +46,7 @@ spec:
|
|||
readOnly: true
|
||||
- args:
|
||||
- --logtostderr
|
||||
- --secure-listen-address=$(IP):9100
|
||||
- --secure-listen-address=[$(IP)]:9100
|
||||
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
|
||||
- --upstream=http://127.0.0.1:9100/
|
||||
env:
|
||||
|
|
|
@ -8,7 +8,7 @@ metadata:
|
|||
spec:
|
||||
endpoints:
|
||||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
interval: 30s
|
||||
interval: 15s
|
||||
port: https
|
||||
relabelings:
|
||||
- action: replace
|
||||
|
|
|
@ -11,6 +11,7 @@ rules:
|
|||
- metrics.k8s.io
|
||||
resources:
|
||||
- pods
|
||||
- nodes
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
|
|
|
@ -3,8 +3,8 @@ data:
|
|||
config.yaml: |
|
||||
resourceRules:
|
||||
cpu:
|
||||
containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>)
|
||||
nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
|
||||
containerQuery: sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>)
|
||||
nodeQuery: sum(1 - irate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
|
||||
resources:
|
||||
overrides:
|
||||
node:
|
||||
|
|
|
@ -25,10 +25,11 @@ spec:
|
|||
- name: alertmanager-main
|
||||
namespace: monitoring
|
||||
port: web
|
||||
baseImage: prom/prometheus
|
||||
externalUrl: http://prometheus.192.168.99.100.nip.io
|
||||
externalUrl: http://prometheus.192.168.15.15.nip.io
|
||||
image: prom/prometheus:v2.16.0
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
podMonitorNamespaceSelector: {}
|
||||
podMonitorSelector: {}
|
||||
replicas: 1
|
||||
resources:
|
||||
|
@ -46,4 +47,4 @@ spec:
|
|||
serviceAccountName: prometheus-k8s
|
||||
serviceMonitorNamespaceSelector: {}
|
||||
serviceMonitorSelector: {}
|
||||
version: v2.11.1
|
||||
version: v2.16.0
|
||||
|
|
|
@ -40,10 +40,10 @@ spec:
|
|||
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
|
||||
record: instance:node_vmstat_pgmajfault:rate1m
|
||||
- expr: |
|
||||
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
||||
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
|
||||
record: instance_device:node_disk_io_time_seconds:rate1m
|
||||
- expr: |
|
||||
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
||||
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
|
||||
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
|
||||
- expr: |
|
||||
sum without (device) (
|
||||
|
@ -65,96 +65,237 @@ spec:
|
|||
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
|
||||
)
|
||||
record: instance:node_network_transmit_drop_excluding_lo:rate1m
|
||||
- name: kube-apiserver-error
|
||||
rules:
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[5m]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate5m
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[30m]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate30m
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[1h]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate1h
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[2h]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate2h
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[6h]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate6h
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[1d]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate1d
|
||||
- expr: |
|
||||
sum by (status_class) (
|
||||
label_replace(
|
||||
rate(apiserver_request_total{job="apiserver"}[3d]
|
||||
), "status_class", "${1}xx", "code", "([0-9])..")
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class:apiserver_request_total:rate3d
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate5m{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate5m
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate30m{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate30m
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate1h{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate1h
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate2h{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate2h
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate6h{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate6h
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate1d{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate1d
|
||||
- expr: |
|
||||
sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"})
|
||||
/
|
||||
sum(status_class:apiserver_request_total:rate3d{job="apiserver"})
|
||||
labels:
|
||||
job: apiserver
|
||||
record: status_class_5xx:apiserver_request_total:ratio_rate3d
|
||||
- name: kube-apiserver.rules
|
||||
rules:
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
|
||||
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||
/
|
||||
sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
|
||||
record: cluster:apiserver_request_duration_seconds:mean5m
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
|
||||
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
|
||||
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
|
||||
- name: k8s.rules
|
||||
rules:
|
||||
- expr: |
|
||||
sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])) by (namespace)
|
||||
sum(rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])) by (namespace)
|
||||
record: namespace:container_cpu_usage_seconds_total:sum_rate
|
||||
- expr: |
|
||||
sum by (namespace, pod, container) (
|
||||
rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])
|
||||
) * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
sum by (cluster, namespace, pod, container) (
|
||||
rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
|
||||
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
|
||||
1, max by(cluster, namespace, pod, node) (kube_pod_info)
|
||||
)
|
||||
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
|
||||
- expr: |
|
||||
container_memory_working_set_bytes{job="kubelet", image!=""}
|
||||
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info)
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_working_set_bytes
|
||||
- expr: |
|
||||
container_memory_rss{job="kubelet", image!=""}
|
||||
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info)
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_rss
|
||||
- expr: |
|
||||
container_memory_cache{job="kubelet", image!=""}
|
||||
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info)
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_cache
|
||||
- expr: |
|
||||
container_memory_swap{job="kubelet", image!=""}
|
||||
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)
|
||||
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
|
||||
max by(namespace, pod, node) (kube_pod_info)
|
||||
)
|
||||
record: node_namespace_pod_container:container_memory_swap
|
||||
- expr: |
|
||||
sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace)
|
||||
sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace)
|
||||
record: namespace:container_memory_usage_bytes:sum
|
||||
- expr: |
|
||||
sum by (namespace, label_name) (
|
||||
sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)) by (namespace, pod)
|
||||
* on (namespace, pod)
|
||||
group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
|
||||
sum by (namespace) (
|
||||
sum by (namespace, pod) (
|
||||
max by (namespace, pod, container) (
|
||||
kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
|
||||
) * on(namespace, pod) group_left() max by (namespace, pod) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum
|
||||
- expr: |
|
||||
sum by (namespace, label_name) (
|
||||
sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)) by (namespace, pod)
|
||||
* on (namespace, pod)
|
||||
group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
|
||||
sum by (namespace) (
|
||||
sum by (namespace, pod) (
|
||||
max by (namespace, pod, container) (
|
||||
kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
|
||||
) * on(namespace, pod) group_left() max by (namespace, pod) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
|
||||
- expr: |
|
||||
sum(
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
|
||||
"replicaset", "$1", "owner_name", "(.*)"
|
||||
) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
|
||||
) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
|
||||
1, max by (replicaset, namespace, owner_name) (
|
||||
kube_replicaset_owner{job="kube-state-metrics"}
|
||||
)
|
||||
),
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
) by (namespace, workload, pod)
|
||||
)
|
||||
labels:
|
||||
workload_type: deployment
|
||||
record: mixin_pod_workload
|
||||
- expr: |
|
||||
sum(
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
) by (namespace, workload, pod)
|
||||
)
|
||||
labels:
|
||||
workload_type: daemonset
|
||||
record: mixin_pod_workload
|
||||
- expr: |
|
||||
sum(
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
) by (namespace, workload, pod)
|
||||
)
|
||||
labels:
|
||||
workload_type: statefulset
|
||||
record: mixin_pod_workload
|
||||
|
@ -207,13 +348,17 @@ spec:
|
|||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- name: node.rules
|
||||
rules:
|
||||
- expr: sum(min(kube_pod_info) by (node))
|
||||
- expr: |
|
||||
sum(min(kube_pod_info) by (cluster, node))
|
||||
record: ':kube_pod_info_node_count:'
|
||||
- expr: |
|
||||
max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
|
||||
topk by(namespace, pod) (1,
|
||||
max by (node, namespace, pod) (
|
||||
label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
|
||||
))
|
||||
record: 'node_namespace_pod:kube_pod_info:'
|
||||
- expr: |
|
||||
count by (node) (sum by (node, cpu) (
|
||||
count by (cluster, node) (sum by (node, cpu) (
|
||||
node_cpu_seconds_total{job="node-exporter"}
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
|
@ -228,8 +373,25 @@ spec:
|
|||
node_memory_MemFree_bytes{job="node-exporter"} +
|
||||
node_memory_Slab_bytes{job="node-exporter"}
|
||||
)
|
||||
)
|
||||
) by (cluster)
|
||||
record: :node_memory_MemAvailable_bytes:sum
|
||||
- name: kubelet.rules
|
||||
rules:
|
||||
- expr: |
|
||||
histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: "0.99"
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: "0.9"
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: |
|
||||
histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: "0.5"
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- name: kube-prometheus-node-recording.rules
|
||||
rules:
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
|
||||
|
@ -251,6 +413,42 @@ spec:
|
|||
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
|
||||
BY (instance, cpu))
|
||||
record: cluster:node_cpu:ratio
|
||||
- name: kube-prometheus-general.rules
|
||||
rules:
|
||||
- expr: count without(instance, pod, node) (up == 1)
|
||||
record: count:up1
|
||||
- expr: count without(instance, pod, node) (up == 0)
|
||||
record: count:up0
|
||||
- name: kube-state-metrics
|
||||
rules:
|
||||
- alert: KubeStateMetricsListErrors
|
||||
annotations:
|
||||
message: kube-state-metrics is experiencing errors at an elevated rate in
|
||||
list operations. This is likely causing it to not be able to expose metrics
|
||||
about Kubernetes objects correctly or at all.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors
|
||||
expr: |
|
||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
message: kube-state-metrics is experiencing errors at an elevated rate in
|
||||
watch operations. This is likely causing it to not be able to expose metrics
|
||||
about Kubernetes objects correctly or at all.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors
|
||||
expr: |
|
||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
|
||||
/
|
||||
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: node-exporter
|
||||
rules:
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
|
@ -280,7 +478,7 @@ spec:
|
|||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
|
@ -425,7 +623,7 @@ spec:
|
|||
state for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
||||
expr: |
|
||||
sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
|
||||
sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -448,9 +646,15 @@ spec:
|
|||
matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
||||
expr: |
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics"}
|
||||
(
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics"}
|
||||
) and (
|
||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -460,9 +664,15 @@ spec:
|
|||
not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
|
||||
expr: |
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics"}
|
||||
(
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics"}
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -606,7 +816,7 @@ spec:
|
|||
tolerate node failure.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
||||
expr: |
|
||||
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
|
||||
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
|
||||
/
|
||||
sum(kube_node_status_allocatable_cpu_cores)
|
||||
>
|
||||
|
@ -620,7 +830,7 @@ spec:
|
|||
tolerate node failure.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
|
||||
expr: |
|
||||
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)
|
||||
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
|
||||
/
|
||||
sum(kube_node_status_allocatable_memory_bytes)
|
||||
>
|
||||
|
@ -690,9 +900,9 @@ spec:
|
|||
}} free.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
|
||||
expr: |
|
||||
kubelet_volume_stats_available_bytes{job="kubelet"}
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet"}
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
|
||||
< 0.03
|
||||
for: 1m
|
||||
labels:
|
||||
|
@ -705,12 +915,12 @@ spec:
|
|||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
|
||||
expr: |
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet"}
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet"}
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
|
||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -749,16 +959,72 @@ spec:
|
|||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: kube-apiserver-error-alerts
|
||||
rules:
|
||||
- alert: ErrorBudgetBurn
|
||||
annotations:
|
||||
message: 'High requests error budget burn for job=apiserver (current value:
|
||||
{{ $value }})'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
|
||||
expr: |
|
||||
(
|
||||
status_class_5xx:apiserver_request_total:ratio_rate1h{job="apiserver"} > (14.4*0.010000)
|
||||
and
|
||||
status_class_5xx:apiserver_request_total:ratio_rate5m{job="apiserver"} > (14.4*0.010000)
|
||||
)
|
||||
or
|
||||
(
|
||||
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (6*0.010000)
|
||||
and
|
||||
status_class_5xx:apiserver_request_total:ratio_rate30m{job="apiserver"} > (6*0.010000)
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
severity: critical
|
||||
- alert: ErrorBudgetBurn
|
||||
annotations:
|
||||
message: 'High requests error budget burn for job=apiserver (current value:
|
||||
{{ $value }})'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
|
||||
expr: |
|
||||
(
|
||||
status_class_5xx:apiserver_request_total:ratio_rate1d{job="apiserver"} > (3*0.010000)
|
||||
and
|
||||
status_class_5xx:apiserver_request_total:ratio_rate2h{job="apiserver"} > (3*0.010000)
|
||||
)
|
||||
or
|
||||
(
|
||||
status_class_5xx:apiserver_request_total:ratio_rate3d{job="apiserver"} > (0.010000)
|
||||
and
|
||||
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (0.010000)
|
||||
)
|
||||
labels:
|
||||
job: apiserver
|
||||
severity: warning
|
||||
- name: kubernetes-system-apiserver
|
||||
rules:
|
||||
- alert: KubeAPILatencyHigh
|
||||
annotations:
|
||||
message: The API server has a 99th percentile latency of {{ $value }} seconds
|
||||
for {{ $labels.verb }} {{ $labels.resource }}.
|
||||
message: The API server has an abnormal latency of {{ $value }} seconds for
|
||||
{{ $labels.verb }} {{ $labels.resource }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
||||
expr: |
|
||||
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 1
|
||||
for: 10m
|
||||
(
|
||||
cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
|
||||
>
|
||||
on (verb) group_left()
|
||||
(
|
||||
avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
|
||||
+
|
||||
2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
|
||||
)
|
||||
) > on (verb) group_left()
|
||||
1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
|
||||
and on (verb,resource)
|
||||
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
|
||||
>
|
||||
1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAPILatencyHigh
|
||||
|
@ -767,34 +1033,10 @@ spec:
|
|||
for {{ $labels.verb }} {{ $labels.resource }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
|
||||
expr: |
|
||||
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 4
|
||||
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} > 4
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeAPIErrorsHigh
|
||||
annotations:
|
||||
message: API server is returning errors for {{ $value | humanizePercentage
|
||||
}} of requests.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||
expr: |
|
||||
sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m]))
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.03
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeAPIErrorsHigh
|
||||
annotations:
|
||||
message: API server is returning errors for {{ $value | humanizePercentage
|
||||
}} of requests.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||
expr: |
|
||||
sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m]))
|
||||
/
|
||||
sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAPIErrorsHigh
|
||||
annotations:
|
||||
message: API server is returning errors for {{ $value | humanizePercentage
|
||||
|
@ -827,7 +1069,7 @@ spec:
|
|||
in less than 7.0 days.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
||||
expr: |
|
||||
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
|
||||
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeClientCertificateExpiration
|
||||
|
@ -836,9 +1078,30 @@ spec:
|
|||
in less than 24.0 hours.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
||||
expr: |
|
||||
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
||||
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: AggregatedAPIErrors
|
||||
annotations:
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
|
||||
reported errors. The number of errors have increased for it in the past
|
||||
five minutes. High values indicate that the availability of the service
|
||||
changes too often.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
|
||||
expr: |
|
||||
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: AggregatedAPIDown
|
||||
annotations:
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down.
|
||||
It has not been available at least for the past five minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
|
||||
expr: |
|
||||
sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAPIDown
|
||||
annotations:
|
||||
message: KubeAPI has disappeared from Prometheus target discovery.
|
||||
|
@ -865,6 +1128,7 @@ spec:
|
|||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
|
||||
expr: |
|
||||
kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} == 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletTooManyPods
|
||||
|
@ -873,7 +1137,37 @@ spec:
|
|||
}} of its Pod capacity.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
|
||||
expr: |
|
||||
max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
|
||||
max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeNodeReadinessFlapping
|
||||
annotations:
|
||||
message: The readiness status of node {{ $labels.node }} has changed {{ $value
|
||||
}} times in the last 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
|
||||
expr: |
|
||||
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletPlegDurationHigh
|
||||
annotations:
|
||||
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
|
||||
of {{ $value }} seconds on node {{ $labels.node }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
|
||||
expr: |
|
||||
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletPodStartUpLatencyHigh
|
||||
annotations:
|
||||
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
|
||||
on node {{ $labels.node }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
|
||||
expr: |
|
||||
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 60
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -882,7 +1176,7 @@ spec:
|
|||
message: Kubelet has disappeared from Prometheus target discovery.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
|
||||
expr: |
|
||||
absent(up{job="kubelet"} == 1)
|
||||
absent(up{job="kubelet", metrics_path="/metrics"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -1038,7 +1332,8 @@ spec:
|
|||
- alert: PrometheusRemoteStorageFailures
|
||||
annotations:
|
||||
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send
|
||||
{{ printf "%.1f" $value }}% of the samples to queue {{$labels.queue}}.
|
||||
{{ printf "%.1f" $value }}% of the samples to {{ if $labels.queue }}{{ $labels.queue
|
||||
}}{{ else }}{{ $labels.url }}{{ end }}.
|
||||
summary: Prometheus fails to send samples to remote storage.
|
||||
expr: |
|
||||
(
|
||||
|
@ -1058,7 +1353,8 @@ spec:
|
|||
- alert: PrometheusRemoteWriteBehind
|
||||
annotations:
|
||||
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
|
||||
is {{ printf "%.1f" $value }}s behind for queue {{$labels.queue}}.
|
||||
is {{ printf "%.1f" $value }}s behind for {{ if $labels.queue }}{{ $labels.queue
|
||||
}}{{ else }}{{ $labels.url }}{{ end }}.
|
||||
summary: Prometheus remote write is behind.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
|
@ -1145,8 +1441,8 @@ spec:
|
|||
rules:
|
||||
- alert: TargetDown
|
||||
annotations:
|
||||
message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }} targets in
|
||||
{{ $labels.namespace }} namespace are down.'
|
||||
message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
|
||||
}} targets in {{ $labels.namespace }} namespace are down.'
|
||||
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job,
|
||||
namespace, service)) > 10
|
||||
for: 10m
|
||||
|
|
|
@ -10,6 +10,38 @@ spec:
|
|||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
interval: 30s
|
||||
metricRelabelings:
|
||||
- action: drop
|
||||
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: transformation_(transformation_latencies_microseconds|failures_total)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: etcd_(debugging|disk|request|server).*
|
||||
sourceLabels:
|
||||
|
@ -22,6 +54,11 @@ spec:
|
|||
regex: apiserver_admission_step_admission_latencies_seconds_.*
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- le
|
||||
port: https
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
|
|
|
@ -9,6 +9,38 @@ spec:
|
|||
endpoints:
|
||||
- interval: 30s
|
||||
metricRelabelings:
|
||||
- action: drop
|
||||
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: transformation_(transformation_latencies_microseconds|failures_total)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: etcd_(debugging|disk|request|server).*
|
||||
sourceLabels:
|
||||
|
|
|
@ -8,13 +8,49 @@ metadata:
|
|||
spec:
|
||||
endpoints:
|
||||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
honorLabels: true
|
||||
interval: 30s
|
||||
port: http-metrics
|
||||
metricRelabelings:
|
||||
- action: drop
|
||||
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: transformation_(transformation_latencies_microseconds|failures_total)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
|
||||
sourceLabels:
|
||||
- __name__
|
||||
port: https-metrics
|
||||
relabelings:
|
||||
- sourceLabels:
|
||||
- __metrics_path__
|
||||
targetLabel: metrics_path
|
||||
scheme: http
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
insecureSkipVerify: true
|
||||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
honorLabels: true
|
||||
interval: 30s
|
||||
|
@ -24,12 +60,14 @@ spec:
|
|||
sourceLabels:
|
||||
- __name__
|
||||
path: /metrics/cadvisor
|
||||
port: http-metrics
|
||||
port: https-metrics
|
||||
relabelings:
|
||||
- sourceLabels:
|
||||
- __metrics_path__
|
||||
targetLabel: metrics_path
|
||||
scheme: http
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
insecureSkipVerify: true
|
||||
jobLabel: k8s-app
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
|
|
|
@ -1,38 +0,0 @@
|
|||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
labels:
|
||||
run: smtp-server
|
||||
name: smtp-server
|
||||
namespace: monitoring
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
run: smtp-server
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
run: smtp-server
|
||||
spec:
|
||||
containers:
|
||||
- env:
|
||||
- name: GMAIL_USER
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: username
|
||||
name: smtp-account
|
||||
- name: GMAIL_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: password
|
||||
name: smtp-account
|
||||
- name: DISABLE_IPV6
|
||||
value: "True"
|
||||
- name: RELAY_DOMAINS
|
||||
value: :192.168.0.0/24:10.0.0.0/16
|
||||
image: carlosedp/docker-smtp:v1.0.1
|
||||
name: smtp-server
|
||||
ports:
|
||||
- containerPort: 25
|
||||
name: smtp
|
|
@ -1,14 +0,0 @@
|
|||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
labels:
|
||||
run: smtp-server
|
||||
name: smtp-server
|
||||
namespace: monitoring
|
||||
spec:
|
||||
ports:
|
||||
- name: smtp
|
||||
port: 25
|
||||
targetPort: smtp
|
||||
selector:
|
||||
run: smtp-server
|
Loading…
Reference in New Issue