Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
gitpod-io
GitHub Repository: gitpod-io/gitpod
Path: blob/main/operations/observability/mixins/platform/rules/kubernetes/kubernetes.yaml
2501 views
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
# Licensed under the GNU Affero General Public License (AGPL).
# See License.AGPL.txt in the project root for license information.

# PrometheusRule holding the "kubernetes" alert group: DaemonSet scheduling,
# Job completion/failure, PersistentVolume health, component version skew,
# node readiness and kubelet target availability.
# Every alert carries `team: platform` and a `severity` label.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/name: kubernetes
    app.kubernetes.io/part-of: kube-prometheus
    prometheus: k8s
    role: alert-rules
  name: kubernetes-monitoring-rules
  namespace: monitoring-satellite
spec:
  groups:
    - name: kubernetes
      rules:
        # Desired minus currently-scheduled DaemonSet pods stays above zero
        # for 10 minutes. Arithmetic binds tighter than the comparison, so
        # the `> 0` applies to the difference.
        - alert: KubeDaemonSetNotScheduled
          annotations:
            description: >-
              {{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }}
              are not scheduled.
            summary: DaemonSet pods are not scheduled.
          expr: |
            kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
              -
            kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
          for: 10m
          labels:
            severity: warning
            team: platform

        # A Job that still has active pods started more than 43200 seconds
        # (12 hours) ago. No `for:` clause is set; the elapsed-time threshold
        # in the expression is the only delay. The "43200" in the description
        # must be kept in sync with the threshold in `expr`.
        - alert: KubeJobNotCompleted
          annotations:
            description: >-
              Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
              {{ "43200" | humanizeDuration }} to complete.
            summary: Job did not complete in time
          expr: |
            time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"}
              and
            kube_job_status_active{job="kube-state-metrics"} > 0) > 43200
          labels:
            severity: warning
            team: platform

        # kube_job_failed is above zero for 15 minutes.
        - alert: KubeJobFailed
          annotations:
            description: >-
              Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
              Removing failed job after investigation should clear this alert.
            summary: Job failed to complete.
          expr: |
            kube_job_failed{job="kube-state-metrics"} > 0
          for: 15m
          labels:
            severity: warning
            team: platform

        # Available/capacity ratio of a volume is below 0.03 (3%) while the
        # volume reports used bytes > 0. Claims whose access mode is
        # ReadOnlyMany, or that carry the label excluded_from_alerts="true",
        # are dropped via `unless`.
        - alert: KubePersistentVolumeFillingUp
          annotations:
            description: >-
              The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace
              {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.
            summary: PersistentVolume is filling up.
          expr: |
            (
              kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
                /
              kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
            ) < 0.03
            and
            kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0
            unless on(namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1
            unless on(namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
          for: 1m
          labels:
            severity: critical
            team: platform

        # A PersistentVolume reports phase Failed or Pending for 5 minutes.
        - alert: KubePersistentVolumeErrors
          annotations:
            description: >-
              The persistent volume {{ $labels.persistentvolume }} has status
              {{ $labels.phase }}.
            summary: PersistentVolume is having issues with provisioning.
          expr: |
            kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
          for: 5m
          labels:
            severity: critical
            team: platform

        # More than one distinct "vMAJOR.MINOR" git_version per cluster for
        # 15 minutes (kube-dns/coredns jobs excluded). label_replace truncates
        # git_version to the first capture group; the dot between major and
        # minor is escaped (`\\.` inside the PromQL string yields regex `\.`)
        # so it matches a literal dot rather than any character.
        - alert: KubeVersionMismatch
          annotations:
            description: >-
              There are {{ $value }} different semantic versions of Kubernetes
              components running.
            summary: Different semantic versions of Kubernetes components running.
          expr: |
            count by (cluster) (
              count by (git_version, cluster) (
                label_replace(
                  kubernetes_build_info{job!~"kube-dns|coredns"},
                  "git_version", "$1", "git_version", "(v[0-9]*\\.[0-9]*).*"
                )
              )
            ) > 1
          for: 15m
          labels:
            severity: warning
            team: platform

        # The node's Ready condition has not been "true" for 15 minutes.
        # The "15 minutes" in the description mirrors the `for:` value.
        - alert: KubeNodeNotReady
          annotations:
            description: '{{ $labels.node }} has been unready for more than 15 minutes.'
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/KubeNodeNotReady.md
            summary: Node is not ready.
          expr: |
            kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
          for: 15m
          labels:
            severity: critical
            team: platform

        # No kubelet target with up == 1 exists for 15 minutes; absent()
        # returns a value only when the inner selector yields no series.
        - alert: KubeletDown
          annotations:
            description: Kubelet has disappeared from Prometheus target discovery.
            summary: Target disappeared from Prometheus target discovery.
          expr: |
            absent(up{job="kubelet", metrics_path="/metrics"} == 1)
          for: 15m
          labels:
            severity: critical
            team: platform