Path: blob/main/operations/observability/mixins/platform/rules/kubernetes/kubernetes.yaml
2501 views
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
# Licensed under the GNU Affero General Public License (AGPL).
# See License.AGPL.txt in the project root for license information.

---
# PrometheusRule defining the "kubernetes" alert group. Every alert carries
# `team: platform`; severity is either `warning` or `critical` per rule.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/name: kubernetes
    app.kubernetes.io/part-of: kube-prometheus
    prometheus: k8s
    role: alert-rules
  name: kubernetes-monitoring-rules
  namespace: monitoring-satellite
spec:
  groups:
    - name: kubernetes
      rules:
        # Desired minus currently scheduled DaemonSet pods stays above zero
        # for 10 minutes.
        - alert: KubeDaemonSetNotScheduled
          annotations:
            description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
            summary: DaemonSet pods are not scheduled.
          expr: |
            kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
              -
            kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
          for: 10m
          labels:
            severity: warning
            team: platform

        # A still-active Job started more than 43200 seconds (12 hours) ago.
        # No `for` clause: the age threshold is part of the expression itself.
        - alert: KubeJobNotCompleted
          annotations:
            description: >-
              Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than
              {{ "43200" | humanizeDuration }} to complete.
            summary: Job did not complete in time
          expr: |
            time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"}
              and
            kube_job_status_active{job="kube-state-metrics"} > 0) > 43200
          labels:
            severity: warning
            team: platform

        # kube_job_failed stays above zero for 15 minutes.
        - alert: KubeJobFailed
          annotations:
            description: >-
              Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
              Removing failed job after investigation should clear this alert.
            summary: Job failed to complete.
          expr: |
            kube_job_failed{job="kube-state-metrics"} > 0
          for: 15m
          labels:
            severity: warning
            team: platform

        # Available/capacity ratio below 0.03 (3 %) on a volume with used
        # bytes > 0. PVCs are skipped when their access mode is ReadOnlyMany or
        # when kube_persistentvolumeclaim_labels reports
        # label_excluded_from_alerts="true".
        - alert: KubePersistentVolumeFillingUp
          annotations:
            description: >-
              The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }}
              in Namespace {{ $labels.namespace }} is only
              {{ $value | humanizePercentage }} free.
            summary: PersistentVolume is filling up.
          expr: |
            (
              kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
                /
              kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
            ) < 0.03
            and
            kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0
            unless on(namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
            unless on(namespace, persistentvolumeclaim)
            kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
          for: 1m
          labels:
            severity: critical
            team: platform

        # A PersistentVolume reports phase Failed or Pending for 5 minutes.
        - alert: KubePersistentVolumeErrors
          annotations:
            description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
            summary: PersistentVolume is having issues with provisioning.
          expr: |
            kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
          for: 5m
          labels:
            severity: critical
            team: platform

        # More than one distinct truncated git_version per cluster, ignoring
        # the kube-dns and coredns jobs. label_replace keeps only the leading
        # "v<digits>.<digits>" part of git_version before counting.
        # NOTE(review): the "." between the two digit groups in the regex is
        # unescaped, so it matches any character, not only a literal dot.
        - alert: KubeVersionMismatch
          annotations:
            description: There are {{ $value }} different semantic versions of Kubernetes components running.
            summary: Different semantic versions of Kubernetes components running.
          expr: |
            count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
          for: 15m
          labels:
            severity: warning
            team: platform

        # The Ready condition's status="true" series equals 0 for 15 minutes.
        - alert: KubeNodeNotReady
          annotations:
            description: '{{ $labels.node }} has been unready for more than 15 minutes.'
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/KubeNodeNotReady.md
            summary: Node is not ready.
          expr: |
            kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
          for: 15m
          labels:
            severity: critical
            team: platform

        # No kubelet target on metrics_path="/metrics" has up == 1 for
        # 15 minutes.
        - alert: KubeletDown
          annotations:
            description: Kubelet has disappeared from Prometheus target discovery.
            summary: Target disappeared from Prometheus target discovery.
          expr: |
            absent(up{job="kubelet", metrics_path="/metrics"} == 1)
          for: 15m
          labels:
            severity: critical
            team: platform