Path: blob/main/operations/observability/mixins/self-hosted/rules/observability-stack/prometheus.yaml
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
# Licensed under the GNU Affero General Public License (AGPL).
# See License.AGPL.txt in the project root for license information.

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/name: kubernetes
    app.kubernetes.io/part-of: kube-prometheus
    prometheus: k8s
    role: alert-rules
  name: prometheus-monitoring-rules
  namespace: monitoring-satellite
spec:
  groups:
  - name: prometheus
    rules:
    - alert: PrometheusBadConfig
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
        summary: Failed Prometheus configuration reload.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) == 0
      for: 10m
      labels:
        severity: critical
        team: delivery-operations-experience
    - alert: PrometheusRemoteStorageFailures
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
        summary: Prometheus fails to send samples to remote storage.
      expr: |
        (
          (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]))
        /
          (
            (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]))
          +
            (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]))
          )
        )
        * 100
        > 1
      for: 15m
      labels:
        severity: critical
        team: delivery-operations-experience
    - alert: PrometheusRuleFailures
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
        summary: Prometheus is failing rule evaluations.
      expr: |
        increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring-satellite"}[5m]) > 0
      for: 15m
      labels:
        severity: warning
        team: delivery-operations-experience
    - alert: PrometheusCrashlooped
      annotations:
        description: Prometheus' container restarted in the last 5m. While this alert will resolve itself if Prometheus stops crashing, it is important to understand why it crashed in the first place.
        summary: Prometheus has just crashlooped.
      expr: |
        increase(kube_pod_container_status_restarts_total{cluster=~"$cluster", pod="prometheus-k8s-0", container="prometheus"}[5m]) > 0
      for: 15m
      labels:
        severity: info
        team: delivery-operations-experience