Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
gitpod-io
GitHub Repository: gitpod-io/gitpod
Path: blob/main/operations/observability/mixins/self-hosted/rules/observability-stack/prometheus-operator.yaml
3608 views
1
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
# Licensed under the GNU Affero General Public License (AGPL).
# See License.AGPL.txt in the project root for license information.

# PrometheusRule with warning-level alerts about the health of the
# Prometheus Operator and the config-reloader sidecar.
# The three operator alerts only consider series with
# job="prometheus-operator" in the "monitoring-satellite" namespace, which is
# also the namespace this object is created in.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # NOTE(review): "prometheus" and "role" are presumably the labels a
  # Prometheus ruleSelector matches on - confirm against the Prometheus CR.
  labels:
    app.kubernetes.io/name: kubernetes
    app.kubernetes.io/part-of: kube-prometheus
    prometheus: k8s
    role: alert-rules
  name: prometheus-operator-monitoring-rules
  namespace: monitoring-satellite
spec:
  groups:
    - name: prometheus-operator
      rules:
        # Fires when, per (controller, namespace), the rate of failed list
        # operations divided by the rate of all list operations (both over
        # 10m) stays above 0.4 for 15 minutes.
        - alert: PrometheusOperatorListErrors
          annotations:
            description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
            summary: Errors while performing list operations in controller.
          expr: |
            (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring-satellite"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring-satellite"}[10m]))) > 0.4
          for: 15m
          labels:
            severity: warning
            team: delivery-operations-experience

        # Same failed/total ratio as above, but for watch operations and
        # computed over a 5m rate window; threshold 0.4, held for 15 minutes.
        - alert: PrometheusOperatorWatchErrors
          annotations:
            description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.
            summary: Errors while performing watch operations in controller.
          expr: |
            (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring-satellite"}[5m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring-satellite"}[5m]))) > 0.4
          for: 15m
          labels:
            severity: warning
            team: delivery-operations-experience

        # Fires when the rate of reconcile errors divided by the rate of
        # reconcile operations (both over 5m, per controller and namespace)
        # stays above 0.1 for 10 minutes. The description renders the ratio
        # ($value) through humanizePercentage.
        - alert: PrometheusOperatorReconcileErrors
          annotations:
            description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
            summary: Errors while reconciling controller.
          expr: |
            (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring-satellite"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring-satellite"}[5m]))) > 0.1
          for: 10m
          labels:
            severity: warning
            team: delivery-operations-experience

        # Fires when the highest sample of reloader_last_reload_successful
        # within the trailing 5m is 0, and that holds for 10 minutes.
        # Unlike the alerts above, the selector accepts any non-empty
        # namespace (namespace=~".+") and has no job matcher.
        # NOTE(review): presumably the metric is 1 after a successful reload
        # and 0 after a failed one - confirm against the config-reloader.
        - alert: ConfigReloaderSidecarErrors
          annotations:
            description: |-
              Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.
              As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.
            summary: config-reloader sidecar has not had a successful reload for 10m
          expr: |
            max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
          for: 10m
          labels:
            severity: warning
            team: delivery-operations-experience