Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
gitpod-io
GitHub Repository: gitpod-io/gitpod
Path: blob/main/operations/observability/mixins/workspace/rules/central/nodes.yaml
2506 views
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
# Licensed under the GNU Affero General Public License (AGPL).
# See License.AGPL.txt in the project root for license information.

# PrometheusRule resource defining one recording rule (per-node load average
# normalized by CPU count) and the alerts built on top of it, plus two alerts
# about cluster-autoscaler behaviour.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: workspace-nodes-monitoring-rules
spec:
  groups:
    - name: workspace-nodes-rules
      rules:
        # Load average (node_load1) divided by the number of CPUs of the node.
        # Both sides are joined with kube_node_labels on `node` so that the
        # result carries the `nodepool` label used by the alerts below.
        # The inner `count without (mode)` collapses the per-mode CPU series,
        # the outer `count without (cpu)` then counts CPUs per node.
        # NOTE(review): presumably kube_node_labels has the value 1, so the
        # multiplication only attaches labels - confirm.
        - record: nodepool:node_load1:normalized
          expr: |
            node_load1 * on(node) group_left(nodepool) kube_node_labels
            /
            count without (cpu) (
              count without (mode) (
                node_cpu_seconds_total * on(node) group_left(nodepool) kube_node_labels
              )
            )

    - name: workspace-nodes-alerts
      rules:
        # Fires when a node in a nodepool whose name contains "workspace"
        # (non-ephemeral clusters only) keeps a normalized load above 10 for
        # 60 minutes.
        - alert: GitpodWorkspaceNodeHighNormalizedLoadAverage
          labels:
            severity: warning
            team: engine
          for: 60m
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/NodePoolLoad.md
            summary: >-
              Workspace node's normalized load average is higher than 10 for
              more than 60 minutes. Check for abuse.
            # The value is a ratio (load / CPU count), not a percentage, so it
            # is printed without a "%" suffix.
            description: >-
              Node {{ $labels.node }} in {{ $labels.cluster }} is reporting
              {{ printf "%.2f" $value }} normalized load average. Normalized
              load average is current load average divided by number of CPU
              cores of the node.
          expr: nodepool:node_load1:normalized{nodepool=~".*workspace.*", cluster!~"ephemeral.*"} > 10

        # Same condition as the workspace alert above, applied to nodepools
        # whose name contains "headless".
        - alert: GitpodHeadlessNodeHighNormalizedLoadAverage
          labels:
            severity: warning
            team: engine
          for: 60m
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/NodePoolLoad.md
            summary: >-
              Headless node's normalized load average is higher than 10 for
              more than 60 minutes. Check for abuse.
            # The value is a ratio (load / CPU count), not a percentage, so it
            # is printed without a "%" suffix.
            description: >-
              Node {{ $labels.node }} in {{ $labels.cluster }} is reporting
              {{ printf "%.2f" $value }} normalized load average. Normalized
              load average is current load average divided by number of CPU
              cores of the node.
          expr: nodepool:node_load1:normalized{nodepool=~".*headless.*", cluster!~"ephemeral.*"} > 10

        # Compares the per-cluster sum of kube_node_labels for workspace
        # nodepools with the same sum 10 minutes earlier and fires when the
        # difference exceeds 15. There is no `for:` clause, so the alert fires
        # as soon as the condition holds.
        # NOTE(review): unlike the other alerts in this group this one carries
        # no `team` label - confirm that routing is intentional.
        # NOTE(review): the selector is "workspace-.*" here but ".*workspace.*"
        # in the load alerts; PromQL regexes are fully anchored, so this one
        # only matches pools whose name starts with "workspace-" - confirm.
        - alert: AutoscalerAddsNodesTooFast
          labels:
            severity: critical
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AutoscalerAddsNodesTooFast.md
            summary: Autoscaler is adding new nodes rapidly
            description: Autoscaler in cluster {{ $labels.cluster }} is rapidly adding new nodes.
          expr: |
            (
              sum(kube_node_labels{nodepool=~"workspace-.*", cluster!~"ephemeral.*"}) by (cluster)
              -
              sum(kube_node_labels{nodepool=~"workspace-.*", cluster!~"ephemeral.*"} offset 10m) by (cluster)
            ) > 15

        # Fires whenever the failed-scale-up counter increased within the last
        # minute in a non-ephemeral cluster.
        # NOTE(review): a 1m range needs at least two samples inside the
        # window to yield a value - verify the scrape interval is short enough.
        - alert: AutoscaleFailure
          labels:
            severity: warning
            team: engine
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AutoscaleFailure.md
            summary: Automatic scale-up failed for some reason.
            description: Automatic scale-up in cluster {{ $labels.cluster }} failed due to {{ $labels.reason }}.
          expr: |
            increase(cluster_autoscaler_failed_scale_ups_total{cluster!~"ephemeral.*"}[1m]) != 0

        # Counts, per nodepool and cluster, the workspace nodes whose
        # normalized load is at least 1 and fires when 4 or more such nodes
        # exist for 60 minutes.
        - alert: NodePoolLoad
          labels:
            severity: critical
            team: engine
          for: 60m
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/NodePoolLoad.md
            summary: Node pool load has been high for too long for 4 or more nodes
            description: Node pool {{ $labels.nodepool }} in cluster {{ $labels.cluster }} has high, sustained load
          expr: |
            sum by(nodepool, cluster) (
              count by(node, nodepool, cluster) (
                sum by(node, nodepool, cluster) (
                  nodepool:node_load1:normalized{nodepool=~".*workspace.*", cluster!~"ephemeral.*"}
                ) >= 1
              )
            ) >= 4
80
81