CoCalc -- nodes.yaml

GitHub Repository: gitpod-io/gitpod
Path: blob/main/operations/observability/mixins/workspace/rules/central/nodes.yaml
2506 views
1
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
2
# Licensed under the GNU Affero General Public License (AGPL).
3
# See License.AGPL.txt in the project root for license information.
4

5
# Prometheus Operator custom resource carrying the recording and alerting
# rules for workspace nodes.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
    name: workspace-nodes-monitoring-rules
    # NOTE(review): these labels are presumably what the Prometheus
    # ruleSelector matches on - confirm against the Prometheus resource.
    labels:
        prometheus: k8s
        role: alert-rules
12
spec:
13
    groups:
14
        # Recording rules: precomputed series queried by the alerts in the
        # workspace-nodes-alerts group.
        - name: workspace-nodes-rules
          rules:
              # Normalized load = 1-minute load average divided by the number
              # of CPUs on the node.
              #
              # Numerator: node_load1 joined to kube_node_labels on `node`,
              # copying the `nodepool` label onto the result (group_left).
              # `*` and `/` have equal precedence and are left-associative,
              # so the join is evaluated before the division.
              #
              # Denominator: the inner count collapses `mode`, leaving one
              # series per CPU; the outer count collapses `cpu`, leaving the
              # CPU count per node.
              # NOTE(review): assumes kube_node_labels has value 1 and that
              # node_cpu_seconds_total carries `cpu` and `mode` labels -
              # confirm against the exporters in use.
              - record: nodepool:node_load1:normalized
                expr: |
                    node_load1 * on(node) group_left(nodepool) kube_node_labels
                    /
                    count without (cpu) (
                      count without (mode) (
                        node_cpu_seconds_total * on(node) group_left(nodepool) kube_node_labels
                      )
                    )
25
        - name: workspace-nodes-alerts
26
          rules:
27
              # Warns when a node in a node pool whose name contains
              # "workspace" keeps a normalized load (load1 / CPU count) above
              # 10 for a full hour. Clusters whose name starts with
              # "ephemeral" are excluded.
              - alert: GitpodWorkspaceNodeHighNormalizedLoadAverage
                expr: nodepool:node_load1:normalized{nodepool=~".*workspace.*", cluster!~"ephemeral.*"} > 10
                for: 60m
                labels:
                    severity: warning
                    team: engine
                annotations:
                    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/NodePoolLoad.md
                    summary: >-
                        Workspace node's normalized load average is higher than 10 for more
                        than 60 minutes. Check for abuse.
                    # The value is a ratio (load per CPU core), not a
                    # percentage, so it is printed without a "%" suffix.
                    description: >-
                        Node {{ $labels.node }} in {{ $labels.cluster }} is reporting
                        {{ printf "%.2f" $value }} normalized load average. Normalized
                        load average is current load average divided by number of CPU
                        cores of the node.
37

38
              # Warns when a node in a node pool whose name contains
              # "headless" keeps a normalized load (load1 / CPU count) above
              # 10 for a full hour. Clusters whose name starts with
              # "ephemeral" are excluded.
              - alert: GitpodHeadlessNodeHighNormalizedLoadAverage
                expr: nodepool:node_load1:normalized{nodepool=~".*headless.*", cluster!~"ephemeral.*"} > 10
                for: 60m
                labels:
                    severity: warning
                    team: engine
                annotations:
                    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/NodePoolLoad.md
                    # Names headless nodes, matching the nodepool selector in
                    # the expression.
                    summary: >-
                        Headless node's normalized load average is higher than 10 for more
                        than 60 minutes. Check for abuse.
                    # The value is a ratio (load per CPU core), not a
                    # percentage, so it is printed without a "%" suffix.
                    description: >-
                        Node {{ $labels.node }} in {{ $labels.cluster }} is reporting
                        {{ printf "%.2f" $value }} normalized load average. Normalized
                        load average is current load average divided by number of CPU
                        cores of the node.
48

49
              # Fires when, per cluster, the sum of kube_node_labels series
              # for "workspace-*" node pools is more than 15 higher than it
              # was 10 minutes earlier. No `for:` is set, so it fires on the
              # first evaluation where the condition holds.
              # NOTE(review): unlike the other alerts in this group, no
              # `team` label is set - confirm that routing is intentional.
              - alert: AutoscalerAddsNodesTooFast
                # Folded scalar: the lines below join with single spaces
                # into one expression string.
                expr: >-
                    ((sum(kube_node_labels{nodepool=~"workspace-.*", cluster!~"ephemeral.*"}) by (cluster)) -
                    (sum(kube_node_labels{nodepool=~"workspace-.*", cluster!~"ephemeral.*"} offset 10m) by (cluster)))
                    > 15
                labels:
                    severity: critical
                annotations:
                    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AutoscalerAddsNodesTooFast.md
                    summary: Autoscaler is adding new nodes rapidly
                    description: Autoscaler in cluster {{ $labels.cluster }} is rapidly adding new nodes.
57

58
              # Fires as soon as the failed scale-up counter increases within
              # the last minute on any non-ephemeral cluster (no `for:`
              # delay). The description reads the `reason` label from the
              # counter series.
              # NOTE(review): increase() needs at least two samples inside
              # the 1m window - confirm the scrape interval is short enough.
              - alert: AutoscaleFailure
                expr: |
                    increase(cluster_autoscaler_failed_scale_ups_total{cluster!~"ephemeral.*"}[1m]) != 0
                labels:
                    severity: warning
                    team: engine
                annotations:
                    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AutoscaleFailure.md
                    summary: Automatic scale-up failed for some reason.
                    description: >-
                        Automatic scale-up in cluster {{ $labels.cluster }} failed due to
                        {{ $labels.reason }}.
68

69
              # Fires when at least 4 nodes of a workspace node pool have had
              # a normalized load of 1 or more for 60 minutes.
              #
              # Reading the expression inside-out:
              #   1. sum by(node, nodepool, cluster): one series per node.
              #   2. >= 1: keep only nodes at or above one unit of load per
              #      CPU core.
              #   3. count by(node, nodepool, cluster): turn each surviving
              #      node into the value 1.
              #   4. sum by(nodepool, cluster): number of such nodes per pool.
              #   5. >= 4: alert threshold on that node count.
              - alert: NodePoolLoad
                expr: |
                    sum by(nodepool, cluster) (count by(node, nodepool, cluster) (sum by(node, nodepool, cluster) (nodepool:node_load1:normalized{nodepool=~".*workspace.*", cluster!~"ephemeral.*"}) >= 1)) >= 4
                for: 60m
                labels:
                    severity: critical
                    team: engine
                annotations:
                    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/NodePoolLoad.md
                    summary: Node pool load has been high for too long for 4 or more nodes
                    description: >-
                        Node pool {{ $labels.nodepool }} in cluster {{ $labels.cluster }}
                        has high, sustained load
80

81
Product

Resources

Company