Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
gitpod-io
GitHub Repository: gitpod-io/gitpod
Path: blob/main/operations/observability/mixins/meta/rules/server.yaml
2500 views
1
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
2
# Licensed under the GNU Affero General Public License (AGPL).
3
# See License.AGPL.txt in the project root for license information.
4
5
apiVersion: monitoring.coreos.com/v1
6
kind: PrometheusRule
7
metadata:
8
labels:
9
prometheus: k8s
10
role: alert-rules
11
name: server-monitoring-rules
12
spec:
13
groups:
14
- name: server
15
rules:
16
- alert: WebsocketConnectionsNotClosing
  # Reasoning: fires when the per-cluster sum of open websocket connections
  # has been at or above 10000 for 10 minutes. The comparison is ">=" rather
  # than "==" so the alert keeps firing once the total overshoots 10000
  # instead of only matching the exact value.
  expr: sum(server_websocket_connection_count) by (cluster) >= 10000
  for: 10m
  labels:
    severity: critical
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionsNotClosing.md
    summary: Open websocket connections are not closing for the last 10 minutes and accumulating in {{ $labels.cluster }}.
    description: We have accumulated {{ printf "%.2f" $value }} open websocket connections.
25
26
- alert: ServerEventLoopLagTooHigh
  # Reasoning: the 20m average of nodejs_eventloop_lag_seconds for
  # job="server" has to stay above 0.35 for 5 minutes before this fires.
  expr: >-
    avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m])
    > 0.35
  for: 5m
  labels:
    severity: critical
  annotations:
    runbook_url: >-
      https://github.com/gitpod-io/runbooks/blob/main/runbooks/ServerEventLoopLagTooHigh.md
    summary: >-
      Server accumulated too much "event loop lag" on {{ $labels.cluster }}.
      The webapp will become unresponsive if we don't act here.
    description: >-
      Server has accumulated {{ printf "%.2f" $value }}s event loop lag.
35
36
- alert: InstanceStartFailures
  # Reasoning: 10% failure rate over 5m.
  # Ratio = failed starts / (failed starts + successful starts); failures
  # whose reason is imageBuildFailed, imageBuildFailedUser or scmAccessFailed
  # are excluded from both the numerator and the denominator.
  expr: >-
    sum(increase(gitpod_server_instance_starts_failed_total{reason!~"imageBuildFailed|imageBuildFailedUser|scmAccessFailed"}[5m]))
    /
    (sum(increase(gitpod_server_instance_starts_failed_total{reason!~"imageBuildFailed|imageBuildFailedUser|scmAccessFailed"}[5m]))
    + sum(increase(gitpod_server_instance_starts_success_total[5m])))
    > 0.1
  for: 30s
  labels:
    severity: critical
  annotations:
    runbook_url: >-
      https://github.com/gitpod-io/runbooks/blob/main/runbooks/InstanceStartFailures.md
    summary: >-
      Server tries to start an instance, but cannot for whatever reason.
      Investigation required.
    description: >-
      Server cannot start workspace instances on workspace clusters.
46
47
# Rollout alerts
48
- alert: JsonRpcApiErrorRates
  # Reasoning: per cluster, the share of API calls whose statusCode does not
  # match 2.*, 3.*, 4.* or 640 must exceed 4% (5m rates) for 5 minutes.
  expr: >-
    sum(rate(gitpod_server_api_calls_total{statusCode!~"2.*|3.*|4.*|640"}[5m])) by (cluster)
    /
    sum(rate(gitpod_server_api_calls_total[5m])) by (cluster)
    > 0.04
  for: 5m
  labels:
    severity: critical
    team: webapp
    dedicated: included
  annotations:
    runbook_url: >-
      https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodApiErrorRate.md
    summary: >-
      The error rate of the JSON RPC API is high on {{ $labels.cluster }}.
      Investigation required.
    description: >-
      JSON RPC API error rate high
59
60
- alert: GitpodV1APIServerErrors
  # Reasoning: per cluster, more than 1% of handled gRPC calls to services
  # matching gitpod.v1.* ended with code Internal, Unknown or DataLoss
  # (5m rates), sustained for 5 minutes.
  expr: >-
    sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unknown|DataLoss", grpc_service=~"gitpod.v1.*"}[5m])) by (cluster)
    /
    sum(rate(grpc_server_handled_total{grpc_service=~"gitpod.v1.*"}[5m])) by (cluster)
    > 0.01
  for: 5m
  labels:
    severity: critical
    team: webapp
    dedicated: included
  annotations:
    runbook_url: >-
      https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodV1APIServerErrors.md
    summary: >-
      gitpod.v1 API is returning higher number of errors on
      {{ $labels.cluster }}. Investigation required.
    description: >-
      gitpod.v1 API is returning higher number of errors
71
72
73
- alert: WebsocketConnectionRateHigh
  # Reasoning: the threshold values are taken from past data.
  # Per pod, the 2m rate of gitpod_server_api_connections_total has to stay
  # above 5 for 10 minutes.
  expr: >-
    sum(rate(gitpod_server_api_connections_total[2m])) by (pod)
    > 5
  for: 10m
  labels:
    # Kept at warning (team-internal channel) until the alert is fine-tuned.
    severity: warning
    team: webapp
  annotations:
    runbook_url: >-
      https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionRateHigh.md
    summary: >-
      The websocket connection rate is higher than usual.
      Investigation required.
    description: >-
      Websocket connection rate high
85
86
- alert: ServerHighCPUUsage
  # Reasoning: high rates of CPU consumption should only be temporary.
  # Per cluster, the average 5m CPU usage rate across containers of pods
  # matching server-.* (excluding container="POD") must stay above 0.4 for
  # 10 minutes.
  expr: avg(rate(container_cpu_usage_seconds_total{container!="POD", pod=~"server-.*"}[5m])) by (cluster) > 0.4
  for: 10m
  labels:
    # Kept at warning (team-internal channel) until the alert is fine-tuned.
    severity: warning
    team: webapp
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesHighCPUUsage.md
    summary: Server has excessive CPU usage.
    # Typo fix in the user-facing text: "consumming" -> "consuming".
    description: Server is consuming too much CPU. Please investigate.
    dashboard_url: https://grafana.gitpod.io/d/6581e46e4e5c7ba40a07646395ef7b23/kubernetes-compute-resources-pod?var-cluster={{ $labels.cluster }}&var-namespace=default
99
100
- alert: WebAppServicesCrashlooping
  # Reasoning: alert if any matching pod has been crash looping for more
  # than 5 minutes.
  # The aggregation keeps namespace and container in addition to cluster and
  # pod: the description below templates those labels, and a sum() that
  # groups only by (cluster, pod) drops them, leaving them empty in the
  # rendered alert.
  expr: sum(increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (cluster, namespace, pod, container) > 0
  # Five minutes sounds high, but it is the only value that is higher than
  # recent history.
  for: 5m
  labels:
    severity: critical
    team: webapp
    dedicated: included
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md
    summary: Pod is crash looping in {{ $labels.cluster }}.
    description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is stuck in crashlooping for at least 5 minutes
113
114
- alert: WebAppServicesCrashloopingInternal
  # Reasoning: alert if any matching pod has been crash looping for more
  # than 1 minute.
  # Used for fine-tuning the WebAppServicesCrashlooping alert.
  # The aggregation keeps namespace and container in addition to cluster and
  # pod: the description below templates those labels, and a sum() that
  # groups only by (cluster, pod) drops them, leaving them empty in the
  # rendered alert.
  expr: sum(increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (cluster, namespace, pod, container) > 0
  # Let's be more ambitious than 5m.
  for: 1m
  labels:
    # Kept at warning (team-internal channel) until these values can be
    # promoted to critical.
    severity: warning
    team: webapp
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md
    summary: Pod is crash looping in {{ $labels.cluster }}.
    description: "FINE TUNE: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is stuck in crashlooping for at least 1 minute"
128
129
- alert: StartWorkspace_InternalErrors
  # Reasoning: per cluster, more than 5 startWorkspace API calls with a
  # statusCode matching 5.* within a 1m window, sustained for 5 minutes.
  expr: sum(increase(gitpod_server_api_calls_total{method="startWorkspace", statusCode=~"5.*"}[1m])) by (cluster) > 5
  for: 5m
  labels:
    severity: critical
    team: webapp
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/StartWorkspace_InternalErrors.md
    summary: Failing to start workspaces in {{ $labels.cluster }}.
    # $value is the increase over the expression's 1m window, so the text
    # reports "the last minute" rather than the 5m "for" duration.
    description: Server failed to start {{ printf "%.2f" $value }} workspaces in {{ $labels.cluster }} in the last minute
139
140