Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
gitpod-io
GitHub Repository: gitpod-io/gitpod
Path: blob/main/operations/observability/mixins/workspace/rules/central/workspaces.yaml
2506 views
1
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
2
# Licensed under the GNU Affero General Public License (AGPL).
3
# See License.AGPL.txt in the project root for license information.
4
5
# PrometheusRule custom resource (monitoring.coreos.com/v1) that carries the
# workspace alerting rules listed under its rule group.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: workspace-monitoring-central-rules
  # Labels on the rule object itself; presumably what the Prometheus
  # resource's rule selector matches on -- verify against the operator config.
  labels:
    prometheus: k8s
    role: alert-rules
spec:
  groups:
    # Rule group holding the workspace alerts that follow.
    - name: workspace-alerts
      rules:
17
# Fires when, per cluster, at least 20% of Regular workspaces are in the
# Stopping phase AND more than 15 of them are stopping, sustained for 20m.
- alert: GitpodWorkspaceStuckOnStoppingMk2
  labels:
    severity: critical
    dedicated: included
  for: 20m
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceStuckOnStopping.md
    summary: '{{ printf "%.2f" $value }}% of Regular workspaces stopping in {{ $labels.cluster }}'
    description: '{{ printf "%.2f" $value }}% of Regular workspaces stopping in {{ $labels.cluster }} is too high.'
  # Both operands are aggregated `by (cluster)`:
  #   * the `and` set operator only pairs series whose label sets are equal,
  #     so a label-less ratio could never match a `without(phase)` result;
  #   * the annotations render `$labels.cluster`, which must survive the
  #     aggregation to be non-empty.
  # The ratio is multiplied by 100 because the annotations print `$value`
  # immediately followed by a percent sign.
  expr: |
    (
      100
      * sum by (cluster) (gitpod_ws_manager_mk2_workspace_phase_total{type="Regular", phase="Stopping", cluster!~"ephemeral.*"})
      / sum by (cluster) (gitpod_ws_manager_mk2_workspace_phase_total{type="Regular", cluster!~"ephemeral.*"})
    ) >= 20
    and
    sum by (cluster) (
      gitpod_ws_manager_mk2_workspace_phase_total{type="Regular", phase="Stopping", cluster!~"ephemeral.*"}
    ) > 15
34
35
# Fires when, per cluster, failed Regular workspace starts exceed 20% of all
# Regular workspace starts (5m increase windows), sustained for 15m.
- alert: GitpodWorkspaceHighStartFailureRate
  labels:
    severity: critical
    dedicated: included
  for: 15m
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceHighFailureRate.md
    summary: 'Workspaces are failing to start in cluster {{ $labels.cluster }}.'
    description: For the last 15 minutes more than 20% of new workspaces have failed to start
  # Failure ratio = failed starts / all starts, both grouped by cluster so the
  # division pairs matching clusters and `$labels.cluster` is available.
  expr: |
    sum by (cluster) (increase(gitpod_ws_manager_mk2_workspace_starts_failure_total{type="Regular", cluster!~"ephemeral.*"}[5m]))
    /
    sum by (cluster) (increase(gitpod_ws_manager_mk2_workspace_starts_total{type="Regular", cluster!~"ephemeral.*"}[5m]))
    > 0.2
46
47
# Fires as soon as the 5m per-second rate of Regular workspace stops with
# reason="failed" reaches 1 or more (the rule has no `for` hold period).
- alert: GitpodWorkspaceHighFailureRateMk2
  expr: |
    rate(
      gitpod_ws_manager_mk2_workspace_stops_total{reason="failed", type="Regular", cluster!~"ephemeral.*"}[5m]
    ) >= 1
  # NOTE(review): rate() is per second, so `>= 1` is roughly 300 failed stops
  # per 5m window per series -- confirm this threshold is what is intended.
  labels:
    severity: critical
    dedicated: included
  annotations:
    summary: 'Workspaces are failing in cluster {{ $labels.cluster }}.'
    description: Multiple workspaces are failing for the last 5 minutes
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceHighFailureRate.md
57
58
# Warns when the summed 1m rate of bridge status updates is zero while the
# summed 1m rate of handled StartWorkspace gRPC calls is non-zero, for 10m.
- alert: GitpodWorkspaceStatusUpdatesCeased
  expr: |
    sum(rate(gitpod_ws_manager_bridge_status_updates_total[1m])) == 0
    AND
    sum(rate(grpc_client_handled_total{grpc_method="StartWorkspace", grpc_service="wsman.WorkspaceManager"}[1m])) != 0
  for: 10m
  labels:
    severity: warning
  # Both sides of the AND are label-less sums, so they pair with each other
  # and the resulting alert carries no series labels of its own.
  annotations:
    description: meta has not seen a workspace update in the last 10 minutes despite starting workspaces
    summary: meta has not seen a workspace update in the last 10 minutes despite starting workspaces
67
68
# Fires when gitpod_ws_manager_mk2_workspace_phase_total for
# phase="Pending", type="Regular" stays above 20 for 15m on any series.
- alert: GitpodTooManyWorkspacesInPendingMk2
  expr: |
    gitpod_ws_manager_mk2_workspace_phase_total{phase="Pending", type="Regular", cluster!~"ephemeral.*"}
    > 20
  for: 15m
  labels:
    dedicated: included
    severity: critical
  # The selector is not aggregated, so the series' own labels (including
  # `cluster`, used in the description) are kept on the alert.
  annotations:
    summary: workspaces are in pending phase
    description: 'regular workspaces are stuck in pending phase in cluster {{ $labels.cluster }}.'
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodTooManyWorkspacesInPending.md
79
80
# Fires when gitpod_ws_manager_mk2_workspace_phase_total for
# phase="Pending", type="Prebuild" stays above 20 for 15m on any series.
- alert: GitpodTooManyPrebuildsInPendingMk2
  labels:
    severity: critical
    dedicated: included
  for: 15m
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodTooManyPrebuildsInPending.md
    # The summary names prebuilds (not workspaces) so it agrees with the
    # type="Prebuild" selector and with the description below.
    summary: prebuilds are in pending phase
    description: 'prebuilds are stuck in pending phase in cluster {{ $labels.cluster }}.'
  expr: |
    gitpod_ws_manager_mk2_workspace_phase_total{phase="Pending", type="Prebuild", cluster!~"ephemeral.*"} > 20
91
92
# Warns when a pod named ws-* in namespace "default" has had a deletion
# timestamp for more than 24 hours (no `for` hold period).
- alert: GitpodWorkspaceTooLongTerminating
  labels:
    severity: warning
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceTooLongTerminating.md
    summary: workspace pods are terminating for too long.
    description: 'workspace pods are terminating for too long in cluster {{ $labels.cluster }}.'
  # Grouped by (cluster, pod) rather than by (pod) alone: the description
  # renders `$labels.cluster`, which only exists if the aggregation keeps it,
  # and equally named pods in different clusters must not be summed together.
  expr: |
    sum by (cluster, pod) (
      time() - kube_pod_deletion_timestamp{namespace="default", pod=~"^ws-.*", cluster!~"ephemeral.*"}
    ) > 24 * 60 * 60
101
102
# Warns when, per cluster, the image-build success ratio over 4h windows
# (1 - failed builds / all finished builds) stays below 60% for 12h.
- alert: GitpodImagebuildDoneSuccess
  labels:
    severity: warning
    team: engine
  for: 12h
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodImagebuildDoneSuccess.md
    summary: 'imagebuilds done are failing at a high rate in cluster {{ $labels.cluster }}.'
    description: 'imagebuilds are not reaching done at too high of a rate in cluster {{ $labels.cluster }}.'
  # Both sums are grouped by cluster so that the division pairs matching
  # clusters and `$labels.cluster`, rendered by both annotations, is populated.
  expr: |
    (
      1 - (
        sum by (cluster) (rate(gitpod_image_builder_builds_done_total{success="false", cluster!~"ephemeral.*"}[4h]))
        /
        sum by (cluster) (rate(gitpod_image_builder_builds_done_total{cluster!~"ephemeral.*"}[4h]))
      )
    ) < 0.60
113
114