Path: blob/main/operations/observability/mixins/workspace/rules/central/workspaces.yaml
2506 views
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
# Licensed under the GNU Affero General Public License (AGPL).
# See License.AGPL.txt in the project root for license information.

# PrometheusRule holding the workspace alerts. Every rule that filters on the
# `cluster` label excludes clusters whose name matches "ephemeral.*".
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: workspace-monitoring-central-rules
spec:
  groups:

    - name: workspace-alerts
      rules:
        # Fires when, for 20 minutes, at least 20% of a cluster's Regular
        # workspaces are in phase "Stopping" AND more than 15 are stopping.
        # All three aggregations group by `cluster` so that both sides of
        # `and` carry identical label sets (a requirement for `and` to match)
        # and so that `$labels.cluster` is populated in the annotations.
        # `$value` is a 0-1 ratio, hence `humanizePercentage`.
        - alert: GitpodWorkspaceStuckOnStoppingMk2
          labels:
            severity: critical
            dedicated: included
          for: 20m
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceStuckOnStopping.md
            summary: '{{ $value | humanizePercentage }} of Regular workspaces stopping in {{ $labels.cluster }}'
            description: '{{ $value | humanizePercentage }} of Regular workspaces stopping in {{ $labels.cluster }} is too high.'
          expr: |
            (
              sum by (cluster) (
                gitpod_ws_manager_mk2_workspace_phase_total{type="Regular", phase="Stopping", cluster!~"ephemeral.*"}
              )
              /
              sum by (cluster) (
                gitpod_ws_manager_mk2_workspace_phase_total{type="Regular", cluster!~"ephemeral.*"}
              )
            ) >= .2
            and
            sum by (cluster) (
              gitpod_ws_manager_mk2_workspace_phase_total{type="Regular", phase="Stopping", cluster!~"ephemeral.*"}
            ) > 15

        # Fires when, for 15 minutes, the per-cluster ratio of failed Regular
        # workspace starts to all Regular workspace starts (each measured as
        # a 5m increase) stays above 0.2.
        - alert: GitpodWorkspaceHighStartFailureRate
          labels:
            severity: critical
            dedicated: included
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceHighFailureRate.md
            summary: 'Workspaces are failing to start in cluster {{ $labels.cluster }}.'
            description: For the last 15 minutes more than 20% of new workspaces have failed to start
          for: 15m
          expr: |
            sum by (cluster) (
              increase(gitpod_ws_manager_mk2_workspace_starts_failure_total{type="Regular", cluster!~"ephemeral.*"}[5m])
            )
            /
            sum by (cluster) (
              increase(gitpod_ws_manager_mk2_workspace_starts_total{type="Regular", cluster!~"ephemeral.*"}[5m])
            )
            > 0.2

        # Fires as soon as the 5m per-second rate of Regular workspace stops
        # with reason="failed" reaches 1 on any series (no `for` hold time).
        - alert: GitpodWorkspaceHighFailureRateMk2
          labels:
            severity: critical
            dedicated: included
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceHighFailureRate.md
            summary: 'Workspaces are failing in cluster {{ $labels.cluster }}.'
            description: Multiple workspaces are failing for the last 5 minutes
          expr: |
            rate(gitpod_ws_manager_mk2_workspace_stops_total{reason="failed", type="Regular", cluster!~"ephemeral.*"}[5m]) >= 1

        # Fires when, for 10 minutes, the bridge status-update rate is zero
        # while StartWorkspace gRPC calls are still being handled. Both sides
        # are label-less sums, so `and` matches them directly.
        - alert: GitpodWorkspaceStatusUpdatesCeased
          labels:
            severity: warning
          for: 10m
          annotations:
            summary: meta has not seen a workspace update in the last 10 minutes despite starting workspaces
            description: meta has not seen a workspace update in the last 10 minutes despite starting workspaces
          expr: |
            sum(rate(gitpod_ws_manager_bridge_status_updates_total[1m])) == 0
            and
            sum(rate(grpc_client_handled_total{grpc_method="StartWorkspace", grpc_service="wsman.WorkspaceManager"}[1m])) != 0

        # Fires when a series reports more than 20 Regular workspaces in
        # phase "Pending" for 15 minutes.
        - alert: GitpodTooManyWorkspacesInPendingMk2
          labels:
            severity: critical
            dedicated: included
          for: 15m
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodTooManyWorkspacesInPending.md
            summary: workspaces are in pending phase
            description: 'regular workspaces are stuck in pending phase in cluster {{ $labels.cluster }}.'
          expr: |
            gitpod_ws_manager_mk2_workspace_phase_total{phase="Pending", type="Regular", cluster!~"ephemeral.*"} > 20

        # Same check as above, for workspaces of type "Prebuild".
        - alert: GitpodTooManyPrebuildsInPendingMk2
          labels:
            severity: critical
            dedicated: included
          for: 15m
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodTooManyPrebuildsInPending.md
            summary: workspaces are in pending phase
            description: 'prebuilds are stuck in pending phase in cluster {{ $labels.cluster }}.'
          expr: |
            gitpod_ws_manager_mk2_workspace_phase_total{phase="Pending", type="Prebuild", cluster!~"ephemeral.*"} > 20

        # Fires when a `ws-*` pod in namespace "default" has had a deletion
        # timestamp for more than 24 hours (24 * 60 * 60 seconds).
        # `cluster` is kept in the aggregation because the description
        # renders `$labels.cluster`.
        - alert: GitpodWorkspaceTooLongTerminating
          labels:
            severity: warning
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceTooLongTerminating.md
            summary: workspace pods are terminating for too long.
            description: 'workspace pods are terminating for too long in cluster {{ $labels.cluster }}.'
          expr: |
            sum by (cluster, pod) (
              time() - kube_pod_deletion_timestamp{namespace="default", pod=~"^ws-.*", cluster!~"ephemeral.*"}
            ) > 24 * 60 * 60

        # Fires when, for 12 hours, the image-build success ratio
        # (1 - failed/total, both as 4h rates) stays below 0.60.
        # NOTE(review): the sums carry no `cluster` label, so
        # `$labels.cluster` renders empty in these annotations - confirm
        # whether a per-cluster breakdown is wanted before changing it.
        - alert: GitpodImagebuildDoneSuccess
          labels:
            severity: warning
            team: engine
          for: 12h
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodImagebuildDoneSuccess.md
            summary: 'imagebuilds done are failing at a high rate in cluster {{ $labels.cluster }}.'
            description: 'imagebuilds are not reaching done at too high of a rate in cluster {{ $labels.cluster }}.'
          expr: |
            (
              1 - (
                sum(rate(gitpod_image_builder_builds_done_total{success="false", cluster!~"ephemeral.*"}[4h]))
                /
                sum(rate(gitpod_image_builder_builds_done_total{cluster!~"ephemeral.*"}[4h]))
              )
            ) < 0.60