Path: blob/main/operations/observability/mixins/meta/rules/server.yaml
2500 views
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
# Licensed under the GNU Affero General Public License (AGPL).
# See License.AGPL.txt in the project root for license information.

# PrometheusRule with a single rule group, "server", holding the alerting
# rules below. Every rule carries a severity label and a runbook_url
# annotation; some additionally carry team/dedicated labels.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: server-monitoring-rules
spec:
  groups:
    - name: server
      rules:
        # Fires when the per-cluster sum of open websocket connections stays
        # at or above 10000 for 10 minutes. ">=" is used instead of "==" so
        # the alert also matches when the sum exceeds 10000; exact equality
        # would only match a sum of precisely 10000 on every evaluation.
        - alert: WebsocketConnectionsNotClosing
          expr: sum(server_websocket_connection_count) by (cluster) >= 10000
          for: 10m
          labels:
            severity: critical
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionsNotClosing.md
            summary: Open websocket connections are not closing for the last 10 minutes and accumulating in {{ $labels.cluster }}.
            description: We have accumulated {{ printf "%.2f" $value }} open websocket connections.

        # Fires when the 20-minute average of the event loop lag reported by
        # job "server" stays above 0.35s for 5 minutes.
        - alert: ServerEventLoopLagTooHigh
          expr: avg_over_time(nodejs_eventloop_lag_seconds{job="server"}[20m]) > 0.35
          for: 5m
          labels:
            severity: critical
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/ServerEventLoopLagTooHigh.md
            summary: Server accumulated too much "event loop lag" on {{ $labels.cluster }}. The webapp will become unresponsive if we don't act here.
            description: Server has accumulated {{ printf "%.2f" $value }}s event loop lag.

        # Ratio of failed instance starts to all (failed + successful) starts.
        # Failures with reason imageBuildFailed, imageBuildFailedUser or
        # scmAccessFailed are excluded from the failure count.
        - alert: InstanceStartFailures
          # Reasoning: 10% failure rate over 5m
          expr: >-
            sum(increase(gitpod_server_instance_starts_failed_total{reason!~"imageBuildFailed|imageBuildFailedUser|scmAccessFailed"}[5m]))
            /
            (sum(increase(gitpod_server_instance_starts_failed_total{reason!~"imageBuildFailed|imageBuildFailedUser|scmAccessFailed"}[5m]))
            +
            sum(increase(gitpod_server_instance_starts_success_total[5m])))
            > 0.1
          for: 30s
          labels:
            severity: critical
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/InstanceStartFailures.md
            summary: Server tries to start an instance, but cannot for whatever reason. Investigation required.
            description: Server cannot start workspace instances on workspace clusters.

        # Rollout alerts

        # Per-cluster share of API calls whose statusCode does not start with
        # 2, 3 or 4 and is not 640; fires above 4% for 5 minutes.
        - alert: JsonRpcApiErrorRates
          expr: >-
            sum(rate(gitpod_server_api_calls_total{statusCode!~"2.*|3.*|4.*|640"}[5m])) by (cluster)
            /
            sum(rate(gitpod_server_api_calls_total[5m])) by (cluster)
            > 0.04
          for: 5m
          labels:
            severity: critical
            team: webapp
            dedicated: included
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodApiErrorRate.md
            summary: The error rate of the JSON RPC API is high on {{ $labels.cluster }}. Investigation required.
            description: JSON RPC API error rate high

        # Per-cluster share of handled gRPC calls on services matching
        # "gitpod.v1.*" that ended with code Internal, Unknown or DataLoss;
        # fires above 1% for 5 minutes.
        - alert: GitpodV1APIServerErrors
          expr: >-
            sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unknown|DataLoss",
            grpc_service=~"gitpod.v1.*"}[5m])) by (cluster)
            /
            sum(rate(grpc_server_handled_total{grpc_service=~"gitpod.v1.*"}[5m])) by (cluster)
            > 0.01
          for: 5m
          labels:
            severity: critical
            team: webapp
            dedicated: included
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodV1APIServerErrors.md
            summary: gitpod.v1 API is returning higher number of errors on {{ $labels.cluster }}. Investigation required.
            description: gitpod.v1 API is returning higher number of errors

        # Per-pod rate of new API connections (2m window) above 5 for
        # 10 minutes.
        - alert: WebsocketConnectionRateHigh
          # Reasoning: the values are taken from past data
          expr: sum(rate(gitpod_server_api_connections_total[2m])) by (pod) > 5
          for: 10m
          labels:
            # sent to the team internal channel until we fine tuned it
            severity: warning
            team: webapp
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebsocketConnectionRateHigh.md
            summary: The websocket connection rate is higher than usual. Investigation required.
            description: Websocket connection rate high

        # Per-cluster average CPU usage rate of containers in pods matching
        # "server-.*" above 0.4 for 10 minutes.
        - alert: ServerHighCPUUsage
          # Reasoning: high rates of CPU consumption should only be temporary.
          expr: >-
            avg(rate(container_cpu_usage_seconds_total{container!="POD",
            pod=~"server-.*"}[5m])) by (cluster) > 0.4
          for: 10m
          labels:
            # sent to the team internal channel until we fine tuned it
            severity: warning
            team: webapp
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesHighCPUUsage.md
            summary: Server has excessive CPU usage.
            description: Server is consuming too much CPU. Please investigate.
            dashboard_url: https://grafana.gitpod.io/d/6581e46e4e5c7ba40a07646395ef7b23/kubernetes-compute-resources-pod?var-cluster={{ $labels.cluster }}&var-namespace=default

        # The expression aggregates by (cluster, pod), so those are the only
        # labels available to the annotation templates below.
        - alert: WebAppServicesCrashlooping
          # Reasoning: alert if any pod is stuck in crashlooping more than 5 minute.
          expr: >-
            sum(increase(kube_pod_container_status_restarts_total{container!="POD",
            pod=~"(content-service|dashboard|db|proxy|server|ws-manager-bridge|usage)-.*"}[5m]))
            by (cluster, pod) > 0
          # Five minutes sound high, but that's the only value that's higher than recent history
          for: 5m
          labels:
            severity: critical
            team: webapp
            dedicated: included
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md
            summary: Pod is crash looping in {{ $labels.cluster }}.
            description: Pod {{ $labels.pod }} in {{ $labels.cluster }} is stuck in crashlooping for at least 5 minutes

        # Same expression as WebAppServicesCrashlooping with a shorter "for"
        # and warning severity; only cluster and pod labels are available to
        # the annotation templates.
        - alert: WebAppServicesCrashloopingInternal
          # Reasoning: alert if any pod is stuck in crashlooping more than 1 minute.
          # Used for fine tuning the alert above (WebAppServicesCrashlooping)!
          expr: >-
            sum(increase(kube_pod_container_status_restarts_total{container!="POD",
            pod=~"(content-service|dashboard|db|proxy|server|ws-manager-bridge|usage)-.*"}[5m]))
            by (cluster, pod) > 0
          # Let's be more ambitious than 5m
          for: 1m
          labels:
            # sent to the team internal channel until we can propagate these values to CRITICAL
            severity: warning
            team: webapp
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesCrashlooping.md
            summary: Pod is crash looping in {{ $labels.cluster }}.
            description: "FINE TUNE: Pod {{ $labels.pod }} in {{ $labels.cluster }} is stuck in crashlooping for at least 1 minute"

        # Fires when more than 5 startWorkspace calls per cluster ended with a
        # 5xx statusCode within a 1-minute window, sustained for 5 minutes.
        # $value is the 1-minute increase, which the description reflects.
        - alert: StartWorkspace_InternalErrors
          expr: sum(increase(gitpod_server_api_calls_total{method="startWorkspace", statusCode=~"5.*"}[1m])) by (cluster) > 5
          for: 5m
          labels:
            severity: critical
            team: webapp
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/StartWorkspace_InternalErrors.md
            summary: Failing to start workspaces in {{ $labels.cluster }}.
            description: Server failed to start {{ printf "%.2f" $value }} workspaces in {{ $labels.cluster }} in the last minute