# Path: blob/main/operations/observability/mixins/meta/rules/usage.yaml
# 2500 views
# Copyright (c) 2022 Gitpod GmbH. All rights reserved.
# Licensed under the GNU Affero General Public License (AGPL).
# See License.AGPL.txt in the project root for license information.

# PrometheusRule defining the alerting rules of the "usage" group.
# Every alert below is labelled severity=warning and team=webapp.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: usage-monitoring-rules
spec:
  groups:
    - name: usage
      rules:
        # More than 1 non-OK response of usage.v1.UsageService/ReconcileUsage
        # counted over a 30m window, with the condition holding for 30m.
        - alert: GitpodUsageReconcileUsageFailures
          expr: sum(increase(grpc_server_handled_total{grpc_service="usage.v1.UsageService", grpc_method="ReconcileUsage", grpc_code!="OK"}[30m])) > 1
          for: 30m
          labels:
            severity: warning
            team: webapp
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodUsageReconcileUsageFailures.md
            summary: There are failed usage reconciliations.
            description: We have accumulated {{ printf "%.2f" $value }} failures. This affects how up-to-date usage data is.

        # More than 1 non-OK response of usage.v1.BillingService/ReconcileInvoices
        # counted over a 30m window, with the condition holding for 30m.
        - alert: GitpodUsageReconcileInvoicesFailures
          expr: sum(increase(grpc_server_handled_total{grpc_service="usage.v1.BillingService", grpc_method="ReconcileInvoices", grpc_code!="OK"}[30m])) > 1
          for: 30m
          labels:
            severity: warning
            team: webapp
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodUsageReconcileInvoicesFailures.md
            summary: There are failed Stripe invoice reconciliations.
            description: We have accumulated {{ printf "%.2f" $value }} failures. This affects how much customers will be billed.

        # More than 1 non-OK response of usage.v1.BillingService/FinalizeInvoice
        # counted over a 30m window, with the condition holding for 30m.
        - alert: GitpodUsageBillingServiceFinalizeInvoiceFailures
          expr: sum(increase(grpc_server_handled_total{grpc_service="usage.v1.BillingService", grpc_method="FinalizeInvoice", grpc_code!="OK"}[30m])) > 1
          for: 30m
          labels:
            severity: warning
            team: webapp
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodUsageBillingServiceFinalizeInvoiceFailures.md
            summary: Invoice finalization is failing. We are not balancing out user/team usage.
            description: We have accumulated {{ printf "%.2f" $value }} failures. This affects if customers have their balance reset and can therefore start new workspaces.

        # time() minus the last successful ledger completion time exceeds
        # 60 * 60 seconds (one hour), with the condition holding for 30m.
        # NOTE(review): presumably the metric is a Unix timestamp in seconds - confirm.
        - alert: GitpodUsageTooLongSinceLastSuccessfulLedgerReconciliation
          expr: (time() - gitpod_usage_ledger_last_completed_time{outcome="success"}) > 60 * 60
          for: 30m
          labels:
            severity: warning
            team: webapp
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodUsageTooLongSinceLastSuccessfulLedgerReconciliation.md
            summary: Usage reconciliation has not run successfully for {{ printf "%.2f" $value }} seconds. Usage data is stale.
            description: We have not executed scheduled usage reconciliation for {{ printf "%.2f" $value }} seconds. We expect the data to update every 15 minutes to avoid stale usage records and stale invoices.

        # Per-cluster average of the 5m CPU usage rate over containers
        # (excluding container "POD") in pods matching "usage-.*" stays
        # above 0.2 for 10m.
        - alert: UsageHighCPUUsage
          # Reasoning: high rates of CPU consumption should only be temporary.
          expr: avg(rate(container_cpu_usage_seconds_total{container!="POD", pod=~"usage-.*"}[5m])) by (cluster) > 0.2
          for: 10m
          labels:
            # Sent to the team-internal channel until we have fine-tuned it.
            severity: warning
            team: webapp
          annotations:
            runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WebAppServicesHighCPUUsage.md
            summary: Usage has excessive CPU usage.
            description: Usage is consuming too much CPU. Please investigate.
            dashboard_url: https://grafana.gitpod.io/d/6581e46e4e5c7ba40a07646395ef7b23/kubernetes-compute-resources-pod?var-cluster={{ $labels.cluster }}&var-namespace=default