Path: blob/main/components/ws-manager-mk2/controllers/metrics.go
2498 views
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.1// Licensed under the GNU Affero General Public License (AGPL).2// See License.AGPL.txt in the project root for license information.34package controllers56import (7"context"8"fmt"9"strings"10"time"1112wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"13"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"14"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"15workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"16"github.com/go-logr/logr"17lru "github.com/hashicorp/golang-lru"18"github.com/prometheus/client_golang/prometheus"19corev1 "k8s.io/api/core/v1"20"sigs.k8s.io/controller-runtime/pkg/client"21"sigs.k8s.io/controller-runtime/pkg/log"22)2324const (25maintenanceEnabled string = "maintenance_enabled"26workspaceStartupSeconds string = "workspace_startup_seconds"27workspacePendingSeconds string = "workspace_pending_seconds"28workspaceCreatingSeconds string = "workspace_creating_seconds"29workspaceStartFailuresTotal string = "workspace_starts_failure_total"30workspaceFailuresTotal string = "workspace_failure_total"31workspaceStopsTotal string = "workspace_stops_total"32workspaceRecreationsTotal string = "workspace_recreations_total"33workspaceBackupsTotal string = "workspace_backups_total"34workspaceBackupFailuresTotal string = "workspace_backups_failure_total"35workspaceRestoresTotal string = "workspace_restores_total"36workspaceRestoresFailureTotal string = "workspace_restores_failure_total"37workspaceNodeUtilization string = "workspace_node_utilization"38workspaceActivityTotal string = "workspace_activity_total"39)4041type StopReason string4243const (44StopReasonFailed = "failed"45StopReasonStartFailure = "start-failure"46StopReasonAborted = "aborted"47StopReasonOutOfSpace = "out-of-space"48StopReasonTimeout = "timeout"49StopReasonTabClosed = "tab-closed"50StopReasonRegular = "regular-stop"51)5253type controllerMetrics struct {54startupTimeHistVec *prometheus.HistogramVec55pendingTimeHistVec *prometheus.HistogramVec56creatingTimeHistVec *prometheus.HistogramVec57totalStartsFailureCounterVec *prometheus.CounterVec58totalFailuresCounterVec *prometheus.CounterVec59totalStopsCounterVec *prometheus.CounterVec60totalRecreationsCounterVec *prometheus.CounterVec6162totalBackupCounterVec *prometheus.CounterVec63totalBackupFailureCounterVec *prometheus.CounterVec64totalRestoreCounterVec *prometheus.CounterVec65totalRestoreFailureCounterVec *prometheus.CounterVec6667workspacePhases *phaseTotalVec68timeoutSettings *timeoutSettingsVec6970workspaceNodeUtilization *nodeUtilizationVec7172workspaceActivityTotal *workspaceActivityVec7374// used to prevent recording metrics multiple times75cache *lru.Cache76}7778func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {79cache, err := lru.New(6000)80if err != nil {81return nil, err82}8384return &controllerMetrics{85startupTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{86Namespace: metricsNamespace,87Subsystem: metricsWorkspaceSubsystem,88Name: workspaceStartupSeconds,89Help: "time it took for workspace pods to reach the running phase",90Buckets: prometheus.ExponentialBuckets(2, 2, 10),91}, []string{"type", "class"}),92pendingTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{93Namespace: metricsNamespace,94Subsystem: metricsWorkspaceSubsystem,95Name: workspacePendingSeconds,96Help: "time the workspace spent in pending",97Buckets: prometheus.ExponentialBuckets(2, 2, 10),98}, []string{"type", "class"}),99creatingTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{100Namespace: metricsNamespace,101Subsystem: metricsWorkspaceSubsystem,102Name: workspaceCreatingSeconds,103Help: "time the workspace spent in creation",104Buckets: prometheus.ExponentialBuckets(2, 2, 10),105}, []string{"type", "class"}),106totalStartsFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{107Namespace: metricsNamespace,108Subsystem: metricsWorkspaceSubsystem,109Name: workspaceStartFailuresTotal,110Help: "total number of workspaces that failed to start",111}, []string{"type", "class"}),112totalFailuresCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{113Namespace: metricsNamespace,114Subsystem: metricsWorkspaceSubsystem,115Name: workspaceFailuresTotal,116Help: "total number of workspaces that had a failed condition",117}, []string{"type", "class"}),118totalStopsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{119Namespace: metricsNamespace,120Subsystem: metricsWorkspaceSubsystem,121Name: workspaceStopsTotal,122Help: "total number of workspaces stopped",123}, []string{"reason", "type", "class"}),124totalRecreationsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{125Namespace: metricsNamespace,126Subsystem: metricsWorkspaceSubsystem,127Name: workspaceRecreationsTotal,128Help: "total number of workspace recreations",129}, []string{"type", "class", "attempt"}),130131totalBackupCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{132Namespace: metricsNamespace,133Subsystem: metricsWorkspaceSubsystem,134Name: workspaceBackupsTotal,135Help: "total number of workspace backups",136}, []string{"type", "class"}),137totalBackupFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{138Namespace: metricsNamespace,139Subsystem: metricsWorkspaceSubsystem,140Name: workspaceBackupFailuresTotal,141Help: "total number of workspace backup failures",142}, []string{"type", "class"}),143totalRestoreCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{144Namespace: metricsNamespace,145Subsystem: metricsWorkspaceSubsystem,146Name: workspaceRestoresTotal,147Help: "total number of workspace restores",148}, []string{"type", "class"}),149totalRestoreFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{150Namespace: metricsNamespace,151Subsystem: metricsWorkspaceSubsystem,152Name: workspaceRestoresFailureTotal,153Help: "total number of workspace restore failures",154}, []string{"type", "class"}),155156workspacePhases: newPhaseTotalVec(r),157timeoutSettings: newTimeoutSettingsVec(r),158workspaceNodeUtilization: newNodeUtilizationVec(r),159workspaceActivityTotal: newWorkspaceActivityVec(r),160cache: cache,161}, nil162}163164func (m *controllerMetrics) recordWorkspaceStartupTime(log *logr.Logger, ws *workspacev1.Workspace) {165class := ws.Spec.Class166tpe := string(ws.Spec.Type)167168hist, err := m.startupTimeHistVec.GetMetricWithLabelValues(tpe, class)169if err != nil {170log.Error(err, "could not record workspace startup time", "type", tpe, "class", class)171}172173duration := time.Since(ws.CreationTimestamp.Time)174hist.Observe(float64(duration.Seconds()))175}176177func (m *controllerMetrics) recordWorkspacePendingTime(log *logr.Logger, ws *workspacev1.Workspace, pendingTs time.Time) {178class := ws.Spec.Class179tpe := string(ws.Spec.Type)180181hist, err := m.pendingTimeHistVec.GetMetricWithLabelValues(tpe, class)182if err != nil {183log.Error(err, "could not record workspace pending time", "type", tpe, "class", class)184}185186hist.Observe(time.Since(pendingTs).Seconds())187}188189func (m *controllerMetrics) recordWorkspaceCreatingTime(log *logr.Logger, ws *workspacev1.Workspace, creatingTs time.Time) {190class := ws.Spec.Class191tpe := string(ws.Spec.Type)192193hist, err := m.creatingTimeHistVec.GetMetricWithLabelValues(tpe, class)194if err != nil {195log.Error(err, "could not record workspace creating time", "type", tpe, "class", class)196}197198hist.Observe(time.Since(creatingTs).Seconds())199}200201func (m *controllerMetrics) countWorkspaceStartFailures(log *logr.Logger, ws *workspacev1.Workspace) {202class := ws.Spec.Class203tpe := string(ws.Spec.Type)204205m.totalStartsFailureCounterVec.WithLabelValues(tpe, class).Inc()206}207208func (m *controllerMetrics) countWorkspaceFailure(log *logr.Logger, ws *workspacev1.Workspace) {209class := ws.Spec.Class210tpe := string(ws.Spec.Type)211212m.totalFailuresCounterVec.WithLabelValues(tpe, class).Inc()213}214215func (m *controllerMetrics) countWorkspaceStop(log *logr.Logger, ws *workspacev1.Workspace) {216var reason string217if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)); c != nil {218reason = StopReasonFailed219if !ws.IsConditionTrue(workspacev1.WorkspaceConditionEverReady) {220// Don't record 'failed' if there was a start failure.221reason = StopReasonStartFailure222} else if strings.Contains(c.Message, "Pod ephemeral local storage usage exceeds the total limit of containers") {223reason = StopReasonOutOfSpace224}225} else if ws.IsConditionTrue(workspacev1.WorkspaceConditionAborted) {226reason = StopReasonAborted227} else if ws.IsConditionTrue(workspacev1.WorkspaceConditionTimeout) {228reason = StopReasonTimeout229} else if ws.IsConditionTrue(workspacev1.WorkspaceConditionClosed) {230reason = StopReasonTabClosed231} else {232reason = StopReasonRegular233}234235class := ws.Spec.Class236tpe := string(ws.Spec.Type)237238log.Info("workspace stop reason", "type", tpe, "class", class, "reason", reason)239240m.totalStopsCounterVec.WithLabelValues(reason, tpe, class).Inc()241}242243func (m *controllerMetrics) countWorkspaceRecreations(log *logr.Logger, ws *workspacev1.Workspace) {244class := ws.Spec.Class245tpe := string(ws.Spec.Type)246attempt := fmt.Sprint(ws.Status.PodRecreated)247248m.totalRecreationsCounterVec.WithLabelValues(tpe, class, attempt).Inc()249}250251func (m *controllerMetrics) countTotalBackups(log *logr.Logger, ws *workspacev1.Workspace) {252class := ws.Spec.Class253tpe := string(ws.Spec.Type)254255m.totalBackupCounterVec.WithLabelValues(tpe, class).Inc()256}257258func (m *controllerMetrics) countTotalBackupFailures(log *logr.Logger, ws *workspacev1.Workspace) {259class := ws.Spec.Class260tpe := string(ws.Spec.Type)261262m.totalBackupFailureCounterVec.WithLabelValues(tpe, class).Inc()263}264265func (m *controllerMetrics) countTotalRestores(log *logr.Logger, ws *workspacev1.Workspace) {266class := ws.Spec.Class267tpe := string(ws.Spec.Type)268269m.totalRestoreCounterVec.WithLabelValues(tpe, class).Inc()270}271272func (m *controllerMetrics) countTotalRestoreFailures(log *logr.Logger, ws *workspacev1.Workspace) {273class := ws.Spec.Class274tpe := string(ws.Spec.Type)275276m.totalRestoreFailureCounterVec.WithLabelValues(tpe, class).Inc()277}278279func (m *controllerMetrics) containsWorkspace(ws *workspacev1.Workspace) bool {280return m.cache.Contains(ws.Name)281}282283func (m *controllerMetrics) rememberWorkspace(ws *workspacev1.Workspace, state *metricState) {284var s metricState285if state != nil {286s = *state287} else {288s = newMetricState(ws)289}290m.cache.Add(ws.Name, s)291}292293func (m *controllerMetrics) forgetWorkspace(ws *workspacev1.Workspace) {294m.cache.Remove(ws.Name)295}296297// metricState is used to track which metrics have been recorded for a workspace.298type metricState struct {299phase workspacev1.WorkspacePhase300pendingStartTime time.Time301creatingStartTime time.Time302recordedStartTime bool303recordedInitFailure bool304recordedStartFailure bool305recordedFailure bool306recordedContentReady bool307recordedBackupFailed bool308recordedBackupCompleted bool309recordedRecreations int310}311312func newMetricState(ws *workspacev1.Workspace) metricState {313return metricState{314phase: ws.Status.Phase,315// Here we assume that we've recorded metrics for the following states already if their conditions already exist.316// This is to prevent these from being re-recorded after the controller restarts and clears the metric state for317// each workspace.318recordedStartTime: ws.Status.Phase == workspacev1.WorkspacePhaseRunning,319recordedInitFailure: wsk8s.ConditionWithStatusAndReason(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, workspacev1.ReasonInitializationFailure),320recordedStartFailure: ws.Status.Phase == workspacev1.WorkspacePhaseStopped && isStartFailure(ws),321recordedFailure: ws.IsConditionTrue(workspacev1.WorkspaceConditionFailed),322recordedContentReady: ws.IsConditionTrue(workspacev1.WorkspaceConditionContentReady),323recordedBackupFailed: ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupFailure),324recordedBackupCompleted: ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupComplete),325recordedRecreations: ws.Status.PodRecreated,326}327}328329// getWorkspace returns the last recorded metric state for that workspace.330func (m *controllerMetrics) getWorkspace(log *logr.Logger, ws *workspacev1.Workspace) (bool, metricState) {331s, ok := m.cache.Get(ws.Name)332if !ok {333return false, metricState{}334}335336return true, s.(metricState)337}338339// Describe implements Collector. It will send exactly one Desc to the provided channel.340func (m *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {341m.startupTimeHistVec.Describe(ch)342m.pendingTimeHistVec.Describe(ch)343m.creatingTimeHistVec.Describe(ch)344m.totalStopsCounterVec.Describe(ch)345m.totalStartsFailureCounterVec.Describe(ch)346m.totalFailuresCounterVec.Describe(ch)347348m.totalBackupCounterVec.Describe(ch)349m.totalBackupFailureCounterVec.Describe(ch)350m.totalRestoreCounterVec.Describe(ch)351m.totalRestoreFailureCounterVec.Describe(ch)352353m.workspacePhases.Describe(ch)354m.timeoutSettings.Describe(ch)355m.workspaceNodeUtilization.Describe(ch)356m.workspaceActivityTotal.Describe(ch)357}358359// Collect implements Collector.360func (m *controllerMetrics) Collect(ch chan<- prometheus.Metric) {361m.startupTimeHistVec.Collect(ch)362m.pendingTimeHistVec.Collect(ch)363m.creatingTimeHistVec.Collect(ch)364m.totalStopsCounterVec.Collect(ch)365m.totalStartsFailureCounterVec.Collect(ch)366m.totalFailuresCounterVec.Collect(ch)367368m.totalBackupCounterVec.Collect(ch)369m.totalBackupFailureCounterVec.Collect(ch)370m.totalRestoreCounterVec.Collect(ch)371m.totalRestoreFailureCounterVec.Collect(ch)372373m.workspacePhases.Collect(ch)374m.timeoutSettings.Collect(ch)375m.workspaceNodeUtilization.Collect(ch)376m.workspaceActivityTotal.Collect(ch)377}378379// phaseTotalVec returns a gauge vector counting the workspaces per phase380type phaseTotalVec struct {381name string382desc *prometheus.Desc383reconciler *WorkspaceReconciler384}385386func newPhaseTotalVec(r *WorkspaceReconciler) *phaseTotalVec {387name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, "workspace_phase_total")388return &phaseTotalVec{389name: name,390desc: prometheus.NewDesc(name, "Current number of workspaces per phase", []string{"phase", "type", "class"}, prometheus.Labels(map[string]string{})),391reconciler: r,392}393}394395// Describe implements Collector. It will send exactly one Desc to the provided channel.396func (ptv *phaseTotalVec) Describe(ch chan<- *prometheus.Desc) {397ch <- ptv.desc398}399400// Collect implements Collector.401func (ptv *phaseTotalVec) Collect(ch chan<- prometheus.Metric) {402ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)403defer cancel()404405var workspaces workspacev1.WorkspaceList406err := ptv.reconciler.List(ctx, &workspaces, client.InNamespace(ptv.reconciler.Config.Namespace))407if err != nil {408return409}410411counts := make(map[string]int)412for _, ws := range workspaces.Items {413counts[string(ws.Spec.Type)+"::"+string(ws.Status.Phase)+"::"+ws.Spec.Class]++414}415416for key, count := range counts {417segs := strings.Split(key, "::")418tpe, phase, class := segs[0], segs[1], segs[2]419420metric, err := prometheus.NewConstMetric(ptv.desc, prometheus.GaugeValue, float64(count), phase, tpe, class)421if err != nil {422continue423}424425ch <- metric426}427}428429// timeoutSettingsVec provides a gauge of the currently active/inactive workspaces.430// Adding both up returns the total number of workspaces.431type timeoutSettingsVec struct {432name string433reconciler *WorkspaceReconciler434desc *prometheus.Desc435}436437func newTimeoutSettingsVec(r *WorkspaceReconciler) *timeoutSettingsVec {438name := prometheus.BuildFQName("wsman", "workspace", "timeout_settings_total")439desc := prometheus.NewDesc(440name,441"Current number of workspaces per timeout setting",442[]string{"timeout"},443prometheus.Labels(map[string]string{}),444)445return &timeoutSettingsVec{446name: name,447reconciler: r,448desc: desc,449}450}451452// Describe implements Collector. It will send exactly one Desc to the provided channel.453func (vec *timeoutSettingsVec) Describe(ch chan<- *prometheus.Desc) {454ch <- vec.desc455}456457// Collect implements Collector.458func (tsv *timeoutSettingsVec) Collect(ch chan<- prometheus.Metric) {459ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)460defer cancel()461462var workspaces workspacev1.WorkspaceList463err := tsv.reconciler.List(ctx, &workspaces, client.InNamespace(tsv.reconciler.Config.Namespace))464if err != nil {465return466}467468timeouts := make(map[time.Duration]int)469for _, ws := range workspaces.Items {470if ws.Spec.Timeout.Time == nil {471continue472}473474timeouts[ws.Spec.Timeout.Time.Duration]++475}476477for phase, cnt := range timeouts {478// metrics cannot be re-used, we have to create them every single time479metric, err := prometheus.NewConstMetric(tsv.desc, prometheus.GaugeValue, float64(cnt), phase.String())480if err != nil {481continue482}483484ch <- metric485}486}487488type maintenanceEnabledGauge struct {489name string490desc *prometheus.Desc491maintenance maintenance.Maintenance492}493494func newMaintenanceEnabledGauge(m maintenance.Maintenance) *maintenanceEnabledGauge {495name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, maintenanceEnabled)496return &maintenanceEnabledGauge{497name: name,498desc: prometheus.NewDesc(name, "Whether the cluster is in maintenance mode", nil, prometheus.Labels(map[string]string{})),499maintenance: m,500}501}502503func (m *maintenanceEnabledGauge) Describe(ch chan<- *prometheus.Desc) {504ch <- m.desc505}506507func (m *maintenanceEnabledGauge) Collect(ch chan<- prometheus.Metric) {508var value float64509if m.maintenance.IsEnabled(context.Background()) {510value = 1511}512513metric, err := prometheus.NewConstMetric(m.desc, prometheus.GaugeValue, value)514if err != nil {515return516}517518ch <- metric519}520521// nodeUtilizationVec provides metrics per workspace node on:522// - the amount of cpu/memory requested by workspaces on the node (size of the workspace class)523// CPU is measured in cores, memory in bytes.524// Differentiates between headless and regular workspace nodes using the type label.525// Useful to determine node utilization and capacity.526type nodeUtilizationVec struct {527name string528desc *prometheus.Desc529reconciler *WorkspaceReconciler530}531532func newNodeUtilizationVec(r *WorkspaceReconciler) *nodeUtilizationVec {533name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, workspaceNodeUtilization)534desc := prometheus.NewDesc(535name,536"Amount of resources requested by workspaces on the node (cpu/memory, workspace type)",537[]string{"node", "resource", "type"},538prometheus.Labels(map[string]string{}),539)540return &nodeUtilizationVec{541name: name,542reconciler: r,543desc: desc,544}545}546547// Describe implements Collector. It will send exactly one Desc to the provided channel.548func (n *nodeUtilizationVec) Describe(ch chan<- *prometheus.Desc) {549ch <- n.desc550}551552// Collect implements Collector.553func (n *nodeUtilizationVec) Collect(ch chan<- prometheus.Metric) {554ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)555defer cancel()556557var nodes corev1.NodeList558err := n.reconciler.List(ctx, &nodes)559if err != nil {560log.FromContext(ctx).Error(err, "cannot list nodes for node utilization metric")561return562}563564var (565nodeUtilization = make(map[string]map[corev1.ResourceName]float64)566nodeTypes = make(map[string]string)567)568for _, node := range nodes.Items {569isRegular := node.Labels["gitpod.io/workload_workspace_regular"] == "true"570isHeadless := node.Labels["gitpod.io/workload_workspace_headless"] == "true"571if !isRegular && !isHeadless {572// Ignore non-workspace nodes.573continue574}575576nodeUtilization[node.Name] = map[corev1.ResourceName]float64{577corev1.ResourceCPU: 0,578corev1.ResourceMemory: 0,579}580nodeTypes[node.Name] = "regular"581if !isRegular && isHeadless {582// In case a node is both regular and headless (e.g. a preview env), mark it as regular.583nodeTypes[node.Name] = "headless"584}585}586587var workspaces workspacev1.WorkspaceList588if err = n.reconciler.List(ctx, &workspaces, client.InNamespace(n.reconciler.Config.Namespace)); err != nil {589log.FromContext(ctx).Error(err, "cannot list workspaces for node utilization metric")590return591}592593// Aggregate workspace pod resource requests per node.594for _, ws := range workspaces.Items {595// This list is indexed and reads from memory, so it's not that expensive to do this for every workspace.596pods, err := n.reconciler.listWorkspacePods(ctx, &ws)597if err != nil {598log.FromContext(ctx).Error(err, "cannot list workspace pods for node utilization metric", "workspace", ws.Name)599continue600}601602if len(pods.Items) == 0 {603// No pods (yet), not consuming resources on the node.604continue605}606607for _, pod := range pods.Items {608nodeName := pod.Spec.NodeName609if nodeName == "" {610// Not yet scheduled.611continue612}613614if _, ok := nodeUtilization[nodeName]; !ok {615nodeUtilization[nodeName] = map[corev1.ResourceName]float64{616corev1.ResourceCPU: 0,617corev1.ResourceMemory: 0,618}619}620621for _, container := range pod.Spec.Containers {622requests := container.Resources.Requests623nodeUtilization[nodeName][corev1.ResourceCPU] += float64(requests.Cpu().MilliValue()) / 1000.0624nodeUtilization[nodeName][corev1.ResourceMemory] += float64(requests.Memory().Value())625}626}627}628629for nodeName, metrics := range nodeUtilization {630for resource, value := range metrics {631nodeType := nodeTypes[nodeName]632metric, err := prometheus.NewConstMetric(n.desc, prometheus.GaugeValue, value, nodeName, resource.String(), nodeType)633if err != nil {634log.FromContext(ctx).Error(err, "cannot create node utilization metric", "node", nodeName, "resource", resource.String(), "type", nodeType)635continue636}637638ch <- metric639}640}641}642643type workspaceActivityVec struct {644name string645desc *prometheus.Desc646reconciler *WorkspaceReconciler647}648649func newWorkspaceActivityVec(r *WorkspaceReconciler) *workspaceActivityVec {650name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, workspaceActivityTotal)651desc := prometheus.NewDesc(652name,653"total number of active workspaces",654[]string{"active"},655prometheus.Labels(map[string]string{}),656)657return &workspaceActivityVec{658name: name,659desc: desc,660reconciler: r,661}662}663664// Describe implements Collector. It will send exactly one Desc to the provided channel.665func (wav *workspaceActivityVec) Describe(ch chan<- *prometheus.Desc) {666ch <- wav.desc667}668669func (wav *workspaceActivityVec) Collect(ch chan<- prometheus.Metric) {670ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)671defer cancel()672673active, notActive, err := wav.getWorkspaceActivityCounts()674if err != nil {675log.FromContext(ctx).Error(err, fmt.Sprintf("cannot determine active/inactive counts - %s will be inaccurate", wav.name))676return677}678679activeMetrics, err := prometheus.NewConstMetric(wav.desc, prometheus.GaugeValue, float64(active), "true")680if err != nil {681log.FromContext(ctx).Error(err, "cannot create wrokspace activity metric", "active", "true")682return683}684notActiveMetrics, err := prometheus.NewConstMetric(wav.desc, prometheus.GaugeValue, float64(notActive), "false")685if err != nil {686log.FromContext(ctx).Error(err, "cannot create wrokspace activity metric", "active", "false")687return688}689690ch <- activeMetrics691ch <- notActiveMetrics692}693694func (wav *workspaceActivityVec) getWorkspaceActivityCounts() (active, notActive int, err error) {695var workspaces workspacev1.WorkspaceList696if err = wav.reconciler.List(context.Background(), &workspaces, client.InNamespace(wav.reconciler.Config.Namespace)); err != nil {697return 0, 0, err698}699700for _, ws := range workspaces.Items {701if ws.Spec.Type != workspacev1.WorkspaceTypeRegular {702continue703}704705hasActivity := activity.Last(&ws) != nil706if hasActivity {707active++708} else {709notActive++710}711}712713return714}715716717