GitHub Repository: gitpod-io/gitpod
Path: blob/main/components/ws-manager-mk2/controllers/metrics.go
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License.AGPL.txt in the project root for license information.

package controllers

import (
	"context"
	"fmt"
	"strings"
	"time"

	wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"
	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
	"github.com/go-logr/logr"
	lru "github.com/hashicorp/golang-lru"
	"github.com/prometheus/client_golang/prometheus"
	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"
)

const (
	maintenanceEnabled            string = "maintenance_enabled"
	workspaceStartupSeconds       string = "workspace_startup_seconds"
	workspacePendingSeconds       string = "workspace_pending_seconds"
	workspaceCreatingSeconds      string = "workspace_creating_seconds"
	workspaceStartFailuresTotal   string = "workspace_starts_failure_total"
	workspaceFailuresTotal        string = "workspace_failure_total"
	workspaceStopsTotal           string = "workspace_stops_total"
	workspaceRecreationsTotal     string = "workspace_recreations_total"
	workspaceBackupsTotal         string = "workspace_backups_total"
	workspaceBackupFailuresTotal  string = "workspace_backups_failure_total"
	workspaceRestoresTotal        string = "workspace_restores_total"
	workspaceRestoresFailureTotal string = "workspace_restores_failure_total"
	workspaceNodeUtilization      string = "workspace_node_utilization"
	workspaceActivityTotal        string = "workspace_activity_total"
)

type StopReason string

const (
	StopReasonFailed       = "failed"
	StopReasonStartFailure = "start-failure"
	StopReasonAborted      = "aborted"
	StopReasonOutOfSpace   = "out-of-space"
	StopReasonTimeout      = "timeout"
	StopReasonTabClosed    = "tab-closed"
	StopReasonRegular      = "regular-stop"
)

type controllerMetrics struct {
	startupTimeHistVec           *prometheus.HistogramVec
	pendingTimeHistVec           *prometheus.HistogramVec
	creatingTimeHistVec          *prometheus.HistogramVec
	totalStartsFailureCounterVec *prometheus.CounterVec
	totalFailuresCounterVec      *prometheus.CounterVec
	totalStopsCounterVec         *prometheus.CounterVec
	totalRecreationsCounterVec   *prometheus.CounterVec

	totalBackupCounterVec         *prometheus.CounterVec
	totalBackupFailureCounterVec  *prometheus.CounterVec
	totalRestoreCounterVec        *prometheus.CounterVec
	totalRestoreFailureCounterVec *prometheus.CounterVec

	workspacePhases *phaseTotalVec
	timeoutSettings *timeoutSettingsVec

	workspaceNodeUtilization *nodeUtilizationVec

	workspaceActivityTotal *workspaceActivityVec

	// used to prevent recording metrics multiple times
	cache *lru.Cache
}

func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
	cache, err := lru.New(6000)
	if err != nil {
		return nil, err
	}

	return &controllerMetrics{
		startupTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceStartupSeconds,
			Help:      "time it took for workspace pods to reach the running phase",
			// Exponential buckets: 2s, 4s, 8s, ..., 1024s (same for the histograms below).
			Buckets: prometheus.ExponentialBuckets(2, 2, 10),
		}, []string{"type", "class"}),
		pendingTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspacePendingSeconds,
			Help:      "time the workspace spent in pending",
			Buckets:   prometheus.ExponentialBuckets(2, 2, 10),
		}, []string{"type", "class"}),
		creatingTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceCreatingSeconds,
			Help:      "time the workspace spent in creation",
			Buckets:   prometheus.ExponentialBuckets(2, 2, 10),
		}, []string{"type", "class"}),
		totalStartsFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceStartFailuresTotal,
			Help:      "total number of workspaces that failed to start",
		}, []string{"type", "class"}),
		totalFailuresCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceFailuresTotal,
			Help:      "total number of workspaces that had a failed condition",
		}, []string{"type", "class"}),
		totalStopsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceStopsTotal,
			Help:      "total number of workspaces stopped",
		}, []string{"reason", "type", "class"}),
		totalRecreationsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceRecreationsTotal,
			Help:      "total number of workspace recreations",
		}, []string{"type", "class", "attempt"}),

		totalBackupCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceBackupsTotal,
			Help:      "total number of workspace backups",
		}, []string{"type", "class"}),
		totalBackupFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceBackupFailuresTotal,
			Help:      "total number of workspace backup failures",
		}, []string{"type", "class"}),
		totalRestoreCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceRestoresTotal,
			Help:      "total number of workspace restores",
		}, []string{"type", "class"}),
		totalRestoreFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceRestoresFailureTotal,
			Help:      "total number of workspace restore failures",
		}, []string{"type", "class"}),

		workspacePhases:          newPhaseTotalVec(r),
		timeoutSettings:          newTimeoutSettingsVec(r),
		workspaceNodeUtilization: newNodeUtilizationVec(r),
		workspaceActivityTotal:   newWorkspaceActivityVec(r),
		cache:                    cache,
	}, nil
}
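
// Illustrative registration sketch (not part of the original file; the actual
// wiring lives elsewhere in ws-manager-mk2): everything in this file implements
// prometheus.Collector, so exposing the metrics is a plain registration.
//
//	metrics, err := newControllerMetrics(reconciler)
//	if err != nil {
//		return err
//	}
//	prometheus.MustRegister(metrics, newMaintenanceEnabledGauge(mnt))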

func (m *controllerMetrics) recordWorkspaceStartupTime(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	hist, err := m.startupTimeHistVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not record workspace startup time", "type", tpe, "class", class)
		// Bail out rather than dereference a nil histogram below.
		return
	}

	duration := time.Since(ws.CreationTimestamp.Time)
	hist.Observe(duration.Seconds())
}

func (m *controllerMetrics) recordWorkspacePendingTime(log *logr.Logger, ws *workspacev1.Workspace, pendingTs time.Time) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	hist, err := m.pendingTimeHistVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not record workspace pending time", "type", tpe, "class", class)
		return
	}

	hist.Observe(time.Since(pendingTs).Seconds())
}

func (m *controllerMetrics) recordWorkspaceCreatingTime(log *logr.Logger, ws *workspacev1.Workspace, creatingTs time.Time) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	hist, err := m.creatingTimeHistVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not record workspace creating time", "type", tpe, "class", class)
		return
	}

	hist.Observe(time.Since(creatingTs).Seconds())
}

func (m *controllerMetrics) countWorkspaceStartFailures(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	m.totalStartsFailureCounterVec.WithLabelValues(tpe, class).Inc()
}

func (m *controllerMetrics) countWorkspaceFailure(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	m.totalFailuresCounterVec.WithLabelValues(tpe, class).Inc()
}

func (m *controllerMetrics) countWorkspaceStop(log *logr.Logger, ws *workspacev1.Workspace) {
	var reason string
	if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)); c != nil {
		reason = StopReasonFailed
		if !ws.IsConditionTrue(workspacev1.WorkspaceConditionEverReady) {
			// Don't record 'failed' if there was a start failure.
			reason = StopReasonStartFailure
		} else if strings.Contains(c.Message, "Pod ephemeral local storage usage exceeds the total limit of containers") {
			reason = StopReasonOutOfSpace
		}
	} else if ws.IsConditionTrue(workspacev1.WorkspaceConditionAborted) {
		reason = StopReasonAborted
	} else if ws.IsConditionTrue(workspacev1.WorkspaceConditionTimeout) {
		reason = StopReasonTimeout
	} else if ws.IsConditionTrue(workspacev1.WorkspaceConditionClosed) {
		reason = StopReasonTabClosed
	} else {
		reason = StopReasonRegular
	}

	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	log.Info("workspace stop reason", "type", tpe, "class", class, "reason", reason)

	m.totalStopsCounterVec.WithLabelValues(reason, tpe, class).Inc()
}

func (m *controllerMetrics) countWorkspaceRecreations(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)
	attempt := fmt.Sprint(ws.Status.PodRecreated)

	m.totalRecreationsCounterVec.WithLabelValues(tpe, class, attempt).Inc()
}

func (m *controllerMetrics) countTotalBackups(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	m.totalBackupCounterVec.WithLabelValues(tpe, class).Inc()
}

func (m *controllerMetrics) countTotalBackupFailures(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	m.totalBackupFailureCounterVec.WithLabelValues(tpe, class).Inc()
}

func (m *controllerMetrics) countTotalRestores(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	m.totalRestoreCounterVec.WithLabelValues(tpe, class).Inc()
}

func (m *controllerMetrics) countTotalRestoreFailures(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	m.totalRestoreFailureCounterVec.WithLabelValues(tpe, class).Inc()
}

func (m *controllerMetrics) containsWorkspace(ws *workspacev1.Workspace) bool {
	return m.cache.Contains(ws.Name)
}

func (m *controllerMetrics) rememberWorkspace(ws *workspacev1.Workspace, state *metricState) {
	var s metricState
	if state != nil {
		s = *state
	} else {
		s = newMetricState(ws)
	}
	m.cache.Add(ws.Name, s)
}

func (m *controllerMetrics) forgetWorkspace(ws *workspacev1.Workspace) {
	m.cache.Remove(ws.Name)
}
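
// Illustrative call pattern (assumed; the actual call sites live in the
// reconciler, not in this file): the LRU cache above gates recording so each
// workspace's lifecycle events are only counted once.
//
//	if !m.containsWorkspace(ws) {
//		m.rememberWorkspace(ws, nil)
//	}
//	// ... record metrics as the workspace transitions phases ...
//	m.forgetWorkspace(ws) // once the workspace is deleted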

// metricState is used to track which metrics have been recorded for a workspace.
type metricState struct {
	phase                   workspacev1.WorkspacePhase
	pendingStartTime        time.Time
	creatingStartTime       time.Time
	recordedStartTime       bool
	recordedInitFailure     bool
	recordedStartFailure    bool
	recordedFailure         bool
	recordedContentReady    bool
	recordedBackupFailed    bool
	recordedBackupCompleted bool
	recordedRecreations     int
}

func newMetricState(ws *workspacev1.Workspace) metricState {
	return metricState{
		phase: ws.Status.Phase,
		// Here we assume that we've recorded metrics for the following states already if their conditions already exist.
		// This is to prevent these from being re-recorded after the controller restarts and clears the metric state for
		// each workspace.
		recordedStartTime:       ws.Status.Phase == workspacev1.WorkspacePhaseRunning,
		recordedInitFailure:     wsk8s.ConditionWithStatusAndReason(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, workspacev1.ReasonInitializationFailure),
		recordedStartFailure:    ws.Status.Phase == workspacev1.WorkspacePhaseStopped && isStartFailure(ws),
		recordedFailure:         ws.IsConditionTrue(workspacev1.WorkspaceConditionFailed),
		recordedContentReady:    ws.IsConditionTrue(workspacev1.WorkspaceConditionContentReady),
		recordedBackupFailed:    ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupFailure),
		recordedBackupCompleted: ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupComplete),
		recordedRecreations:     ws.Status.PodRecreated,
	}
}

// getWorkspace returns the last recorded metric state for that workspace.
func (m *controllerMetrics) getWorkspace(log *logr.Logger, ws *workspacev1.Workspace) (bool, metricState) {
	s, ok := m.cache.Get(ws.Name)
	if !ok {
		return false, metricState{}
	}

	return true, s.(metricState)
}

// Describe implements Collector. It forwards the Descs of all contained metrics
// to the provided channel.
func (m *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
	m.startupTimeHistVec.Describe(ch)
	m.pendingTimeHistVec.Describe(ch)
	m.creatingTimeHistVec.Describe(ch)
	m.totalStopsCounterVec.Describe(ch)
	m.totalStartsFailureCounterVec.Describe(ch)
	m.totalFailuresCounterVec.Describe(ch)
	// Included so the recreations counter is exported alongside the others.
	m.totalRecreationsCounterVec.Describe(ch)

	m.totalBackupCounterVec.Describe(ch)
	m.totalBackupFailureCounterVec.Describe(ch)
	m.totalRestoreCounterVec.Describe(ch)
	m.totalRestoreFailureCounterVec.Describe(ch)

	m.workspacePhases.Describe(ch)
	m.timeoutSettings.Describe(ch)
	m.workspaceNodeUtilization.Describe(ch)
	m.workspaceActivityTotal.Describe(ch)
}

// Collect implements Collector.
func (m *controllerMetrics) Collect(ch chan<- prometheus.Metric) {
	m.startupTimeHistVec.Collect(ch)
	m.pendingTimeHistVec.Collect(ch)
	m.creatingTimeHistVec.Collect(ch)
	m.totalStopsCounterVec.Collect(ch)
	m.totalStartsFailureCounterVec.Collect(ch)
	m.totalFailuresCounterVec.Collect(ch)
	m.totalRecreationsCounterVec.Collect(ch)

	m.totalBackupCounterVec.Collect(ch)
	m.totalBackupFailureCounterVec.Collect(ch)
	m.totalRestoreCounterVec.Collect(ch)
	m.totalRestoreFailureCounterVec.Collect(ch)

	m.workspacePhases.Collect(ch)
	m.timeoutSettings.Collect(ch)
	m.workspaceNodeUtilization.Collect(ch)
	m.workspaceActivityTotal.Collect(ch)
}

// phaseTotalVec returns a gauge vector counting the workspaces per phase
type phaseTotalVec struct {
	name       string
	desc       *prometheus.Desc
	reconciler *WorkspaceReconciler
}

func newPhaseTotalVec(r *WorkspaceReconciler) *phaseTotalVec {
	name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, "workspace_phase_total")
	return &phaseTotalVec{
		name:       name,
		desc:       prometheus.NewDesc(name, "Current number of workspaces per phase", []string{"phase", "type", "class"}, prometheus.Labels(map[string]string{})),
		reconciler: r,
	}
}

// Describe implements Collector. It will send exactly one Desc to the provided channel.
func (ptv *phaseTotalVec) Describe(ch chan<- *prometheus.Desc) {
	ch <- ptv.desc
}

// Collect implements Collector.
func (ptv *phaseTotalVec) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
	defer cancel()

	var workspaces workspacev1.WorkspaceList
	err := ptv.reconciler.List(ctx, &workspaces, client.InNamespace(ptv.reconciler.Config.Namespace))
	if err != nil {
		return
	}

	// Count workspaces per (type, phase, class), encoded as a "::"-separated map key.
	counts := make(map[string]int)
	for _, ws := range workspaces.Items {
		counts[string(ws.Spec.Type)+"::"+string(ws.Status.Phase)+"::"+ws.Spec.Class]++
	}

	for key, count := range counts {
		segs := strings.Split(key, "::")
		tpe, phase, class := segs[0], segs[1], segs[2]

		metric, err := prometheus.NewConstMetric(ptv.desc, prometheus.GaugeValue, float64(count), phase, tpe, class)
		if err != nil {
			continue
		}

		ch <- metric
	}
}
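
// The emitted series look like the following (label values illustrative; the
// metric name prefix comes from the metricsNamespace and metricsWorkspaceSubsystem
// constants defined elsewhere in this package):
//
//	<namespace>_<subsystem>_workspace_phase_total{phase="Running",type="regular",class="default"} 42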

// timeoutSettingsVec provides a gauge of the current number of workspaces per
// timeout setting.
type timeoutSettingsVec struct {
	name       string
	reconciler *WorkspaceReconciler
	desc       *prometheus.Desc
}

func newTimeoutSettingsVec(r *WorkspaceReconciler) *timeoutSettingsVec {
	name := prometheus.BuildFQName("wsman", "workspace", "timeout_settings_total")
	desc := prometheus.NewDesc(
		name,
		"Current number of workspaces per timeout setting",
		[]string{"timeout"},
		prometheus.Labels(map[string]string{}),
	)
	return &timeoutSettingsVec{
		name:       name,
		reconciler: r,
		desc:       desc,
	}
}

// Describe implements Collector. It will send exactly one Desc to the provided channel.
func (vec *timeoutSettingsVec) Describe(ch chan<- *prometheus.Desc) {
	ch <- vec.desc
}

// Collect implements Collector.
func (tsv *timeoutSettingsVec) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
	defer cancel()

	var workspaces workspacev1.WorkspaceList
	err := tsv.reconciler.List(ctx, &workspaces, client.InNamespace(tsv.reconciler.Config.Namespace))
	if err != nil {
		return
	}

	timeouts := make(map[time.Duration]int)
	for _, ws := range workspaces.Items {
		if ws.Spec.Timeout.Time == nil {
			continue
		}

		timeouts[ws.Spec.Timeout.Time.Duration]++
	}

	for timeout, cnt := range timeouts {
		// metrics cannot be re-used, we have to create them every single time
		metric, err := prometheus.NewConstMetric(tsv.desc, prometheus.GaugeValue, float64(cnt), timeout.String())
		if err != nil {
			continue
		}

		ch <- metric
	}
}

type maintenanceEnabledGauge struct {
	name        string
	desc        *prometheus.Desc
	maintenance maintenance.Maintenance
}

func newMaintenanceEnabledGauge(m maintenance.Maintenance) *maintenanceEnabledGauge {
	name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, maintenanceEnabled)
	return &maintenanceEnabledGauge{
		name:        name,
		desc:        prometheus.NewDesc(name, "Whether the cluster is in maintenance mode", nil, prometheus.Labels(map[string]string{})),
		maintenance: m,
	}
}

func (m *maintenanceEnabledGauge) Describe(ch chan<- *prometheus.Desc) {
	ch <- m.desc
}

func (m *maintenanceEnabledGauge) Collect(ch chan<- prometheus.Metric) {
	var value float64
	if m.maintenance.IsEnabled(context.Background()) {
		value = 1
	}

	metric, err := prometheus.NewConstMetric(m.desc, prometheus.GaugeValue, value)
	if err != nil {
		return
	}

	ch <- metric
}

// nodeUtilizationVec provides metrics per workspace node on the amount of
// cpu/memory requested by workspaces on that node (i.e. the size of the
// workspace class). CPU is measured in cores, memory in bytes.
// It differentiates between headless and regular workspace nodes using the
// type label, which is useful to determine node utilization and capacity.
type nodeUtilizationVec struct {
	name       string
	desc       *prometheus.Desc
	reconciler *WorkspaceReconciler
}

func newNodeUtilizationVec(r *WorkspaceReconciler) *nodeUtilizationVec {
	name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, workspaceNodeUtilization)
	desc := prometheus.NewDesc(
		name,
		"Amount of resources requested by workspaces on the node (cpu/memory, workspace type)",
		[]string{"node", "resource", "type"},
		prometheus.Labels(map[string]string{}),
	)
	return &nodeUtilizationVec{
		name:       name,
		reconciler: r,
		desc:       desc,
	}
}

// Describe implements Collector. It will send exactly one Desc to the provided channel.
func (n *nodeUtilizationVec) Describe(ch chan<- *prometheus.Desc) {
	ch <- n.desc
}

// Collect implements Collector.
func (n *nodeUtilizationVec) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
	defer cancel()

	var nodes corev1.NodeList
	err := n.reconciler.List(ctx, &nodes)
	if err != nil {
		log.FromContext(ctx).Error(err, "cannot list nodes for node utilization metric")
		return
	}

	var (
		nodeUtilization = make(map[string]map[corev1.ResourceName]float64)
		nodeTypes       = make(map[string]string)
	)
	for _, node := range nodes.Items {
		isRegular := node.Labels["gitpod.io/workload_workspace_regular"] == "true"
		isHeadless := node.Labels["gitpod.io/workload_workspace_headless"] == "true"
		if !isRegular && !isHeadless {
			// Ignore non-workspace nodes.
			continue
		}

		nodeUtilization[node.Name] = map[corev1.ResourceName]float64{
			corev1.ResourceCPU:    0,
			corev1.ResourceMemory: 0,
		}
		// Mark a node as headless only if it is exclusively headless. A node that
		// is both regular and headless (e.g. a preview env) counts as regular.
		nodeTypes[node.Name] = "regular"
		if !isRegular && isHeadless {
			nodeTypes[node.Name] = "headless"
		}
	}

	var workspaces workspacev1.WorkspaceList
	if err = n.reconciler.List(ctx, &workspaces, client.InNamespace(n.reconciler.Config.Namespace)); err != nil {
		log.FromContext(ctx).Error(err, "cannot list workspaces for node utilization metric")
		return
	}

	// Aggregate workspace pod resource requests per node.
	for _, ws := range workspaces.Items {
		// This list is indexed and reads from memory, so it's not that expensive to do this for every workspace.
		pods, err := n.reconciler.listWorkspacePods(ctx, &ws)
		if err != nil {
			log.FromContext(ctx).Error(err, "cannot list workspace pods for node utilization metric", "workspace", ws.Name)
			continue
		}

		if len(pods.Items) == 0 {
			// No pods (yet), not consuming resources on the node.
			continue
		}

		for _, pod := range pods.Items {
			nodeName := pod.Spec.NodeName
			if nodeName == "" {
				// Not yet scheduled.
				continue
			}

			if _, ok := nodeUtilization[nodeName]; !ok {
				nodeUtilization[nodeName] = map[corev1.ResourceName]float64{
					corev1.ResourceCPU:    0,
					corev1.ResourceMemory: 0,
				}
			}

			for _, container := range pod.Spec.Containers {
				requests := container.Resources.Requests
				nodeUtilization[nodeName][corev1.ResourceCPU] += float64(requests.Cpu().MilliValue()) / 1000.0
				nodeUtilization[nodeName][corev1.ResourceMemory] += float64(requests.Memory().Value())
			}
		}
	}

	for nodeName, metrics := range nodeUtilization {
		for resource, value := range metrics {
			nodeType := nodeTypes[nodeName]
			metric, err := prometheus.NewConstMetric(n.desc, prometheus.GaugeValue, value, nodeName, resource.String(), nodeType)
			if err != nil {
				log.FromContext(ctx).Error(err, "cannot create node utilization metric", "node", nodeName, "resource", resource.String(), "type", nodeType)
				continue
			}

			ch <- metric
		}
	}
}
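
// Example PromQL (illustrative): requested CPU per node as a fraction of the
// node's allocatable CPU, joining against kube-state-metrics' series. This
// assumes kube-state-metrics is deployed; the metric name prefix depends on
// the package's metricsNamespace/metricsWorkspaceSubsystem constants.
//
//	<namespace>_<subsystem>_workspace_node_utilization{resource="cpu"}
//	  / on (node) kube_node_status_allocatable{resource="cpu"}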

type workspaceActivityVec struct {
	name       string
	desc       *prometheus.Desc
	reconciler *WorkspaceReconciler
}

func newWorkspaceActivityVec(r *WorkspaceReconciler) *workspaceActivityVec {
	name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, workspaceActivityTotal)
	desc := prometheus.NewDesc(
		name,
		"total number of active workspaces",
		[]string{"active"},
		prometheus.Labels(map[string]string{}),
	)
	return &workspaceActivityVec{
		name:       name,
		desc:       desc,
		reconciler: r,
	}
}

// Describe implements Collector. It will send exactly one Desc to the provided channel.
func (wav *workspaceActivityVec) Describe(ch chan<- *prometheus.Desc) {
	ch <- wav.desc
}

// Collect implements Collector.
func (wav *workspaceActivityVec) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
	defer cancel()

	active, notActive, err := wav.getWorkspaceActivityCounts(ctx)
	if err != nil {
		log.FromContext(ctx).Error(err, fmt.Sprintf("cannot determine active/inactive counts - %s will be inaccurate", wav.name))
		return
	}

	activeMetrics, err := prometheus.NewConstMetric(wav.desc, prometheus.GaugeValue, float64(active), "true")
	if err != nil {
		log.FromContext(ctx).Error(err, "cannot create workspace activity metric", "active", "true")
		return
	}
	notActiveMetrics, err := prometheus.NewConstMetric(wav.desc, prometheus.GaugeValue, float64(notActive), "false")
	if err != nil {
		log.FromContext(ctx).Error(err, "cannot create workspace activity metric", "active", "false")
		return
	}

	ch <- activeMetrics
	ch <- notActiveMetrics
}

func (wav *workspaceActivityVec) getWorkspaceActivityCounts(ctx context.Context) (active, notActive int, err error) {
	var workspaces workspacev1.WorkspaceList
	if err = wav.reconciler.List(ctx, &workspaces, client.InNamespace(wav.reconciler.Config.Namespace)); err != nil {
		return 0, 0, err
	}

	for _, ws := range workspaces.Items {
		// Only regular workspaces count towards activity.
		if ws.Spec.Type != workspacev1.WorkspaceTypeRegular {
			continue
		}

		hasActivity := activity.Last(&ws) != nil
		if hasActivity {
			active++
		} else {
			notActive++
		}
	}

	return
}
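
// Sample scrape output (values illustrative; the name prefix depends on the
// package's metricsNamespace/metricsWorkspaceSubsystem constants):
//
//	<namespace>_<subsystem>_workspace_activity_total{active="true"} 10
//	<namespace>_<subsystem>_workspace_activity_total{active="false"} 3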