GitHub Repository: gitpod-io/gitpod
Path: blob/main/components/ws-manager-mk2/controllers/metrics.go
// Copyright (c) 2023 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License.AGPL.txt in the project root for license information.

package controllers

import (
	"context"
	"fmt"
	"strings"
	"time"

	wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/activity"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"
	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
	"github.com/go-logr/logr"
	lru "github.com/hashicorp/golang-lru"
	"github.com/prometheus/client_golang/prometheus"
	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"
)

const (
	maintenanceEnabled            string = "maintenance_enabled"
	workspaceStartupSeconds       string = "workspace_startup_seconds"
	workspacePendingSeconds       string = "workspace_pending_seconds"
	workspaceCreatingSeconds      string = "workspace_creating_seconds"
	workspaceStartFailuresTotal   string = "workspace_starts_failure_total"
	workspaceFailuresTotal        string = "workspace_failure_total"
	workspaceStopsTotal           string = "workspace_stops_total"
	workspaceRecreationsTotal     string = "workspace_recreations_total"
	workspaceBackupsTotal         string = "workspace_backups_total"
	workspaceBackupFailuresTotal  string = "workspace_backups_failure_total"
	workspaceRestoresTotal        string = "workspace_restores_total"
	workspaceRestoresFailureTotal string = "workspace_restores_failure_total"
	workspaceNodeUtilization      string = "workspace_node_utilization"
	workspaceActivityTotal        string = "workspace_activity_total"
)

type StopReason string

const (
	StopReasonFailed       = "failed"
	StopReasonStartFailure = "start-failure"
	StopReasonAborted      = "aborted"
	StopReasonOutOfSpace   = "out-of-space"
	StopReasonTimeout      = "timeout"
	StopReasonTabClosed    = "tab-closed"
	StopReasonRegular      = "regular-stop"
)

type controllerMetrics struct {
	startupTimeHistVec           *prometheus.HistogramVec
	pendingTimeHistVec           *prometheus.HistogramVec
	creatingTimeHistVec          *prometheus.HistogramVec
	totalStartsFailureCounterVec *prometheus.CounterVec
	totalFailuresCounterVec      *prometheus.CounterVec
	totalStopsCounterVec         *prometheus.CounterVec
	totalRecreationsCounterVec   *prometheus.CounterVec

	totalBackupCounterVec         *prometheus.CounterVec
	totalBackupFailureCounterVec  *prometheus.CounterVec
	totalRestoreCounterVec        *prometheus.CounterVec
	totalRestoreFailureCounterVec *prometheus.CounterVec

	workspacePhases *phaseTotalVec
	timeoutSettings *timeoutSettingsVec

	workspaceNodeUtilization *nodeUtilizationVec

	workspaceActivityTotal *workspaceActivityVec

	// used to prevent recording metrics multiple times
	cache *lru.Cache
}

func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
	cache, err := lru.New(6000)
	if err != nil {
		return nil, err
	}

	return &controllerMetrics{
		startupTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceStartupSeconds,
			Help:      "time it took for workspace pods to reach the running phase",
			// Exponential buckets: 2s, 4s, 8s, ..., 1024s (same for the histograms below).
			Buckets: prometheus.ExponentialBuckets(2, 2, 10),
		}, []string{"type", "class"}),
		pendingTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspacePendingSeconds,
			Help:      "time the workspace spent in pending",
			Buckets:   prometheus.ExponentialBuckets(2, 2, 10),
		}, []string{"type", "class"}),
		creatingTimeHistVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceCreatingSeconds,
			Help:      "time the workspace spent in creation",
			Buckets:   prometheus.ExponentialBuckets(2, 2, 10),
		}, []string{"type", "class"}),
		totalStartsFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceStartFailuresTotal,
			Help:      "total number of workspaces that failed to start",
		}, []string{"type", "class"}),
		totalFailuresCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceFailuresTotal,
			Help:      "total number of workspaces that had a failed condition",
		}, []string{"type", "class"}),
		totalStopsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceStopsTotal,
			Help:      "total number of workspaces stopped",
		}, []string{"reason", "type", "class"}),
		totalRecreationsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceRecreationsTotal,
			Help:      "total number of workspace recreations",
		}, []string{"type", "class", "attempt"}),

		totalBackupCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceBackupsTotal,
			Help:      "total number of workspace backups",
		}, []string{"type", "class"}),
		totalBackupFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceBackupFailuresTotal,
			Help:      "total number of workspace backup failures",
		}, []string{"type", "class"}),
		totalRestoreCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceRestoresTotal,
			Help:      "total number of workspace restores",
		}, []string{"type", "class"}),
		totalRestoreFailureCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsWorkspaceSubsystem,
			Name:      workspaceRestoresFailureTotal,
			Help:      "total number of workspace restore failures",
		}, []string{"type", "class"}),

		workspacePhases:          newPhaseTotalVec(r),
		timeoutSettings:          newTimeoutSettingsVec(r),
		workspaceNodeUtilization: newNodeUtilizationVec(r),
		workspaceActivityTotal:   newWorkspaceActivityVec(r),
		cache:                    cache,
	}, nil
}
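
// Illustrative registration sketch (not part of the original file; the actual
// wiring lives elsewhere in ws-manager-mk2): everything in this file implements
// prometheus.Collector, so exposing the metrics is a plain registration.
//
//	metrics, err := newControllerMetrics(reconciler)
//	if err != nil {
//		return err
//	}
//	prometheus.MustRegister(metrics, newMaintenanceEnabledGauge(mnt))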

func (m *controllerMetrics) recordWorkspaceStartupTime(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	hist, err := m.startupTimeHistVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not record workspace startup time", "type", tpe, "class", class)
		// Bail out rather than dereference a nil histogram below.
		return
	}

	duration := time.Since(ws.CreationTimestamp.Time)
	hist.Observe(duration.Seconds())
}

func (m *controllerMetrics) recordWorkspacePendingTime(log *logr.Logger, ws *workspacev1.Workspace, pendingTs time.Time) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	hist, err := m.pendingTimeHistVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not record workspace pending time", "type", tpe, "class", class)
		return
	}

	hist.Observe(time.Since(pendingTs).Seconds())
}

func (m *controllerMetrics) recordWorkspaceCreatingTime(log *logr.Logger, ws *workspacev1.Workspace, creatingTs time.Time) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	hist, err := m.creatingTimeHistVec.GetMetricWithLabelValues(tpe, class)
	if err != nil {
		log.Error(err, "could not record workspace creating time", "type", tpe, "class", class)
		return
	}

	hist.Observe(time.Since(creatingTs).Seconds())
}

func (m *controllerMetrics) countWorkspaceStartFailures(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	m.totalStartsFailureCounterVec.WithLabelValues(tpe, class).Inc()
}

func (m *controllerMetrics) countWorkspaceFailure(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	m.totalFailuresCounterVec.WithLabelValues(tpe, class).Inc()
}

func (m *controllerMetrics) countWorkspaceStop(log *logr.Logger, ws *workspacev1.Workspace) {
	var reason string
	if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)); c != nil {
		reason = StopReasonFailed
		if !ws.IsConditionTrue(workspacev1.WorkspaceConditionEverReady) {
			// Don't record 'failed' if there was a start failure.
			reason = StopReasonStartFailure
		} else if strings.Contains(c.Message, "Pod ephemeral local storage usage exceeds the total limit of containers") {
			reason = StopReasonOutOfSpace
		}
	} else if ws.IsConditionTrue(workspacev1.WorkspaceConditionAborted) {
		reason = StopReasonAborted
	} else if ws.IsConditionTrue(workspacev1.WorkspaceConditionTimeout) {
		reason = StopReasonTimeout
	} else if ws.IsConditionTrue(workspacev1.WorkspaceConditionClosed) {
		reason = StopReasonTabClosed
	} else {
		reason = StopReasonRegular
	}

	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	log.Info("workspace stop reason", "type", tpe, "class", class, "reason", reason)

	m.totalStopsCounterVec.WithLabelValues(reason, tpe, class).Inc()
}

func (m *controllerMetrics) countWorkspaceRecreations(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)
	attempt := fmt.Sprint(ws.Status.PodRecreated)

	m.totalRecreationsCounterVec.WithLabelValues(tpe, class, attempt).Inc()
}

func (m *controllerMetrics) countTotalBackups(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	m.totalBackupCounterVec.WithLabelValues(tpe, class).Inc()
}

func (m *controllerMetrics) countTotalBackupFailures(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	m.totalBackupFailureCounterVec.WithLabelValues(tpe, class).Inc()
}

func (m *controllerMetrics) countTotalRestores(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	m.totalRestoreCounterVec.WithLabelValues(tpe, class).Inc()
}

func (m *controllerMetrics) countTotalRestoreFailures(log *logr.Logger, ws *workspacev1.Workspace) {
	class := ws.Spec.Class
	tpe := string(ws.Spec.Type)

	m.totalRestoreFailureCounterVec.WithLabelValues(tpe, class).Inc()
}

func (m *controllerMetrics) containsWorkspace(ws *workspacev1.Workspace) bool {
	return m.cache.Contains(ws.Name)
}

func (m *controllerMetrics) rememberWorkspace(ws *workspacev1.Workspace, state *metricState) {
	var s metricState
	if state != nil {
		s = *state
	} else {
		s = newMetricState(ws)
	}
	m.cache.Add(ws.Name, s)
}

func (m *controllerMetrics) forgetWorkspace(ws *workspacev1.Workspace) {
	m.cache.Remove(ws.Name)
}
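
// Illustrative call pattern (assumed; the actual call sites live in the
// reconciler, not in this file): the LRU cache above gates recording so each
// workspace's lifecycle events are only counted once.
//
//	if !m.containsWorkspace(ws) {
//		m.rememberWorkspace(ws, nil)
//	}
//	// ... record metrics as the workspace transitions phases ...
//	m.forgetWorkspace(ws) // once the workspace is deleted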

// metricState is used to track which metrics have been recorded for a workspace.
type metricState struct {
	phase                   workspacev1.WorkspacePhase
	pendingStartTime        time.Time
	creatingStartTime       time.Time
	recordedStartTime       bool
	recordedInitFailure     bool
	recordedStartFailure    bool
	recordedFailure         bool
	recordedContentReady    bool
	recordedBackupFailed    bool
	recordedBackupCompleted bool
	recordedRecreations     int
}

func newMetricState(ws *workspacev1.Workspace) metricState {
	return metricState{
		phase: ws.Status.Phase,
		// Here we assume that we've recorded metrics for the following states already if their conditions already exist.
		// This is to prevent these from being re-recorded after the controller restarts and clears the metric state for
		// each workspace.
		recordedStartTime:       ws.Status.Phase == workspacev1.WorkspacePhaseRunning,
		recordedInitFailure:     wsk8s.ConditionWithStatusAndReason(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, workspacev1.ReasonInitializationFailure),
		recordedStartFailure:    ws.Status.Phase == workspacev1.WorkspacePhaseStopped && isStartFailure(ws),
		recordedFailure:         ws.IsConditionTrue(workspacev1.WorkspaceConditionFailed),
		recordedContentReady:    ws.IsConditionTrue(workspacev1.WorkspaceConditionContentReady),
		recordedBackupFailed:    ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupFailure),
		recordedBackupCompleted: ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupComplete),
		recordedRecreations:     ws.Status.PodRecreated,
	}
}

// getWorkspace returns the last recorded metric state for that workspace.
func (m *controllerMetrics) getWorkspace(log *logr.Logger, ws *workspacev1.Workspace) (bool, metricState) {
	s, ok := m.cache.Get(ws.Name)
	if !ok {
		return false, metricState{}
	}

	return true, s.(metricState)
}

// Describe implements Collector. It forwards the Descs of all contained metrics
// to the provided channel.
func (m *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
	m.startupTimeHistVec.Describe(ch)
	m.pendingTimeHistVec.Describe(ch)
	m.creatingTimeHistVec.Describe(ch)
	m.totalStopsCounterVec.Describe(ch)
	m.totalStartsFailureCounterVec.Describe(ch)
	m.totalFailuresCounterVec.Describe(ch)
	// Included so the recreations counter is exported alongside the others.
	m.totalRecreationsCounterVec.Describe(ch)

	m.totalBackupCounterVec.Describe(ch)
	m.totalBackupFailureCounterVec.Describe(ch)
	m.totalRestoreCounterVec.Describe(ch)
	m.totalRestoreFailureCounterVec.Describe(ch)

	m.workspacePhases.Describe(ch)
	m.timeoutSettings.Describe(ch)
	m.workspaceNodeUtilization.Describe(ch)
	m.workspaceActivityTotal.Describe(ch)
}

// Collect implements Collector.
func (m *controllerMetrics) Collect(ch chan<- prometheus.Metric) {
	m.startupTimeHistVec.Collect(ch)
	m.pendingTimeHistVec.Collect(ch)
	m.creatingTimeHistVec.Collect(ch)
	m.totalStopsCounterVec.Collect(ch)
	m.totalStartsFailureCounterVec.Collect(ch)
	m.totalFailuresCounterVec.Collect(ch)
	m.totalRecreationsCounterVec.Collect(ch)

	m.totalBackupCounterVec.Collect(ch)
	m.totalBackupFailureCounterVec.Collect(ch)
	m.totalRestoreCounterVec.Collect(ch)
	m.totalRestoreFailureCounterVec.Collect(ch)

	m.workspacePhases.Collect(ch)
	m.timeoutSettings.Collect(ch)
	m.workspaceNodeUtilization.Collect(ch)
	m.workspaceActivityTotal.Collect(ch)
}

// phaseTotalVec returns a gauge vector counting the workspaces per phase
type phaseTotalVec struct {
	name       string
	desc       *prometheus.Desc
	reconciler *WorkspaceReconciler
}

func newPhaseTotalVec(r *WorkspaceReconciler) *phaseTotalVec {
	name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, "workspace_phase_total")
	return &phaseTotalVec{
		name:       name,
		desc:       prometheus.NewDesc(name, "Current number of workspaces per phase", []string{"phase", "type", "class"}, prometheus.Labels(map[string]string{})),
		reconciler: r,
	}
}

// Describe implements Collector. It will send exactly one Desc to the provided channel.
func (ptv *phaseTotalVec) Describe(ch chan<- *prometheus.Desc) {
	ch <- ptv.desc
}

// Collect implements Collector.
func (ptv *phaseTotalVec) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
	defer cancel()

	var workspaces workspacev1.WorkspaceList
	err := ptv.reconciler.List(ctx, &workspaces, client.InNamespace(ptv.reconciler.Config.Namespace))
	if err != nil {
		return
	}

	// Count workspaces per (type, phase, class), encoded as a "::"-separated map key.
	counts := make(map[string]int)
	for _, ws := range workspaces.Items {
		counts[string(ws.Spec.Type)+"::"+string(ws.Status.Phase)+"::"+ws.Spec.Class]++
	}

	for key, count := range counts {
		segs := strings.Split(key, "::")
		tpe, phase, class := segs[0], segs[1], segs[2]

		metric, err := prometheus.NewConstMetric(ptv.desc, prometheus.GaugeValue, float64(count), phase, tpe, class)
		if err != nil {
			continue
		}

		ch <- metric
	}
}
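
// The emitted series look like the following (label values illustrative; the
// metric name prefix comes from the metricsNamespace and metricsWorkspaceSubsystem
// constants defined elsewhere in this package):
//
//	<namespace>_<subsystem>_workspace_phase_total{phase="Running",type="regular",class="default"} 42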

// timeoutSettingsVec provides a gauge of the current number of workspaces per
// timeout setting.
type timeoutSettingsVec struct {
	name       string
	reconciler *WorkspaceReconciler
	desc       *prometheus.Desc
}

func newTimeoutSettingsVec(r *WorkspaceReconciler) *timeoutSettingsVec {
	name := prometheus.BuildFQName("wsman", "workspace", "timeout_settings_total")
	desc := prometheus.NewDesc(
		name,
		"Current number of workspaces per timeout setting",
		[]string{"timeout"},
		prometheus.Labels(map[string]string{}),
	)
	return &timeoutSettingsVec{
		name:       name,
		reconciler: r,
		desc:       desc,
	}
}

// Describe implements Collector. It will send exactly one Desc to the provided channel.
func (vec *timeoutSettingsVec) Describe(ch chan<- *prometheus.Desc) {
	ch <- vec.desc
}

// Collect implements Collector.
func (tsv *timeoutSettingsVec) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
	defer cancel()

	var workspaces workspacev1.WorkspaceList
	err := tsv.reconciler.List(ctx, &workspaces, client.InNamespace(tsv.reconciler.Config.Namespace))
	if err != nil {
		return
	}

	timeouts := make(map[time.Duration]int)
	for _, ws := range workspaces.Items {
		if ws.Spec.Timeout.Time == nil {
			continue
		}

		timeouts[ws.Spec.Timeout.Time.Duration]++
	}

	for timeout, cnt := range timeouts {
		// metrics cannot be re-used, we have to create them every single time
		metric, err := prometheus.NewConstMetric(tsv.desc, prometheus.GaugeValue, float64(cnt), timeout.String())
		if err != nil {
			continue
		}

		ch <- metric
	}
}

type maintenanceEnabledGauge struct {
	name        string
	desc        *prometheus.Desc
	maintenance maintenance.Maintenance
}

func newMaintenanceEnabledGauge(m maintenance.Maintenance) *maintenanceEnabledGauge {
	name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, maintenanceEnabled)
	return &maintenanceEnabledGauge{
		name:        name,
		desc:        prometheus.NewDesc(name, "Whether the cluster is in maintenance mode", nil, prometheus.Labels(map[string]string{})),
		maintenance: m,
	}
}

func (m *maintenanceEnabledGauge) Describe(ch chan<- *prometheus.Desc) {
	ch <- m.desc
}

func (m *maintenanceEnabledGauge) Collect(ch chan<- prometheus.Metric) {
	var value float64
	if m.maintenance.IsEnabled(context.Background()) {
		value = 1
	}

	metric, err := prometheus.NewConstMetric(m.desc, prometheus.GaugeValue, value)
	if err != nil {
		return
	}

	ch <- metric
}

// nodeUtilizationVec provides metrics per workspace node on the amount of
// cpu/memory requested by workspaces on that node (i.e. the size of the
// workspace class). CPU is measured in cores, memory in bytes.
// It differentiates between headless and regular workspace nodes using the
// type label, which is useful to determine node utilization and capacity.
type nodeUtilizationVec struct {
	name       string
	desc       *prometheus.Desc
	reconciler *WorkspaceReconciler
}

func newNodeUtilizationVec(r *WorkspaceReconciler) *nodeUtilizationVec {
	name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, workspaceNodeUtilization)
	desc := prometheus.NewDesc(
		name,
		"Amount of resources requested by workspaces on the node (cpu/memory, workspace type)",
		[]string{"node", "resource", "type"},
		prometheus.Labels(map[string]string{}),
	)
	return &nodeUtilizationVec{
		name:       name,
		reconciler: r,
		desc:       desc,
	}
}

// Describe implements Collector. It will send exactly one Desc to the provided channel.
func (n *nodeUtilizationVec) Describe(ch chan<- *prometheus.Desc) {
	ch <- n.desc
}

// Collect implements Collector.
func (n *nodeUtilizationVec) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
	defer cancel()

	var nodes corev1.NodeList
	err := n.reconciler.List(ctx, &nodes)
	if err != nil {
		log.FromContext(ctx).Error(err, "cannot list nodes for node utilization metric")
		return
	}

	var (
		nodeUtilization = make(map[string]map[corev1.ResourceName]float64)
		nodeTypes       = make(map[string]string)
	)
	for _, node := range nodes.Items {
		isRegular := node.Labels["gitpod.io/workload_workspace_regular"] == "true"
		isHeadless := node.Labels["gitpod.io/workload_workspace_headless"] == "true"
		if !isRegular && !isHeadless {
			// Ignore non-workspace nodes.
			continue
		}

		nodeUtilization[node.Name] = map[corev1.ResourceName]float64{
			corev1.ResourceCPU:    0,
			corev1.ResourceMemory: 0,
		}
		// Mark a node as headless only if it is exclusively headless. A node that
		// is both regular and headless (e.g. a preview env) counts as regular.
		nodeTypes[node.Name] = "regular"
		if !isRegular && isHeadless {
			nodeTypes[node.Name] = "headless"
		}
	}

	var workspaces workspacev1.WorkspaceList
	if err = n.reconciler.List(ctx, &workspaces, client.InNamespace(n.reconciler.Config.Namespace)); err != nil {
		log.FromContext(ctx).Error(err, "cannot list workspaces for node utilization metric")
		return
	}

	// Aggregate workspace pod resource requests per node.
	for _, ws := range workspaces.Items {
		// This list is indexed and reads from memory, so it's not that expensive to do this for every workspace.
		pods, err := n.reconciler.listWorkspacePods(ctx, &ws)
		if err != nil {
			log.FromContext(ctx).Error(err, "cannot list workspace pods for node utilization metric", "workspace", ws.Name)
			continue
		}

		if len(pods.Items) == 0 {
			// No pods (yet), not consuming resources on the node.
			continue
		}

		for _, pod := range pods.Items {
			nodeName := pod.Spec.NodeName
			if nodeName == "" {
				// Not yet scheduled.
				continue
			}

			if _, ok := nodeUtilization[nodeName]; !ok {
				nodeUtilization[nodeName] = map[corev1.ResourceName]float64{
					corev1.ResourceCPU:    0,
					corev1.ResourceMemory: 0,
				}
			}

			for _, container := range pod.Spec.Containers {
				requests := container.Resources.Requests
				nodeUtilization[nodeName][corev1.ResourceCPU] += float64(requests.Cpu().MilliValue()) / 1000.0
				nodeUtilization[nodeName][corev1.ResourceMemory] += float64(requests.Memory().Value())
			}
		}
	}

	for nodeName, metrics := range nodeUtilization {
		for resource, value := range metrics {
			nodeType := nodeTypes[nodeName]
			metric, err := prometheus.NewConstMetric(n.desc, prometheus.GaugeValue, value, nodeName, resource.String(), nodeType)
			if err != nil {
				log.FromContext(ctx).Error(err, "cannot create node utilization metric", "node", nodeName, "resource", resource.String(), "type", nodeType)
				continue
			}

			ch <- metric
		}
	}
}
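
// Example PromQL (illustrative): requested CPU per node as a fraction of the
// node's allocatable CPU, joining against kube-state-metrics' series. This
// assumes kube-state-metrics is deployed; the metric name prefix depends on
// the package's metricsNamespace/metricsWorkspaceSubsystem constants.
//
//	<namespace>_<subsystem>_workspace_node_utilization{resource="cpu"}
//	  / on (node) kube_node_status_allocatable{resource="cpu"}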

type workspaceActivityVec struct {
	name       string
	desc       *prometheus.Desc
	reconciler *WorkspaceReconciler
}

func newWorkspaceActivityVec(r *WorkspaceReconciler) *workspaceActivityVec {
	name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, workspaceActivityTotal)
	desc := prometheus.NewDesc(
		name,
		"total number of active workspaces",
		[]string{"active"},
		prometheus.Labels(map[string]string{}),
	)
	return &workspaceActivityVec{
		name:       name,
		desc:       desc,
		reconciler: r,
	}
}

// Describe implements Collector. It will send exactly one Desc to the provided channel.
func (wav *workspaceActivityVec) Describe(ch chan<- *prometheus.Desc) {
	ch <- wav.desc
}

// Collect implements Collector.
func (wav *workspaceActivityVec) Collect(ch chan<- prometheus.Metric) {
	ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
	defer cancel()

	active, notActive, err := wav.getWorkspaceActivityCounts(ctx)
	if err != nil {
		log.FromContext(ctx).Error(err, fmt.Sprintf("cannot determine active/inactive counts - %s will be inaccurate", wav.name))
		return
	}

	activeMetrics, err := prometheus.NewConstMetric(wav.desc, prometheus.GaugeValue, float64(active), "true")
	if err != nil {
		log.FromContext(ctx).Error(err, "cannot create workspace activity metric", "active", "true")
		return
	}
	notActiveMetrics, err := prometheus.NewConstMetric(wav.desc, prometheus.GaugeValue, float64(notActive), "false")
	if err != nil {
		log.FromContext(ctx).Error(err, "cannot create workspace activity metric", "active", "false")
		return
	}

	ch <- activeMetrics
	ch <- notActiveMetrics
}

func (wav *workspaceActivityVec) getWorkspaceActivityCounts(ctx context.Context) (active, notActive int, err error) {
	var workspaces workspacev1.WorkspaceList
	if err = wav.reconciler.List(ctx, &workspaces, client.InNamespace(wav.reconciler.Config.Namespace)); err != nil {
		return 0, 0, err
	}

	for _, ws := range workspaces.Items {
		// Only regular workspaces count towards activity.
		if ws.Spec.Type != workspacev1.WorkspaceTypeRegular {
			continue
		}

		hasActivity := activity.Last(&ws) != nil
		if hasActivity {
			active++
		} else {
			notActive++
		}
	}

	return
}
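
// Sample scrape output (values illustrative; the name prefix depends on the
// package's metricsNamespace/metricsWorkspaceSubsystem constants):
//
//	<namespace>_<subsystem>_workspace_activity_total{active="true"} 10
//	<namespace>_<subsystem>_workspace_activity_total{active="false"} 3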