Path: blob/main/operations/observability/mixins/cross-teams/dashboards/SLOs/workspace-startup-time.libsonnet
3688 views
/**
* Copyright (c) 2021 Gitpod GmbH. All rights reserved.
* Licensed under the GNU Affero General Public License (AGPL).
* See License.AGPL.txt in the project root for license information.
*/
local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local prometheus = grafana.prometheus;
local template = grafana.template;
local graphPanel = grafana.graphPanel;
local heatmapPanel = grafana.heatmapPanel;
local statPanel = grafana.statPanel;
local _config = (import '../../config.libsonnet')._config;
// Hand-written template object for the `datasource` dashboard variable.
// It is of type 'datasource', queries for 'prometheus' sources and is
// pre-selected to the source named 'Prometheus'.
local datasourceTemplate =
  local defaultSource = 'Prometheus';
  {
    name: 'datasource',
    type: 'datasource',
    query: 'prometheus',
    label: null,
    hide: 0,
    refresh: 1,
    regex: '',
    options: [],
    current: { text: defaultSource, value: defaultSource },
  };
// `cluster` dashboard variable: multi-select with an "all" option, populated
// from the values of the configured cluster label on the workspace startup
// histogram. The variable is hidden unless `_config.showMultiCluster` is set.
local clusterTemplate =
  local clusterValuesQuery =
    'label_values(gitpod_ws_manager_mk2_workspace_startup_seconds_count, %s)' % _config.clusterLabel;
  local visibility = if _config.showMultiCluster then '' else 'hide';
  template.new(
    name='cluster',
    datasource='$datasource',
    query=clusterValuesQuery,
    current='all',
    hide=visibility,
    refresh=2,
    includeAll=true,
    multi=true,
    sort=1
  );
// Stat panel for the "higher latency" SLO: Regular workspaces starting in
// under 128 seconds, with a 0.95 target.
//
// Two series are shown:
//   * 'Target'                         - the constant 0.95.
//   * 'Monthly error budget remaining' - the share of startups that landed in
//     the le="128" bucket over the last 30d, minus the 0.95 target. A negative
//     value means the budget has been overspent.
local higherStartupSLOStatPanel =
  statPanel.new(
    'SLO and Error budget - Workspace Start up time < 128s (~2 min)',
    description=
      |||
        Current SLO target and remaining error budget for the higher latency workspace SLO.
        **How to interpret:**
        Imagine we have a 95% target and 5% error budget left, that means that all workspaces started quicker than our SLO target in the last month.
        On the other hand, if we have a 95% target and -5% error budget left, that means that 10% of our workspaces took longer than our SLO target to start up.
      |||,
    datasource='$datasource',
    min=0,
    max=1,
    reducerFunction='lastNotNull',
    orientation='horizontal',
    unit='percentunit',
    colorMode='background',
    graphMode='none',
  )
  .addTarget(prometheus.target('0.95', legendFormat='Target'))
  .addTarget(prometheus.target(
    |||
      (
        sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_bucket{%(clusterLabel)s=~"$cluster",type="Regular", le="128"}[30d]))
        /
        sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_count{%(clusterLabel)s=~"$cluster",type="Regular"}[30d]))
      ) - 0.95
    ||| % _config, legendFormat='Monthly error budget remaining'
  ))
  // Base step (null): any value below 0, i.e. an overspent budget.
  .addThreshold({ color: 'light-red', value: null })
  // Budget remaining is zero or positive.
  .addThreshold({ color: 'dark-green', value: 0 })
  // Threshold values are numeric, like the steps above; the original passed
  // the string '0.95' here.
  // NOTE(review): presumably this step exists so the constant 'Target'
  // series (0.95) renders on a white background - confirm.
  .addThreshold({ color: 'rgb(255, 255, 255)', value: 0.95 })
;
// Stat panel for the "lower latency" SLO: Regular workspaces starting in
// under 16 seconds, with a 0.5 target.
//
// Two series are shown:
//   * 'Target'                         - the constant 0.5.
//   * 'Monthly error budget remaining' - the share of startups that landed in
//     the le="16" bucket over the last 30d, minus the 0.5 target. A negative
//     value means the budget has been overspent.
local lowerStartupSLOStatPanel =
  statPanel.new(
    'SLO and Error budget - Workspace Start up time < 16s',
    // The description previously duplicated the 128s panel's text ("higher
    // latency", 95% target); it now matches this panel's 0.5 target.
    description=
      |||
        Current SLO target and remaining error budget for the lower latency workspace SLO.
        **How to interpret:**
        Imagine we have a 50% target and 50% error budget left, that means that all workspaces started quicker than our SLO target in the last month.
        On the other hand, if we have a 50% target and -5% error budget left, that means that 55% of our workspaces took longer than our SLO target to start up.
      |||,
    datasource='$datasource',
    min=0,
    max=1,
    reducerFunction='lastNotNull',
    orientation='horizontal',
    unit='percentunit',
    colorMode='background',
    graphMode='none',
  )
  .addTarget(prometheus.target('0.5', legendFormat='Target'))
  .addTarget(prometheus.target(
    |||
      (
        sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_bucket{%(clusterLabel)s=~"$cluster",type="Regular", le="16"}[30d]))
        /
        sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_count{%(clusterLabel)s=~"$cluster",type="Regular"}[30d]))
      ) - 0.5
    ||| % _config, legendFormat='Monthly error budget remaining'
  ))
  // Base step (null): any value below 0, i.e. an overspent budget.
  .addThreshold({ color: 'light-red', value: null })
  // Budget remaining is zero or positive.
  .addThreshold({ color: 'dark-green', value: 0 })
  // Threshold values are numeric, like the steps above; the original passed
  // the string '0.5' here.
  // NOTE(review): presumably this step exists so the constant 'Target'
  // series (0.5) renders on a white background - confirm.
  .addThreshold({ color: 'rgb(255, 255, 255)', value: 0.5 })
;
// Heatmap of Regular workspace startup durations, built from the per-`le`
// bucket rates of the startup histogram over `$__rate_interval`.
local workspaceStartupTimeHeatMap =
  heatmapPanel.new(
    title='$cluster: Regular Workspace Startup time heatmap',
    datasource='$datasource',
    yAxis_format='s',
    dataFormat='tsbuckets',
    yBucketBound='auto',
    hideZeroBuckets=true,
    highlightCards=true,
    color_mode='spectrum',
    color_colorScheme='interpolateGreens',
    // Repeat over the `cluster` template variable. This must be the variable
    // name (always 'cluster' in this dashboard, as used by `$cluster` in the
    // title and query), not the Prometheus label name in
    // `_config.clusterLabel`, which only happens to match when that label is
    // itself called 'cluster'.
    repeat='cluster',
  )
  .addTarget(prometheus.target(
    'sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_bucket{%(clusterLabel)s=~"$cluster",type="Regular"}[$__rate_interval])) by (le)' % _config,
    legendFormat='{{le}}',
    format='heatmap'
  ))
;
// Graph of Regular workspace startup time: 95th and 50th percentiles
// (via histogram_quantile over the bucket rates) plus the average
// (rate of `_sum` divided by rate of `_count`).
local workspaceStartupTimeQuantiles =
  // Builds one histogram_quantile target. `quantile` is passed as a string so
  // it is inserted into the query verbatim.
  local quantileTarget(quantile, legend) =
    prometheus.target(
      |||
        histogram_quantile(%(quantile)s,
          sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_bucket{%(clusterLabel)s=~"$cluster",type="Regular"}[$__rate_interval])) by (le)
        )
      ||| % { quantile: quantile, clusterLabel: _config.clusterLabel },
      legendFormat=legend
    );

  graphPanel.new(
    '$cluster: Regular Workspace Startup time Percentiles',
    datasource='$datasource',
    format='s',
    stack=false,
    fill=1,
    legend_show=true,
    // Repeat over the `cluster` template variable. This must be the variable
    // name (always 'cluster' in this dashboard), not the Prometheus label
    // name in `_config.clusterLabel`, which only happens to match when that
    // label is itself called 'cluster'.
    repeat='cluster',
  )
  .addTarget(quantileTarget('0.95', '95th Percentile'))
  .addTarget(quantileTarget('0.5', '50th Percentile'))
  .addTarget(prometheus.target(
    |||
      sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_sum{%(clusterLabel)s=~"$cluster",type="Regular"}[$__rate_interval]))
      /
      sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_count{%(clusterLabel)s=~"$cluster",type="Regular"}[$__rate_interval]))
    ||| % _config, legendFormat='avg'
  ));
// Mixin output: contributes the workspace startup time SLO dashboard to the
// hidden `grafanaDashboards` field, keyed by its JSON file name.
{
  grafanaDashboards+:: {
    'gitpod-slo-workspace-startuptime.json':
      // Row holding the per-cluster heatmap and percentile panels.
      local userExperienceRow =
        row.new('Workspace startup time - User experience')
        .addPanel(workspaceStartupTimeHeatMap, gridPos={ y: 5 })
        .addPanel(workspaceStartupTimeQuantiles, gridPos={ y: 13 });

      // Bare dashboard; variables, stat panels and the row are attached below.
      local sloDashboard =
        dashboard.new(
          '%sSLO / Workspace Startup time' % _config.dashboardNamePrefix,
          time_from='now-1h',
          tags=_config.dashboardTags,
          timezone='utc',
          refresh='30s',
          graphTooltip='shared_crosshair',
          uid='gitpod-slo-workspace-startuptime'
        );

      sloDashboard
      .addTemplate(datasourceTemplate)
      .addTemplate(clusterTemplate)
      // The two SLO stat panels sit side by side on the top line.
      .addPanel(higherStartupSLOStatPanel, gridPos={ x: 4, y: 0, w: 8, h: 4 })
      .addPanel(lowerStartupSLOStatPanel, gridPos={ x: 12, y: 0, w: 8, h: 4 })
      .addRow(userExperienceRow),
  },
}