Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
gitpod-io
GitHub Repository: gitpod-io/gitpod
Path: blob/main/operations/observability/mixins/cross-teams/dashboards/SLOs/workspace-startup-time.libsonnet
2501 views
/**
 * Copyright (c) 2021 Gitpod GmbH. All rights reserved.
 * Licensed under the GNU Affero General Public License (AGPL).
 * See License.AGPL.txt in the project root for license information.
 */

local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local prometheus = grafana.prometheus;
local template = grafana.template;
local graphPanel = grafana.graphPanel;
local heatmapPanel = grafana.heatmapPanel;
local statPanel = grafana.statPanel;
local _config = (import '../../config.libsonnet')._config;

local datasourceTemplate = {
  current: {
    text: 'Prometheus',
    value: 'Prometheus',
  },
  hide: 0,
  label: null,
  name: 'datasource',
  options: [],
  query: 'prometheus',
  refresh: 1,
  regex: '',
  type: 'datasource',
};


local clusterTemplate =
  template.new(
    name='cluster',
    datasource='$datasource',
    query='label_values(gitpod_ws_manager_mk2_workspace_startup_seconds_count, %s)' % _config.clusterLabel,
    current='all',
    hide=if _config.showMultiCluster then '' else 'hide',
    refresh=2,
    includeAll=true,
    multi=true,
    sort=1
  );

local higherStartupSLOStatPanel =
  statPanel.new(
    'SLO and Error budget - Workspace Start up time < 128s (~2 min)',
    description=
    |||
      Current SLO target and remaining error budget for the higher latency workspace SLO.

      **How to interpret:**
      Imagine we have a 95% target and 5% error budget left, that means that all workspaces started quicker than our SLO target in the last month.
      On the other hand, if we have a 95% target and -5% error budget left, that means that 10% of our workspaces took longer than our SLO target to start up.
    |||,
    datasource='$datasource',
    min=0,
    max=1,
    reducerFunction='lastNotNull',
    orientation='horizontal',
    unit='percentunit',
    colorMode='background',
    graphMode='none',
  )
  .addTarget(prometheus.target('0.95', legendFormat='Target'))
  .addTarget(prometheus.target(
    |||
      (
        sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_bucket{%(clusterLabel)s=~"$cluster",type="Regular", le="128"}[30d]))
        /
        sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_count{%(clusterLabel)s=~"$cluster",type="Regular"}[30d]))
      ) - 0.95
    ||| % _config, legendFormat='Monthly error budget remaining'
  ))
  .addThreshold({ color: 'light-red', value: null })
  .addThreshold({ color: 'dark-green', value: 0 })
  .addThreshold({ color: 'rgb(255, 255, 255)', value: '0.95' })
;

local lowerStartupSLOStatPanel =
  statPanel.new(
    'SLO and Error budget - Workspace Start up time < 16s',
    description=
    |||
      Current SLO target and remaining error budget for the higher latency workspace SLO.

      **How to interpret:**

      Imagine we have a 95% target and 5% error budget left, that means that all workspaces started quicker than our SLO target in the last month.
      On the other hand, if we have a 95% target and -5% error budget left, that means that 10% of our workspaces took longer than our SLO target to start up.
    |||,
    datasource='$datasource',
    min=0,
    max=1,
    reducerFunction='lastNotNull',
    orientation='horizontal',
    unit='percentunit',
    colorMode='background',
    graphMode='none',
  )
  .addTarget(prometheus.target('0.5', legendFormat='Target'))
  .addTarget(prometheus.target(
    |||
      (
        sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_bucket{%(clusterLabel)s=~"$cluster",type="Regular", le="16"}[30d]))
        /
        sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_count{%(clusterLabel)s=~"$cluster",type="Regular"}[30d]))
      ) - 0.5
    ||| % _config, legendFormat='Monthly error budget remaining'
  ))
  .addThreshold({ color: 'light-red', value: null })
  .addThreshold({ color: 'dark-green', value: 0 })
  .addThreshold({ color: 'rgb(255, 255, 255)', value: '0.5' })
;

local workspaceStartupTimeHeatMap =
  heatmapPanel.new(
    title='$cluster: Regular Workspace Startup time heatmap',
    datasource='$datasource',
    yAxis_format='s',
    dataFormat='tsbuckets',
    yBucketBound='auto',
    hideZeroBuckets=true,
    highlightCards=true,
    color_mode='spectrum',
    color_colorScheme='interpolateGreens',
    repeat='%s' % _config.clusterLabel,
  )
  .addTarget(prometheus.target(
    'sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_bucket{%(clusterLabel)s=~"$cluster",type="Regular"}[$__rate_interval])) by (le)' % _config,
    legendFormat='{{le}}',
    format='heatmap'
  ))
;

local workspaceStartupTimeQuantiles =
  graphPanel.new(
    '$cluster: Regular Workspace Startup time Percentiles',
    datasource='$datasource',
    format='s',
    stack=false,
    fill=1,
    legend_show=true,
    repeat='%s' % _config.clusterLabel,
  )
  .addTarget(prometheus.target(
    |||
      histogram_quantile(0.95,
        sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_bucket{%(clusterLabel)s=~"$cluster",type="Regular"}[$__rate_interval])) by (le)
      )
    ||| % _config, legendFormat='95th Percentile'
  ))
  .addTarget(prometheus.target(
    |||
      histogram_quantile(0.5,
        sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_bucket{%(clusterLabel)s=~"$cluster",type="Regular"}[$__rate_interval])) by (le)
      )
    ||| % _config, legendFormat='50th Percentile'
  ))
  .addTarget(prometheus.target(
    |||
      sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_sum{%(clusterLabel)s=~"$cluster",type="Regular"}[$__rate_interval]))
      /
      sum(rate(gitpod_ws_manager_mk2_workspace_startup_seconds_count{%(clusterLabel)s=~"$cluster",type="Regular"}[$__rate_interval]))
    ||| % _config, legendFormat='avg'
  ));

{
  grafanaDashboards+:: {
    'gitpod-slo-workspace-startuptime.json':
      dashboard.new(
        '%sSLO / Workspace Startup time' % _config.dashboardNamePrefix,
        time_from='now-1h',
        tags=(_config.dashboardTags),
        timezone='utc',
        refresh='30s',
        graphTooltip='shared_crosshair',
        uid='gitpod-slo-workspace-startuptime'
      )
      .addTemplate(datasourceTemplate)
      .addTemplate(clusterTemplate)
      .addPanel(higherStartupSLOStatPanel, gridPos={ x: 4, y: 0, w: 8, h: 4 })
      .addPanel(lowerStartupSLOStatPanel, gridPos={ x: 12, y: 0, w: 8, h: 4 })
      .addRow(
        row.new('Workspace startup time - User experience')
        .addPanel(workspaceStartupTimeHeatMap, gridPos={ y: 5 })
        .addPanel(workspaceStartupTimeQuantiles, gridPos={ y: 13 })
      ),
  },
}