// Path: blob/main/production/grafana-agent-mixin/dashboards.libsonnet
// 5459 views
local utils = import './utils.libsonnet';
local g = import 'grafana-builder/grafana.libsonnet';
local grafana = import 'grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local singlestat = grafana.singlestat;
local prometheus = grafana.prometheus;
local graphPanel = grafana.graphPanel;
local tablePanel = grafana.tablePanel;
local template = grafana.template;
{
grafanaDashboards+:: {
// Agent overview dashboard: a stats table followed by Prometheus discovery
// and retrieval rows, assembled with the grafana-builder helpers.
'agent.json':
  // Label matchers repeated in every panel query on this dashboard.
  local matchers = 'cluster=~"$cluster", namespace=~"$namespace", container=~"$container"';

  // Table fed by two queries: a build-info count and the time since process start.
  local agentStats =
    g.panel('Agent Stats') +
    g.tablePanel(
      [
        'count by (pod, container, version) (agent_build_info{%s})' % matchers,
        'max by (pod, container) (time() - process_start_time_seconds{%s})' % matchers,
      ],
      {
        pod: { alias: 'Pod' },
        container: { alias: 'Container' },
        version: { alias: 'Version' },
        // The value column of the first query is hidden.
        'Value #A': { alias: 'Count', type: 'hidden' },
        'Value #B': { alias: 'Uptime' },
      }
    );

  // The query scales the *_seconds metric by 1e3 to match the 'ms' y-axis.
  local targetSync =
    g.panel('Target Sync') +
    g.queryPanel(
      'sum(rate(prometheus_target_sync_length_seconds_sum{%s}[5m])) by (pod, scrape_job) * 1e3' % matchers,
      '{{pod}}/{{scrape_job}}'
    ) +
    { yaxes: g.yaxes('ms') };

  local discoveredTargets =
    g.panel('Targets') +
    g.queryPanel(
      'sum by (pod) (prometheus_sd_discovered_targets{%s})' % matchers,
      '{{pod}}'
    ) +
    g.stack;

  // Sum rate divided by count rate, scaled by 1e3 to match the 'ms' y-axis.
  local scrapeInterval =
    g.panel('Average Scrape Interval Duration') +
    g.queryPanel(
      |||
        rate(prometheus_target_interval_length_seconds_sum{%(matchers)s}[5m])
        /
        rate(prometheus_target_interval_length_seconds_count{%(matchers)s}[5m])
        * 1e3
      ||| % { matchers: matchers },
      '{{pod}} {{interval}} configured'
    ) +
    { yaxes: g.yaxes('ms') };

  // One entry per scrape failure counter; order fixes the order of the
  // queries and of their legends in the panel below.
  local scrapeFailureKinds = [
    { metric: 'prometheus_target_scrapes_exceeded_sample_limit_total', legend: 'exceeded sample limit' },
    { metric: 'prometheus_target_scrapes_sample_duplicate_timestamp_total', legend: 'duplicate timestamp' },
    { metric: 'prometheus_target_scrapes_sample_out_of_bounds_total', legend: 'out of bounds' },
    { metric: 'prometheus_target_scrapes_sample_out_of_order_total', legend: 'out of order' },
  ];

  local scrapeFailures =
    g.panel('Scrape failures') +
    g.queryPanel(
      [
        'sum by (job) (rate(%s{%s}[1m]))' % [failure.metric, matchers]
        for failure in scrapeFailureKinds
      ],
      ['%s: {{job}}' % failure.legend for failure in scrapeFailureKinds]
    ) +
    g.stack;

  local appendedSamples =
    g.panel('Appended Samples') +
    g.queryPanel(
      'sum by (job, instance_group_name) (rate(agent_wal_samples_appended_total{%s}[5m]))' % matchers,
      '{{job}} {{instance_group_name}}'
    ) +
    g.stack;

  utils.injectUtils(g.dashboard('Agent'))
  .addMultiTemplate('cluster', 'agent_build_info', 'cluster')
  .addMultiTemplate('namespace', 'agent_build_info', 'namespace')
  .addMultiTemplate('container', 'agent_build_info', 'container')
  .addMultiTemplateWithAll('pod', 'agent_build_info{container=~"$container"}', 'pod', all='grafana-agent-.*')
  .addRow(
    g.row('Agent Stats')
    .addPanel(agentStats)
  )
  .addRow(
    g.row('Prometheus Discovery')
    .addPanel(targetSync)
    .addPanel(discoveredTargets)
  )
  .addRow(
    g.row('Prometheus Retrieval')
    .addPanel(scrapeInterval)
    .addPanel(scrapeFailures)
    .addPanel(appendedSamples)
  ),
// Remote write specific dashboard.
'agent-remote-write.json':
  // Label matchers repeated in every panel query on this dashboard.
  local matchers = 'cluster=~"$cluster", namespace=~"$namespace", container=~"$container"';
  // Legend shared by all panels; the latency panel prefixes it per target.
  local seriesLegend = '{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}';

  // Graph panel with a single Prometheus target using the shared legend.
  local queryGraph(title, expr, span=6) =
    graphPanel.new(title, datasource='$datasource', span=span)
    .addTarget(prometheus.target(expr, legendFormat=seriesLegend));

  // Graph of a metric as-is, restricted by the shared matchers.
  local metricGraph(title, metric, span=6) =
    queryGraph(title, '%s{%s}' % [metric, matchers], span);

  // Graph of the 5m rate of a metric, restricted by the shared matchers.
  local rateGraph(title, metric) =
    queryGraph(title, 'rate(%s{%s}[5m])' % [metric, matchers]);

  local timestampComparison = queryGraph(
    'Highest Timestamp In vs. Highest Timestamp Sent',
    |||
      (
      prometheus_remote_storage_highest_timestamp_in_seconds{%(matchers)s}
      -
      ignoring(url, remote_name) group_right(pod)
      prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(matchers)s}
      )
    ||| % { matchers: matchers },
  );

  // Two targets on one panel: mean (sum rate / count rate) and p99 from the buckets.
  local remoteSendLatency =
    graphPanel.new('Latency [1m]', datasource='$datasource', span=6)
    .addTarget(prometheus.target(
      'rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{%(matchers)s}[1m]) / rate(prometheus_remote_storage_sent_batch_duration_seconds_count{%(matchers)s}[1m])' % { matchers: matchers },
      legendFormat='mean ' + seriesLegend,
    ))
    .addTarget(prometheus.target(
      'histogram_quantile(0.99, rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%s}[1m]))' % matchers,
      legendFormat='p99 ' + seriesLegend,
    ));

  local samplesInRate = rateGraph('Rate in [5m]', 'agent_wal_samples_appended_total');

  // Uses `or` to fall back to a second metric name when the first has no series.
  local samplesOutRate = queryGraph(
    'Rate succeeded [5m]',
    'rate(prometheus_remote_storage_succeeded_samples_total{%(matchers)s}[5m]) or rate(prometheus_remote_storage_samples_total{%(matchers)s}[5m])' % { matchers: matchers },
  );

  // Spelled out because it is the only panel that also sets min_span.
  local currentShards =
    graphPanel.new('Current Shards', datasource='$datasource', span=12, min_span=6)
    .addTarget(prometheus.target(
      'prometheus_remote_storage_shards{%s}' % matchers,
      legendFormat=seriesLegend,
    ));

  local maxShards = metricGraph('Max Shards', 'prometheus_remote_storage_shards_max', span=4);
  local minShards = metricGraph('Min Shards', 'prometheus_remote_storage_shards_min', span=4);
  local desiredShards = metricGraph('Desired Shards', 'prometheus_remote_storage_shards_desired', span=4);
  local shardsCapacity = metricGraph('Shard Capacity', 'prometheus_remote_storage_shard_capacity');
  local pendingSamples = metricGraph('Pending Samples', 'prometheus_remote_storage_samples_pending');

  // Spelled out because it is the only panel that also sets formatY1.
  local queueSegment =
    graphPanel.new('Remote Write Current Segment', datasource='$datasource', span=6, formatY1='none')
    .addTarget(prometheus.target(
      'prometheus_wal_watcher_current_segment{%s}' % matchers,
      legendFormat=seriesLegend,
    ));

  local droppedSamples = rateGraph('Dropped Samples', 'prometheus_remote_storage_samples_dropped_total');
  local failedSamples = rateGraph('Failed Samples', 'prometheus_remote_storage_samples_failed_total');
  local retriedSamples = rateGraph('Retried Samples', 'prometheus_remote_storage_samples_retried_total');
  local enqueueRetries = rateGraph('Enqueue Retries', 'prometheus_remote_storage_enqueue_retries_total');

  // Datasource picker; the other templates and every panel refer to it as $datasource.
  local datasourceTemplate = {
    hide: 0,
    label: null,
    name: 'datasource',
    options: [],
    query: 'prometheus',
    refresh: 1,
    regex: '',
    type: 'datasource',
  };

  // `current` value handed to each label template below.
  local allSelected = {
    selected: true,
    text: 'All',
    value: '$__all',
  };

  // Query-backed template variable with includeAll enabled and refresh='time'.
  local labelTemplate(name, query) =
    template.new(
      name,
      '$datasource',
      query,
      refresh='time',
      current=allSelected,
      includeAll=true,
    );

  dashboard.new('Agent Prometheus Remote Write', tags=['grafana-agent-mixin'], editable=true, refresh='30s', time_from='now-1h')
  .addTemplate(datasourceTemplate)
  .addTemplate(labelTemplate('cluster', 'label_values(agent_build_info, cluster)'))
  .addTemplate(labelTemplate('namespace', 'label_values(agent_build_info, namespace)'))
  .addTemplate(labelTemplate('container', 'label_values(agent_build_info, container)'))
  .addTemplate(labelTemplate('pod', 'label_values(agent_build_info{container=~"$container"}, pod)'))
  // The url template is the only one created without a `current` value.
  .addTemplate(
    template.new(
      'url',
      '$datasource',
      'label_values(prometheus_remote_storage_shards{cluster=~"$cluster", pod=~"$pod"}, url)',
      refresh='time',
      includeAll=true,
    )
  )
  .addRow(
    row.new('Timestamps')
    .addPanel(timestampComparison)
    .addPanel(remoteSendLatency)
  )
  .addRow(
    row.new('Samples')
    .addPanel(samplesInRate)
    .addPanel(samplesOutRate)
    .addPanel(pendingSamples)
    .addPanel(droppedSamples)
    .addPanel(failedSamples)
    .addPanel(retriedSamples)
  )
  .addRow(
    row.new('Shards')
    .addPanel(currentShards)
    .addPanel(maxShards)
    .addPanel(minShards)
    .addPanel(desiredShards)
  )
  .addRow(
    row.new('Shard Details')
    .addPanel(shardsCapacity)
  )
  .addRow(
    row.new('Segments')
    .addPanel(queueSegment)
  )
  .addRow(
    row.new('Misc. Rates')
    .addPanel(enqueueRetries)
  ),
// Tracing pipeline dashboard: receiver/exporter span rates plus a row for
// the load-balancing metrics.
'agent-tracing-pipeline.json':
  // Label matchers shared by the templated queries below (no spaces after commas).
  local podMatchers = 'cluster=~"$cluster",namespace=~"$namespace",container=~"$container",pod=~"$pod"';

  // Per-series rate of `metric`, with one extra matcher appended to the shared ones.
  local rateExpr(metric, extraMatcher) =
    |||
      rate(%s{%s,%s}[$__rate_interval])
    ||| % [metric, podMatchers, extraMatcher];

  // Same as rateExpr, but summed into a single series.
  local summedRateExpr(metric, extraMatcher) =
    |||
      sum(rate(%s{%s,%s}[$__rate_interval]))
    ||| % [metric, podMatchers, extraMatcher];

  // Span-3 graph with the legend hidden, fill=0 and a single target.
  local perSeriesGraph(title, expr, legend) =
    graphPanel.new(
      title,
      datasource='$datasource',
      interval='1m',
      span=3,
      legend_show=false,
      fill=0,
    )
    .addTarget(prometheus.target(expr, legendFormat=legend));

  // Graph with two summed-rate targets that share one extra matcher.
  local outcomeGraph(title, width, filter, firstMetric, firstLegend, secondMetric, secondLegend) =
    graphPanel.new(
      title,
      datasource='$datasource',
      interval='1m',
      span=width,
      fill=1,
    )
    .addTarget(prometheus.target(summedRateExpr(firstMetric, filter), legendFormat=firstLegend))
    .addTarget(prometheus.target(summedRateExpr(secondMetric, filter), legendFormat=secondLegend));

  local receiverLegend = '{{ pod }} - {{ receiver }}/{{ transport }}';
  local exporterLegend = '{{ pod }} - {{ exporter }}';
  // Matchers used by the first row to leave out the "otlp/lb" receiver and "otlp" exporter.
  local notLbReceiver = 'receiver!="otlp/lb"';
  local notLbExporter = 'exporter!="otlp"';

  local acceptedSpans =
    perSeriesGraph('Accepted spans', rateExpr('traces_receiver_accepted_spans', notLbReceiver), receiverLegend);
  local refusedSpans =
    perSeriesGraph('Refused spans', rateExpr('traces_receiver_refused_spans', notLbReceiver), receiverLegend);
  local sentSpans =
    perSeriesGraph('Exported spans', rateExpr('traces_exporter_sent_spans', notLbExporter), exporterLegend);
  local exportedFailedSpans =
    perSeriesGraph('Exported failed spans', rateExpr('traces_exporter_send_failed_spans', notLbExporter), exporterLegend);

  // Accepted vs. refused totals for the receivers selected by receiverFilter.
  local receivedSpans(receiverFilter, width) =
    outcomeGraph(
      'Received spans',
      width,
      receiverFilter,
      'traces_receiver_accepted_spans',
      'Accepted',
      'traces_receiver_refused_spans',
      'Refused',
    );

  // Sent vs. send-failed totals for the exporters selected by exporterFilter.
  local exportedSpans(exporterFilter, width) =
    outcomeGraph(
      'Exported spans',
      width,
      exporterFilter,
      'traces_exporter_sent_spans',
      'Sent',
      'traces_exporter_send_failed_spans',
      'Send failed',
    );

  // Written out in full: the success matcher sits between the shared ones,
  // so podMatchers cannot be reused without changing the query text.
  local loadBalancedSpans =
    graphPanel.new(
      'Load-balanced spans',
      datasource='$datasource',
      interval='1m',
      span=3,
      fill=1,
      stack=true,
    )
    .addTarget(prometheus.target(
      |||
        rate(traces_loadbalancer_backend_outcome{cluster=~"$cluster",namespace=~"$namespace",success="true",container=~"$container",pod=~"$pod"}[$__rate_interval])
      |||,
      legendFormat='{{ pod }}',
    ));

  local peersNum =
    perSeriesGraph(
      'Number of peers',
      |||
        traces_loadbalancer_num_backends{%s}
      ||| % podMatchers,
      '{{ pod }}',
    );

  // Datasource picker; the other templates and every panel refer to it as $datasource.
  local datasourceTemplate = {
    hide: 0,
    label: null,
    name: 'datasource',
    options: [],
    query: 'prometheus',
    refresh: 1,
    regex: '',
    type: 'datasource',
  };

  // `current` value handed to each label template below.
  local allSelected = {
    selected: true,
    text: 'All',
    value: '$__all',
  };

  // Query-backed template variable with includeAll enabled and refresh='time'.
  local labelTemplate(name, query) =
    template.new(
      name,
      '$datasource',
      query,
      refresh='time',
      current=allSelected,
      includeAll=true,
    );

  dashboard.new('Agent Tracing Pipeline', tags=['grafana-agent-mixin'], editable=true, refresh='30s', time_from='now-1h')
  .addTemplate(datasourceTemplate)
  .addTemplate(labelTemplate('cluster', 'label_values(agent_build_info, cluster)'))
  .addTemplate(labelTemplate('namespace', 'label_values(agent_build_info, namespace)'))
  .addTemplate(labelTemplate('container', 'label_values(agent_build_info, container)'))
  .addTemplate(labelTemplate('pod', 'label_values(agent_build_info{container=~"$container"}, pod)'))
  .addRow(
    row.new('Write / Read')
    .addPanel(acceptedSpans)
    .addPanel(refusedSpans)
    .addPanel(sentSpans)
    .addPanel(exportedFailedSpans)
    .addPanel(receivedSpans(notLbReceiver, 6))
    .addPanel(exportedSpans(notLbExporter, 6))
  )
  .addRow(
    row.new('Load balancing')
    .addPanel(loadBalancedSpans)
    .addPanel(peersNum)
    .addPanel(receivedSpans('receiver="otlp/lb"', 3))
    .addPanel(exportedSpans('exporter="otlp"', 3))
  ),
},
}