Path: blob/main/production/grafana-agent-mixin/dashboards.libsonnet
local utils = import './utils.libsonnet';
local g = import 'grafana-builder/grafana.libsonnet';
local grafana = import 'grafonnet/grafana.libsonnet';

local dashboard = grafana.dashboard;
local row = grafana.row;
local singlestat = grafana.singlestat;
local prometheus = grafana.prometheus;
local graphPanel = grafana.graphPanel;
local tablePanel = grafana.tablePanel;
local template = grafana.template;

{
  grafanaDashboards+:: {
    'agent.json':
      utils.injectUtils(g.dashboard('Agent'))
      .addMultiTemplate('cluster', 'agent_build_info', 'cluster')
      .addMultiTemplate('namespace', 'agent_build_info', 'namespace')
      .addMultiTemplate('container', 'agent_build_info', 'container')
      .addMultiTemplateWithAll('pod', 'agent_build_info{container=~"$container"}', 'pod', all='grafana-agent-.*')
      .addRow(
        g.row('Agent Stats')
        .addPanel(
          g.panel('Agent Stats') +
          g.tablePanel([
            'count by (pod, container, version) (agent_build_info{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"})',
            'max by (pod, container) (time() - process_start_time_seconds{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"})',
          ], {
            pod: { alias: 'Pod' },
            container: { alias: 'Container' },
            version: { alias: 'Version' },
            'Value #A': { alias: 'Count', type: 'hidden' },
            'Value #B': { alias: 'Uptime' },
          })
        )
      )
      .addRow(
        g.row('Prometheus Discovery')
        .addPanel(
          g.panel('Target Sync') +
          g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])) by (pod, scrape_job) * 1e3', '{{pod}}/{{scrape_job}}') +
          { yaxes: g.yaxes('ms') }
        )
        .addPanel(
          g.panel('Targets') +
          g.queryPanel('sum by (pod) (prometheus_sd_discovered_targets{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"})', '{{pod}}') +
          g.stack
        )
      )
      .addRow(
        g.row('Prometheus Retrieval')
        .addPanel(
          g.panel('Average Scrape Interval Duration') +
          g.queryPanel(|||
            rate(prometheus_target_interval_length_seconds_sum{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])
            /
            rate(prometheus_target_interval_length_seconds_count{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])
            * 1e3
          |||, '{{pod}} {{interval}} configured') +
          { yaxes: g.yaxes('ms') }
        )
        .addPanel(
          g.panel('Scrape failures') +
          g.queryPanel([
            'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[1m]))',
          ], [
            'exceeded sample limit: {{job}}',
            'duplicate timestamp: {{job}}',
            'out of bounds: {{job}}',
            'out of order: {{job}}',
          ]) +
          g.stack
        )
        .addPanel(
          g.panel('Appended Samples') +
          g.queryPanel('sum by (job, instance_group_name) (rate(agent_wal_samples_appended_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m]))', '{{job}} {{instance_group_name}}') +
          g.stack
        )
      ),
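    // Note: grafanaDashboards is declared with `+::`, i.e. hidden and
    // mergeable, so a downstream configuration can layer extra dashboards on
    // top of this mixin. A minimal sketch (the file name and dashboard title
    // below are illustrative, not part of this mixin):
    //
    //   local g = import 'grafana-builder/grafana.libsonnet';
    //   (import 'dashboards.libsonnet') + {
    //     grafanaDashboards+:: {
    //       'my-team.json': g.dashboard('My Team Overview'),
    //     },
    //   }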
    // Remote write specific dashboard.
    'agent-remote-write.json':
      local timestampComparison =
        graphPanel.new(
          'Highest Timestamp In vs. Highest Timestamp Sent',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          |||
            (
              prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}
              -
              ignoring(url, remote_name) group_right(pod)
              prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}
            )
          |||,
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local remoteSendLatency =
        graphPanel.new(
          'Latency [1m]',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[1m]) / rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[1m])',
          legendFormat='mean {{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ))
        .addTarget(prometheus.target(
          'histogram_quantile(0.99, rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[1m]))',
          legendFormat='p99 {{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local samplesInRate =
        graphPanel.new(
          'Rate in [5m]',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'rate(agent_wal_samples_appended_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local samplesOutRate =
        graphPanel.new(
          'Rate succeeded [5m]',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local currentShards =
        graphPanel.new(
          'Current Shards',
          datasource='$datasource',
          span=12,
          min_span=6,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shards{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local maxShards =
        graphPanel.new(
          'Max Shards',
          datasource='$datasource',
          span=4,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shards_max{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local minShards =
        graphPanel.new(
          'Min Shards',
          datasource='$datasource',
          span=4,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shards_min{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local desiredShards =
        graphPanel.new(
          'Desired Shards',
          datasource='$datasource',
          span=4,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shards_desired{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local shardsCapacity =
        graphPanel.new(
          'Shard Capacity',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));
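      // Queue pressure, as tracked by the panels below: samples buffered in
      // shard queues but not yet sent, the WAL segment the remote-write
      // watcher is currently reading, and the dropped/failed/retried
      // sample counters.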
      local pendingSamples =
        graphPanel.new(
          'Pending Samples',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_samples_pending{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local queueSegment =
        graphPanel.new(
          'Remote Write Current Segment',
          datasource='$datasource',
          span=6,
          formatY1='none',
        )
        .addTarget(prometheus.target(
          'prometheus_wal_watcher_current_segment{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local droppedSamples =
        graphPanel.new(
          'Dropped Samples',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local failedSamples =
        graphPanel.new(
          'Failed Samples',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_samples_failed_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local retriedSamples =
        graphPanel.new(
          'Retried Samples',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_samples_retried_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));

      local enqueueRetries =
        graphPanel.new(
          'Enqueue Retries',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])',
          legendFormat='{{cluster}}:{{pod}}-{{instance_group_name}}-{{url}}',
        ));
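      // Assemble the dashboard: a datasource variable plus label_values
      // template variables over agent_build_info, with `pod` scoped by the
      // selected container and `url` scoped by the selected cluster and pod.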
      dashboard.new(
        'Agent Prometheus Remote Write',
        tags=['grafana-agent-mixin'],
        editable=true,
        refresh='30s',
        time_from='now-1h',
      )
      .addTemplate(
        {
          hide: 0,
          label: null,
          name: 'datasource',
          options: [],
          query: 'prometheus',
          refresh: 1,
          regex: '',
          type: 'datasource',
        },
      )
      .addTemplate(
        template.new(
          'cluster',
          '$datasource',
          'label_values(agent_build_info, cluster)',
          refresh='time',
          current={
            selected: true,
            text: 'All',
            value: '$__all',
          },
          includeAll=true,
        ),
      )
      .addTemplate(
        template.new(
          'namespace',
          '$datasource',
          'label_values(agent_build_info, namespace)',
          refresh='time',
          current={
            selected: true,
            text: 'All',
            value: '$__all',
          },
          includeAll=true,
        ),
      )
      .addTemplate(
        template.new(
          'container',
          '$datasource',
          'label_values(agent_build_info, container)',
          refresh='time',
          current={
            selected: true,
            text: 'All',
            value: '$__all',
          },
          includeAll=true,
        ),
      )
      .addTemplate(
        template.new(
          'pod',
          '$datasource',
          'label_values(agent_build_info{container=~"$container"}, pod)',
          refresh='time',
          current={
            selected: true,
            text: 'All',
            value: '$__all',
          },
          includeAll=true,
        ),
      )
      .addTemplate(
        template.new(
          'url',
          '$datasource',
          'label_values(prometheus_remote_storage_shards{cluster=~"$cluster", pod=~"$pod"}, url)',
          refresh='time',
          includeAll=true,
        )
      )
      .addRow(
        row.new('Timestamps')
        .addPanel(timestampComparison)
        .addPanel(remoteSendLatency)
      )
      .addRow(
        row.new('Samples')
        .addPanel(samplesInRate)
        .addPanel(samplesOutRate)
        .addPanel(pendingSamples)
        .addPanel(droppedSamples)
        .addPanel(failedSamples)
        .addPanel(retriedSamples)
      )
      .addRow(
        row.new('Shards')
        .addPanel(currentShards)
        .addPanel(maxShards)
        .addPanel(minShards)
        .addPanel(desiredShards)
      )
      .addRow(
        row.new('Shard Details')
        .addPanel(shardsCapacity)
      )
      .addRow(
        row.new('Segments')
        .addPanel(queueSegment)
      )
      .addRow(
        row.new('Misc. Rates')
        .addPanel(enqueueRetries)
      ),
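    // Tracing pipeline dashboard. The "Write / Read" row excludes the
    // internal load-balancing path (the "otlp/lb" receiver and "otlp"
    // exporter), which is broken out into its own "Load balancing" row.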
    'agent-tracing-pipeline.json':
      local acceptedSpans =
        graphPanel.new(
          'Accepted spans',
          datasource='$datasource',
          interval='1m',
          span=3,
          legend_show=false,
          fill=0,
        )
        .addTarget(prometheus.target(
          |||
            rate(traces_receiver_accepted_spans{cluster=~"$cluster",namespace=~"$namespace",container=~"$container",pod=~"$pod",receiver!="otlp/lb"}[$__rate_interval])
          |||,
          legendFormat='{{ pod }} - {{ receiver }}/{{ transport }}',
        ));

      local refusedSpans =
        graphPanel.new(
          'Refused spans',
          datasource='$datasource',
          interval='1m',
          span=3,
          legend_show=false,
          fill=0,
        )
        .addTarget(prometheus.target(
          |||
            rate(traces_receiver_refused_spans{cluster=~"$cluster",namespace=~"$namespace",container=~"$container",pod=~"$pod",receiver!="otlp/lb"}[$__rate_interval])
          |||,
          legendFormat='{{ pod }} - {{ receiver }}/{{ transport }}',
        ));

      local sentSpans =
        graphPanel.new(
          'Exported spans',
          datasource='$datasource',
          interval='1m',
          span=3,
          legend_show=false,
          fill=0,
        )
        .addTarget(prometheus.target(
          |||
            rate(traces_exporter_sent_spans{cluster=~"$cluster",namespace=~"$namespace",container=~"$container",pod=~"$pod",exporter!="otlp"}[$__rate_interval])
          |||,
          legendFormat='{{ pod }} - {{ exporter }}',
        ));

      local exportedFailedSpans =
        graphPanel.new(
          'Exported failed spans',
          datasource='$datasource',
          interval='1m',
          span=3,
          legend_show=false,
          fill=0,
        )
        .addTarget(prometheus.target(
          |||
            rate(traces_exporter_send_failed_spans{cluster=~"$cluster",namespace=~"$namespace",container=~"$container",pod=~"$pod",exporter!="otlp"}[$__rate_interval])
          |||,
          legendFormat='{{ pod }} - {{ exporter }}',
        ));

      local receivedSpans(receiverFilter, width) =
        graphPanel.new(
          'Received spans',
          datasource='$datasource',
          interval='1m',
          span=width,
          fill=1,
        )
        .addTarget(prometheus.target(
          |||
            sum(rate(traces_receiver_accepted_spans{cluster=~"$cluster",namespace=~"$namespace",container=~"$container",pod=~"$pod",%s}[$__rate_interval]))
          ||| % receiverFilter,
          legendFormat='Accepted',
        ))
        .addTarget(prometheus.target(
          |||
            sum(rate(traces_receiver_refused_spans{cluster=~"$cluster",namespace=~"$namespace",container=~"$container",pod=~"$pod",%s}[$__rate_interval]))
          ||| % receiverFilter,
          legendFormat='Refused',
        ));

      local exportedSpans(exporterFilter, width) =
        graphPanel.new(
          'Exported spans',
          datasource='$datasource',
          interval='1m',
          span=width,
          fill=1,
        )
        .addTarget(prometheus.target(
          |||
            sum(rate(traces_exporter_sent_spans{cluster=~"$cluster",namespace=~"$namespace",container=~"$container",pod=~"$pod",%s}[$__rate_interval]))
          ||| % exporterFilter,
          legendFormat='Sent',
        ))
        .addTarget(prometheus.target(
          |||
            sum(rate(traces_exporter_send_failed_spans{cluster=~"$cluster",namespace=~"$namespace",container=~"$container",pod=~"$pod",%s}[$__rate_interval]))
          ||| % exporterFilter,
          legendFormat='Send failed',
        ));

      local loadBalancedSpans =
        graphPanel.new(
          'Load-balanced spans',
          datasource='$datasource',
          interval='1m',
          span=3,
          fill=1,
          stack=true,
        )
        .addTarget(prometheus.target(
          |||
            rate(traces_loadbalancer_backend_outcome{cluster=~"$cluster",namespace=~"$namespace",success="true",container=~"$container",pod=~"$pod"}[$__rate_interval])
          |||,
          legendFormat='{{ pod }}',
        ));

      local peersNum =
        graphPanel.new(
          'Number of peers',
          datasource='$datasource',
          interval='1m',
          span=3,
          legend_show=false,
          fill=0,
        )
        .addTarget(prometheus.target(
          |||
            traces_loadbalancer_num_backends{cluster=~"$cluster",namespace=~"$namespace",container=~"$container",pod=~"$pod"}
          |||,
          legendFormat='{{ pod }}',
        ));

      dashboard.new(
        'Agent Tracing Pipeline',
        tags=['grafana-agent-mixin'],
        editable=true,
        refresh='30s',
        time_from='now-1h',
      )
      .addTemplate(
        {
          hide: 0,
          label: null,
          name: 'datasource',
          options: [],
          query: 'prometheus',
          refresh: 1,
          regex: '',
          type: 'datasource',
        },
      )
      .addTemplate(
        template.new(
          'cluster',
          '$datasource',
          'label_values(agent_build_info, cluster)',
          refresh='time',
          current={
            selected: true,
            text: 'All',
            value: '$__all',
          },
          includeAll=true,
        ),
      )
      .addTemplate(
        template.new(
          'namespace',
          '$datasource',
          'label_values(agent_build_info, namespace)',
          refresh='time',
          current={
            selected: true,
            text: 'All',
            value: '$__all',
          },
          includeAll=true,
        ),
      )
      .addTemplate(
        template.new(
          'container',
          '$datasource',
          'label_values(agent_build_info, container)',
          refresh='time',
          current={
            selected: true,
            text: 'All',
            value: '$__all',
          },
          includeAll=true,
        ),
      )
      .addTemplate(
        template.new(
          'pod',
          '$datasource',
          'label_values(agent_build_info{container=~"$container"}, pod)',
          refresh='time',
          current={
            selected: true,
            text: 'All',
            value: '$__all',
          },
          includeAll=true,
        ),
      )
      .addRow(
        row.new('Write / Read')
        .addPanel(acceptedSpans)
        .addPanel(refusedSpans)
        .addPanel(sentSpans)
        .addPanel(exportedFailedSpans)
        .addPanel(receivedSpans('receiver!="otlp/lb"', 6))
        .addPanel(exportedSpans('exporter!="otlp"', 6))
      )
      .addRow(
        row.new('Load balancing')
        .addPanel(loadBalancedSpans)
        .addPanel(peersNum)
        .addPanel(receivedSpans('receiver="otlp/lb"', 3))
        .addPanel(exportedSpans('exporter="otlp"', 3))
      ),
  },
}
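// Usage sketch (not part of the mixin itself): assuming dependencies are
// vendored with jsonnet-bundler under vendor/, a single dashboard can be
// rendered from the jsonnet CLI. The hidden grafanaDashboards field is still
// addressable by key:
//
//   jsonnet -J vendor -e \
//     "(import 'dashboards.libsonnet').grafanaDashboards['agent.json']"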