Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
aos
GitHub Repository: aos/grafana-agent
Path: blob/main/operations/agent-flow-mixin/dashboards/controller.libsonnet
4096 views
local dashboard = import './utils/dashboard.jsonnet';
local panel = import './utils/panel.jsonnet';
local filename = 'agent-flow-controller.json';

{
  [filename]:
    dashboard.new(name='Grafana Agent Flow / Controller') +
    dashboard.withDocsLink(
      url='https://grafana.com/docs/agent/latest/flow/concepts/component_controller/',
      desc='Component controller documentation',
    ) +
    dashboard.withDashboardsLink() +
    dashboard.withUID(std.md5(filename)) +
    dashboard.withTemplateVariablesMixin(
      // Both template variables are filled from label values of the same
      // metric; the namespace lookup is additionally filtered by the value
      // currently selected for $cluster.
      local sourceMetric = 'agent_component_controller_running_components';
      [
        dashboard.newTemplateVariable(
          'cluster',
          'label_values(%s, cluster)\n' % [sourceMetric],
        ),
        dashboard.newTemplateVariable(
          'namespace',
          'label_values(%s{cluster="$cluster"}, namespace)\n' % [sourceMetric],
        ),
      ]
    ) +
    // TODO(@tpaschalis) Make the annotation optional.
    dashboard.withAnnotations(
      // Log selector for the 'Deployments' annotation: lines from the
      // kube-diff-logger container in the selected cluster, parsed as JSON and
      // kept only when the extracted namespace is "grafana-agent" and the
      // extracted name matches grafana-agent.*.
      local deploymentsSelector = '{cluster="$cluster", container="kube-diff-logger"} | json | namespace_extracted="grafana-agent" | name_extracted=~"grafana-agent.*"';
      // RGBA color string handed to the annotation helper as its third argument.
      local deploymentsColor = 'rgba(0, 211, 255, 1)';
      [
        dashboard.newLokiAnnotation('Deployments', deploymentsSelector, deploymentsColor),
      ]
    ) +
    dashboard.withPanelsMixin([
      // Running agents: counts the agent_component_controller_evaluating
      // series found in the selected cluster and namespace.
      (
        local agentCount = panel.newQuery(
          expr='count(agent_component_controller_evaluating{cluster="$cluster", namespace="$namespace"})',
        );
        local helpText = |||
          The number of Grafana Agent Flow instances whose metrics are being sent and reported.
        |||;

        panel.newSingleStat('Running agents') +
        panel.withUnit('agents') +
        panel.withDescription(helpText) +
        panel.withPosition({ x: 0, y: 0, w: 10, h: 4 }) +
        panel.withQueries([agentCount])
      ),

      // Running components: sums agent_component_controller_running_components
      // over every series in the selected cluster and namespace.
      (
        local componentTotal = panel.newQuery(
          expr='sum(agent_component_controller_running_components{cluster="$cluster", namespace="$namespace"})',
        );
        local helpText = |||
          The number of running components across all running agents.
        |||;

        panel.newSingleStat('Running components') +
        panel.withUnit('components') +
        panel.withDescription(helpText) +
        panel.withPosition({ x: 0, y: 4, w: 10, h: 4 }) +
        panel.withQueries([componentTotal])
      ),

      // Overall component health
      //
      // Shows the healthy components as a fraction of all running components
      // in the selected cluster and namespace.
      (
        panel.newGraphedSingleStat('Overall component health') {
          fieldConfig: {
            defaults: {
              // The query yields a ratio and the panel unit is 'percentunit',
              // so the displayed range is pinned to 0..1.
              min: 0,
              max: 1,
              // Text displayed when the query returns no value.
              noValue: 'No components',
            },
          },
        } +
        panel.withUnit('percentunit') +
        panel.withDescription(|||
          The percentage of components which are in a healthy state.
        |||) +
        panel.withPosition({ x: 0, y: 8, w: 10, h: 4 }) +
        panel.withQueries([
          panel.newQuery(
            // `or vector(0)` makes the numerator 0 when no series with
            // health_type="healthy" is returned. Without it the division has
            // an empty left-hand side and yields nothing, so the panel would
            // show 'No components' even though components exist but none of
            // them is healthy. When the denominator is empty the result is
            // still empty, so 'No components' keeps its meaning.
            expr=|||
              (
                sum(agent_component_controller_running_components{cluster="$cluster", namespace="$namespace",health_type="healthy"})
                or vector(0)
              ) /
              sum(agent_component_controller_running_components{cluster="$cluster", namespace="$namespace"})
            |||,
          ),
        ])
      ),

      // Components by health: one bar per health state, each fed by its own
      // instant query.
      (
        // Builds a field override for the series called `seriesName`: the bar
        // stays green from the base step and switches to `alertColor` at the
        // threshold value 1.
        local thresholdOverride(seriesName, alertColor) = {
          matcher: { id: 'byName', options: seriesName },
          properties: [{
            id: 'thresholds',
            value: {
              mode: 'absolute',
              steps: [
                { color: 'green', value: null },
                { color: alertColor, value: 1 },
              ],
            },
          }],
        };

        // Builds the instant query summing the components that carry the given
        // health_type label value; `or vector(0)` supplies 0 when the sum
        // returns nothing.
        local healthQuery(legend, healthType) = panel.newInstantQuery(
          legendFormat=legend,
          expr='sum(agent_component_controller_running_components{cluster="$cluster", namespace="$namespace", health_type="%s"}) or vector(0)' % [healthType],
        );

        panel.new(title='Components by health', type='bargauge') + {
          options: {
            orientation: 'vertical',
            showUnfilled: true,
          },
          fieldConfig: {
            // Default: a single green step, i.e. bars are green at any value.
            defaults: {
              min: 0,
              thresholds: {
                mode: 'absolute',
                steps: [{ color: 'green', value: null }],
              },
            },
            // The non-healthy series get a distinct color from value 1 upward.
            overrides: [
              thresholdOverride('Unhealthy', 'red'),
              thresholdOverride('Unknown', 'blue'),
              thresholdOverride('Exited', 'orange'),
            ],
          },
        } +
        panel.withDescription(|||
          Breakdown of components by health across all running agents.

          * Healthy: components have been evaluated completely and are reporting themselves as healthy.
          * Unhealthy: Components either could not be evaluated or are reporting themselves as unhealthy.
          * Unknown: A component has been created but has not yet been started.
          * Exited: A component has exited. It will not return to the running state.

          More information on a component's health state can be retrieved using
          the Grafana Agent Flow UI.

          Note that components may be in a degraded state even if they report
          themselves as healthy. Use component-specific dashboards and alerts
          to observe detailed information about the behavior of a component.
        |||) +
        panel.withPosition({ x: 10, y: 0, w: 14, h: 12 }) +
        panel.withQueries([
          healthQuery('Healthy', 'healthy'),
          healthQuery('Unhealthy', 'unhealthy'),
          healthQuery('Unknown', 'unknown'),
          healthQuery('Exited', 'exited'),
        ])
      ),

      // Graph evaluation rate: per-instance rate of the
      // agent_component_evaluation_seconds_count counter.
      (
        // Render samples as individual points of size 3.
        local pointStyle = {
          fieldConfig: {
            defaults: {
              custom: {
                drawStyle: 'points',
                pointSize: 3,
              },
            },
          },
        };
        local evaluationRate = panel.newQuery(
          expr='sum by (instance) (rate(agent_component_evaluation_seconds_count{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',
        );
        local helpText = |||
          The frequency in which the component graph gets updated.
        |||;

        panel.new(title='Graph evaluation rate', type='timeseries') + pointStyle +
        panel.withUnit('ops') +
        panel.withDescription(helpText) +
        panel.withPosition({ x: 0, y: 12, w: 8, h: 10 }) +
        panel.withMultiTooltip() +
        panel.withQueries([evaluationRate])
      ),

      // Graph evaluation time: 99th and 50th percentile plus the average,
      // all derived from the agent_component_evaluation_seconds histogram.
      (
        // Builds a histogram_quantile query over the evaluation-time buckets.
        // `quantile` is passed as a string so it is inserted into the
        // expression exactly as written (e.g. '0.50').
        local quantileQuery(quantile, legend) = panel.newQuery(
          expr='histogram_quantile(%s, sum by (le) (rate(agent_component_evaluation_seconds_bucket{cluster="$cluster",namespace="$namespace"}[$__rate_interval])))' % [quantile],
          legendFormat=legend,
        );

        // Average duration: rate of the histogram sum divided by the rate of
        // the histogram count.
        local averageQuery = panel.newQuery(
          expr=|||
            sum(rate(agent_component_evaluation_seconds_sum{cluster="$cluster",namespace="$namespace"}[$__rate_interval])) /
            sum(rate(agent_component_evaluation_seconds_count{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))
          |||,
          legendFormat='Average',
        );

        panel.new(title='Graph evaluation time', type='timeseries') +
        panel.withUnit('s') +
        panel.withDescription(|||
          The percentiles for how long it takes to complete a graph evaluation.

          Graph evaluations must complete for components to have the latest
          arguments. The longer graph evaluations take, the slower it will be to
          reconcile the state of components.

          If evaluation is taking too long, consider sharding your components to
          deal with smaller amounts of data and reuse data as much as possible.
        |||) +
        panel.withPosition({ x: 8, y: 12, w: 8, h: 10 }) +
        panel.withQueries([
          quantileQuery('0.99', '99th percentile'),
          quantileQuery('0.50', '50th percentile'),
          averageQuery,
        ])
      ),

      // Graph evaluation histogram: heatmap of the per-bucket increase of the
      // agent_component_evaluation_seconds histogram, one series per `le`.
      (
        local bucketIncrease = panel.newQuery(
          expr='sum by (le) (increase(agent_component_evaluation_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',
          format='heatmap',
          legendFormat='{{le}}',
        );
        local helpText = |||
          Detailed histogram view of how long graph evaluations take.

          The goal is to design your config so that evaluations take as little
          time as possible; under 100ms is a good goal.
        |||;

        panel.newHeatmap('Graph evaluation histogram') +
        panel.withDescription(helpText) +
        panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) +
        panel.withQueries([bucketIncrease])
      ),
    ]),
}