// Source: aos/grafana-agent, operations/agent-flow-mixin/dashboards/cluster-node.libsonnet
local dashboard = import './utils/dashboard.jsonnet';
local panel = import './utils/panel.jsonnet';
local filename = 'agent-cluster-node.json';

{
  [filename]:
    dashboard.new(name='Grafana Agent Flow / Cluster Node') +
    dashboard.withDocsLink(
      url='https://grafana.com/docs/agent/latest/flow/reference/cli/run/#clustered-mode-experimental',
      desc='Clustering documentation',
    ) +
    dashboard.withDashboardsLink() +
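    // Derive the dashboard UID deterministically from the filename so it stays stable across regenerations.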
    dashboard.withUID(std.md5(filename)) +
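    // The template variables below are chained: each subsequent label_values query is
    // filtered by the variables selected before it (cluster -> namespace -> instance).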
    dashboard.withTemplateVariablesMixin([
      dashboard.newTemplateVariable('cluster', |||
        label_values(agent_component_controller_running_components, cluster)
      |||),
      dashboard.newTemplateVariable('namespace', |||
        label_values(agent_component_controller_running_components{cluster="$cluster"}, namespace)
      |||),
      dashboard.newTemplateVariable('instance', |||
        label_values(agent_component_controller_running_components{cluster="$cluster", namespace="$namespace"}, instance)
      |||),
    ]) +
    // TODO(@tpaschalis) Make the annotation optional.
    dashboard.withAnnotations([
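      // Overlays deployment events by querying Loki for kube-diff-logger output that
      // mentions grafana-agent objects in the grafana-agent namespace.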
      dashboard.newLokiAnnotation('Deployments', '{cluster="$cluster", container="kube-diff-logger"} | json | namespace_extracted="grafana-agent" | name_extracted=~"grafana-agent.*"', 'rgba(0, 211, 255, 1)'),
    ]) +
    dashboard.withPanelsMixin([
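      // Panels are placed on Grafana's 24-column grid; 'row' panels act as 1-unit-high
      // section headers and the charts beneath them are 8 grid units high.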
      // Node Info row
      (
        panel.new('Node Info', 'row') +
        panel.withPosition({ h: 1, w: 24, x: 0, y: 0 })
      ),
      // Node Info
      (
        panel.new('Node Info', 'table') +
        panel.withDescription(|||
          Information about a specific cluster node.

          * Lamport clock time: The observed Lamport time on this node's clock, used to provide a partial ordering of gossip messages. Nodes should ideally observe roughly the same time, meaning they are up to date on the cluster state. A node that is falling behind has not processed as many messages recently and may have an outdated view of its peers.

          * Internal cluster state observers: The number of Observer functions that are registered to run whenever the node detects a cluster change.

          * Gossip health score: A health score assigned to this node by the memberlist implementation. The lower, the better.

          * Gossip protocol version: The protocol version used by nodes to communicate with one another. It should match across all nodes.
        |||) +
        panel.withPosition({ x: 0, y: 1, w: 12, h: 8 }) +
        panel.withQueries([
          panel.newNamedInstantQuery(
            expr='sum(cluster_node_lamport_time{instance="$instance"})',
            refId='Lamport clock time',
            format='table',
          ),
          panel.newNamedInstantQuery(
            expr='sum(cluster_node_update_observers{instance="$instance"})',
            refId='Internal cluster state observers',
            format='table',
          ),
          panel.newNamedInstantQuery(
            expr='sum(cluster_node_gossip_health_score{instance="$instance"})',
            refId='Gossip health score',
            format='table',
          ),
          panel.newNamedInstantQuery(
            expr='sum(cluster_node_gossip_proto_version{instance="$instance"})',
            refId='Gossip protocol version',
            format='table',
          ),
        ]) +
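        // The instant queries above surface as table columns named "Value #<refId>";
        // the transformations below strip that prefix, reduce each column to a single
        // value, and rename the fields so the table reads as Metric / Value pairs.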
        panel.withTransformations([
          {
            id: 'renameByRegex',
            options: {
              regex: 'Value #(.*)',
              renamePattern: '$1',
            },
          },
          {
            id: 'reduce',
            options: {},
          },
          {
            id: 'organize',
            options: {
              excludeByName: {},
              indexByName: {},
              renameByName: {
                Field: 'Metric',
                Max: 'Value',
              },
            },
          },
        ])
      ),
      // Gossip ops/sec
      (
        panel.new('Gossip ops/s', 'timeseries') +
        panel.withPosition({ x: 12, y: 1, w: 12, h: 8 }) +
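        // Per-second rate of gossip events received by this node, broken out by the 'event' label.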
        panel.withQueries([
          panel.newQuery(
            expr='rate(cluster_node_gossip_received_events_total{instance="$instance"}[$__rate_interval])',
            legendFormat='{{event}}'
          ),
        ])
      ),
      // Known peers
      (
        panel.new('Known peers', 'stat') +
        panel.withDescription(|||
          Peers known to the node (including the local node itself).
        |||) +
        panel.withPosition({ x: 0, y: 9, w: 12, h: 8 }) +
        panel.withQueries([
          panel.newQuery(
            expr='sum(cluster_node_peers{instance="$instance"})',
          ),
        ]) +
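        // 'suffix:peers' renders the stat with a literal " peers" suffix after the value.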
        panel.withUnit('suffix:peers')
      ),
      // Peers by state
      (
        panel.new('Peers by state', 'timeseries') +
        panel.withDescription(|||
          Peers known to the node, broken out by state (including the local node itself).
        |||) +
        panel.withPosition({ x: 12, y: 9, w: 12, h: 8 }) +
        panel.withQueries([
          panel.newQuery(
            expr='cluster_node_peers{instance="$instance"}',
            legendFormat='{{state}}',
          ),
        ]) +
        panel.withUnit('suffix:nodes')
      ),
      // Gossip Transport row
      (
        panel.new('Gossip Transport', 'row') +
        panel.withPosition({ h: 1, w: 24, x: 0, y: 17 })
      ),
      // Transport bandwidth
      (
        panel.new('Transport bandwidth', 'timeseries') +
        panel.withPosition({
          h: 8,
          w: 8,
          x: 0,
          y: 18,
        }) +
        panel.withQueries([
          panel.newQuery(
            expr='rate(cluster_transport_rx_bytes_total{instance="$instance"}[$__rate_interval])',
            legendFormat='rx',
          ),
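          // tx is negated so transmit traffic plots below zero, mirroring rx on the centered axis set below.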
          panel.newQuery(
            expr='-1 * rate(cluster_transport_tx_bytes_total{instance="$instance"}[$__rate_interval])',
            legendFormat='tx',
          ),
        ]) +
        panel.withCenteredAxis() +
        panel.withUnit('Bps')
      ),
      // Packet write success rate
      (
        panel.new('Packet write success rate', 'timeseries') +
        panel.withPosition({
          h: 8,
          w: 8,
          x: 8,
          y: 18,
        }) +
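        // 1 - (failed / total) yields the fraction of successful packet writes; the
        // 'percentunit' unit below renders this 0-1 ratio as a percentage.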
        panel.withQueries([
          panel.newQuery(
            expr=|||
              1 - (
                rate(cluster_transport_tx_packets_failed_total{instance="$instance"}[$__rate_interval]) /
                rate(cluster_transport_tx_packets_total{instance="$instance"}[$__rate_interval])
              )
            |||,
            legendFormat='Tx success %',
          ),
          panel.newQuery(
            expr=|||
              1 - (
                rate(cluster_transport_rx_packets_failed_total{instance="$instance"}[$__rate_interval]) /
                rate(cluster_transport_rx_packets_total{instance="$instance"}[$__rate_interval])
              )
            |||,
            legendFormat='Rx success %',
          ),
        ]) +
        panel.withUnit('percentunit')
      ),
      // Pending packet queue
      (
        panel.new('Pending packet queue', 'timeseries') +
        panel.withDescription(|||
          The number of packets currently enqueued to be decoded (incoming) or encoded and sent (outgoing) while communicating with other nodes.

          Both queues should be as close to empty as possible; a growing queue means the Agent cannot keep up with the volume of messages needed to keep all nodes informed of cluster changes, and the cluster may not converge in a timely fashion.
        |||) +
        panel.withPosition({
          h: 8,
          w: 8,
          x: 16,
          y: 18,
        }) +
        panel.withQueries([
          panel.newQuery(
            expr='cluster_transport_tx_packet_queue_length{instance="$instance"}',
            legendFormat='tx queue',
          ),
          panel.newQuery(
            expr='cluster_transport_rx_packet_queue_length{instance="$instance"}',
            legendFormat='rx queue',
          ),
        ]) +
        panel.withUnit('pkts')
      ),
      // Stream bandwidth
      (
        panel.new('Stream bandwidth', 'timeseries') +
        panel.withPosition({
          h: 8,
          w: 8,
          x: 0,
          y: 26,
        }) +
        panel.withQueries([
          panel.newQuery(
            expr='rate(cluster_transport_stream_rx_bytes_total{instance="$instance"}[$__rate_interval])',
            legendFormat='rx',
          ),
          panel.newQuery(
            expr='-1 * rate(cluster_transport_stream_tx_bytes_total{instance="$instance"}[$__rate_interval])',
            legendFormat='tx',
          ),
        ]) +
        panel.withCenteredAxis() +
        panel.withUnit('Bps')
      ),
      // Stream write success rate
      (
        panel.new('Stream write success rate', 'timeseries') +
        panel.withPosition({
          h: 8,
          w: 8,
          x: 8,
          y: 26,
        }) +
        panel.withQueries([
          panel.newQuery(
            expr=|||
              1 - (
                rate(cluster_transport_stream_tx_packets_failed_total{instance="$instance"}[$__rate_interval]) /
                rate(cluster_transport_stream_tx_packets_total{instance="$instance"}[$__rate_interval])
              )
            |||,
            legendFormat='Tx success %'
          ),
          panel.newQuery(
            expr=|||
              1 - (
                rate(cluster_transport_stream_rx_packets_failed_total{instance="$instance"}[$__rate_interval]) /
                rate(cluster_transport_stream_rx_packets_total{instance="$instance"}[$__rate_interval])
              )
            |||,
            legendFormat='Rx success %'
          ),
        ]) +
        panel.withUnit('percentunit')
      ),
      // Open transport streams
      (
        panel.new('Open transport streams', 'timeseries') +
        panel.withDescription(|||
          The number of open connections from this node to its peers.

          Each node picks a subset of its peers and continuously gossips messages about cluster status to them over streaming HTTP/2 connections. This panel can be used to detect networking failures that disrupt cluster communication and cause convergence to take longer than expected, or to fail outright.
        |||) +
        panel.withPosition({
          h: 8,
          w: 8,
          x: 16,
          y: 26,
        }) +
        panel.withQueries([
          panel.newQuery(
            expr='cluster_transport_streams{instance="$instance"}',
            legendFormat='Open streams'
          ),
        ])
      ),
    ]),
}