// GitHub repository: aos/grafana-agent
// Path: production/grafana-agent-mixin/alerts.libsonnet
local config = import 'config.libsonnet';
local _config = config._config;
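
// NOTE: `_config.group_by_cluster` is assumed (based on config.libsonnet) to hold a
// comma-separated list of cluster-identifying labels, e.g. "cluster, namespace". It is
// interpolated into the `%(group_by_cluster)s` placeholders below via Jsonnet's `%`
// string-formatting operator, so a clause like `by (%(group_by_cluster)s, receiver)`
// would render as `by (cluster, namespace, receiver)` under that assumption.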

{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'grafana-agent-tracing',
        rules: [
          {
            alert: 'AgentTracingReceiverErrors',
            // TODO(@mapno): add recording rule for total spans
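            // Percentage of refused spans out of all (refused + accepted) spans per
            // receiver, with the internal otlp/lb receiver excluded; fires when the
            // error percentage stays above 10% for 15 minutes.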
            expr: |||
              100 * sum(rate(traces_receiver_refused_spans{receiver!="otlp/lb"}[1m])) by (%(group_by_cluster)s, receiver)
                /
              (sum(rate(traces_receiver_refused_spans{receiver!="otlp/lb"}[1m])) by (%(group_by_cluster)s, receiver) + sum(rate(traces_receiver_accepted_spans{receiver!="otlp/lb"}[1m])) by (%(group_by_cluster)s, receiver))
                > 10
            ||| % _config,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: |||
                Receiver {{ $labels.receiver }} is experiencing {{ printf "%.2f" $value }}% errors.
              |||,
            },
          },
          {
            alert: 'AgentTracingExporterErrors',
            // TODO(@mapno): add recording rule for total spans
            expr: |||
              100 * sum(rate(traces_exporter_send_failed_spans{exporter!="otlp"}[1m])) by (%(group_by_cluster)s, exporter)
                /
              (sum(rate(traces_exporter_send_failed_spans{exporter!="otlp"}[1m])) by (%(group_by_cluster)s, exporter) + sum(rate(traces_exporter_sent_spans{exporter!="otlp"}[1m])) by (%(group_by_cluster)s, exporter))
                > 10
            ||| % _config,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: |||
                Exporter {{ $labels.exporter }} is experiencing {{ printf "%.2f" $value }}% errors.
              |||,
            },
          },
          {
            alert: 'AgentTracingLoadBalancingErrors',
            expr: |||
              100 * sum(rate(traces_loadbalancer_backend_outcome{success="false"}[1m])) by (%(group_by_cluster)s)
                /
              sum(rate(traces_loadbalancer_backend_outcome{success="true"}[1m])) by (%(group_by_cluster)s)
                > 10
            ||| % _config,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: |||
                Load balancing is experiencing {{ printf "%.2f" $value }}% errors.
              |||,
            },
          },
        ],
      },
      {
        name: 'GrafanaAgentSmokeChecks',
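        // Health checks for the smoke-test environment: the grafana-agent-smoke-test
        // deployments in the agent-smoke-test namespace should stay up, stay stable,
        // and stay within their expected CPU and memory budgets.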
        rules: [
          {
            alert: 'GrafanaAgentDown',
            expr: |||
              up{
                namespace="agent-smoke-test",
                pod=~"grafana-agent-smoke-test-(0|cluster-0|cluster-1|cluster-2)",
              } == 0
            |||,
            'for': '5m',
            annotations: {
              summary: '{{ $labels.job }} is down',
            },
          },
          {
            alert: 'GrafanaAgentFlapping',
            expr: |||
              avg_over_time(up{
                namespace="agent-smoke-test",
                pod=~"grafana-agent-smoke-test-(0|cluster-0|cluster-1|cluster-2)",
              }[5m]) < 1
            |||,
            'for': '15m',
            annotations: {
              summary: '{{ $labels.job }} is flapping',
            },
          },

          // Checks that CPU usage doesn't go too high. The threshold was derived from internal usage,
          // where every 1,000 active series consumed roughly 0.0013441 CPU cores. To avoid noise at
          // very low load, the alert only fires when the pod has at least 1,000 active series.
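          //
          // Worked example (illustrative numbers only): with 10,000 active series, the
          // ceiling is 10 * 0.0013441 ≈ 0.0134 cores; CPU sustained above that for 1h
          // fires the alert.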
          {
            alert: 'GrafanaAgentCPUHigh',
            expr: |||
              (sum by (pod) (rate(container_cpu_usage_seconds_total{cluster=~".+", namespace=~"agent-smoke-test", container=~".+", pod="grafana-agent-smoke-test-cluster-2"}[5m]))
              /
              (sum by (pod) (agent_wal_storage_active_series{cluster=~".+", namespace=~"agent-smoke-test", container=~".+", pod="grafana-agent-smoke-test-cluster-2"}) / 1000)
              > 0.0013441)
              and
              sum by (pod) (agent_wal_storage_active_series{cluster=~".+", namespace=~"agent-smoke-test", container=~".+", pod="grafana-agent-smoke-test-cluster-2"}) > 1000
            |||,
            'for': '1h',
            annotations: {
              summary: '{{ $labels.pod }} is using more than 0.0013441 CPU per 1000 series over the last 5 minutes',
            },
          },

          // We assume roughly ~8KB per series. Check that each deployment
          // doesn't go too far above this.
          //
          // We aggregate the memory of the scraping service together since an individual
          // node with a really small number of active series will throw this metric off.
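          //
          // Worked example (illustrative numbers only): at 100,000 active series,
          // ~8KB/series implies roughly 800MB of heap in use; the alert fires once usage
          // exceeds 10KB/series, i.e. about 1GB in this example.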
          {
            alert: 'GrafanaAgentMemHigh',
            expr: |||
              sum without (pod, instance) (go_memstats_heap_inuse_bytes{job=~"agent-smoke-test/grafana-agent-smoke-test.*"}) /
              sum without (pod, instance, instance_group_name) (agent_wal_storage_active_series{job=~"agent-smoke-test/grafana-agent-smoke-test.*"}) / 1e3 > 10
            |||,
            'for': '1h',
            annotations: {
              summary: '{{ $labels.job }} has used more than 10KB per series for more than 1 hour',
            },
          },
          {
            alert: 'GrafanaAgentContainerRestarts',
            expr: |||
              sum by (pod) (rate(kube_pod_container_status_restarts_total{namespace="agent-smoke-test"}[10m])) > 0
            |||,
            annotations: {
              summary: '{{ $labels.pod }} has a high rate of container restarts',
            },
          },
        ],
      },
      {
        name: 'GrafanaAgentCrowChecks',
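        // Crow is assumed here to be the agent's metrics-correctness checker running
        // alongside the smoke test; these rules check that it is up, being scraped,
        // and reporting only successful sample results.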
        rules: [
          {
            alert: 'CrowDown',
            expr: |||
              up{job=~"agent-smoke-test/crow-.*"} == 0
            |||,
            'for': '5m',
            annotations: {
              summary: 'Crow {{ $labels.job }} is down.',
            },
          },
          {
            alert: 'CrowFlapping',
            expr: |||
              avg_over_time(up{job=~"agent-smoke-test/crow-.*"}[5m]) < 1
            |||,
            'for': '15m',
            annotations: {
              summary: 'Crow {{ $labels.job }} is flapping.',
            },
          },
          {
            alert: 'CrowNotScraped',
            expr: |||
              rate(crow_test_samples_total[5m]) == 0
            |||,
            'for': '15m',
            annotations: {
              summary: 'Crow {{ $labels.job }} is not being scraped.',
            },
          },
          {
            alert: 'CrowFailures',
            expr: |||
              (
                  rate(crow_test_sample_results_total{result="success"}[5m])
                  /
                  ignoring(result) sum without (result) (rate(crow_test_sample_results_total[5m]))
              )
              < 1
            |||,
            'for': '15m',
            annotations: {
              summary: 'Crow {{ $labels.job }} has had failures for at least 15m',
            },
          },
        ],
      },
      {
        name: 'VultureChecks',
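        // tempo-vulture is assumed here to be Tempo's trace-correctness checker running
        // alongside the smoke test; these rules check that it is up, being scraped, and
        // keeping its trace error ratio below 30%.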
        rules: [
          {
            alert: 'VultureDown',
            expr: |||
              up{job=~"agent-smoke-test/vulture"} == 0
            |||,
            'for': '5m',
            annotations: {
              summary: 'Vulture {{ $labels.job }} is down.',
            },
          },
          {
            alert: 'VultureFlapping',
            expr: |||
              avg_over_time(up{job=~"agent-smoke-test/vulture"}[5m]) < 1
            |||,
            'for': '15m',
            annotations: {
              summary: 'Vulture {{ $labels.job }} is flapping.',
            },
          },
          {
            alert: 'VultureNotScraped',
            expr: |||
              rate(tempo_vulture_trace_total[1m]) == 0
            |||,
            'for': '5m',
            annotations: {
              summary: 'Vulture {{ $labels.job }} is not being scraped.',
            },
          },
          {
            alert: 'VultureFailures',
            expr: |||
              (rate(tempo_vulture_error_total[5m]) / rate(tempo_vulture_trace_total[5m])) > 0.3
            |||,
            'for': '5m',
            annotations: {
              summary: 'Vulture {{ $labels.job }} has had failures for at least 5m',
            },
          },
        ],
      },
      {
        name: 'GrafanaAgentManagement',
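        // Each condition below is defined twice: once as a 'warning' at a lower
        // threshold and once as a 'critical' at a higher threshold (or, for reload
        // failures, a longer 'for' duration).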
        rules: [
          {
            alert: 'AgentManagementBadAPIRequests',
            expr: |||
              100 * sum(rate(agent_remote_config_fetches_total{status_code=~"(4|5).."}[10m])) by (%(group_by_cluster)s)
                /
              sum(rate(agent_remote_config_fetches_total[10m])) by (%(group_by_cluster)s)
                > 5
            ||| % _config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: |||
                Receiving HTTP {{ $labels.status_code }} errors from API in {{ printf "%.2f" $value }}% of cases.
              |||,
            },
          },
          {
            alert: 'AgentManagementBadAPIRequests',
            expr: |||
              100 * sum(rate(agent_remote_config_fetches_total{status_code=~"(4|5).."}[10m])) by (%(group_by_cluster)s)
                /
              sum(rate(agent_remote_config_fetches_total[10m])) by (%(group_by_cluster)s)
                > 10
            ||| % _config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                Receiving HTTP {{ $labels.status_code }} errors from API in {{ printf "%.2f" $value }}% of cases.
              |||,
            },
          },
          {
            alert: 'AgentManagementRequestFailures',
            expr: |||
              100 * sum(rate(agent_remote_config_fetch_errors_total[10m])) by (%(group_by_cluster)s)
                /
              sum(rate(agent_remote_config_fetches_total[10m])) by (%(group_by_cluster)s)
                > 5
            ||| % _config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: |||
                Failing to reach Agent Management API.
              |||,
            },
          },
          {
            alert: 'AgentManagementRequestFailures',
            expr: |||
              100 * sum(rate(agent_remote_config_fetch_errors_total[10m])) by (%(group_by_cluster)s)
                /
              sum(rate(agent_remote_config_fetches_total[10m])) by (%(group_by_cluster)s)
                > 10
            ||| % _config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                Failing to reach Agent Management API.
              |||,
            },
          },
          {
            alert: 'AgentManagementInvalidAPIResponses',
            expr: |||
              100 * sum(rate(agent_remote_config_invalid_total{reason=~".+"}[10m])) by (%(group_by_cluster)s)
                /
              sum(rate(agent_remote_config_fetches_total[10m])) by (%(group_by_cluster)s)
                > 5
            ||| % _config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: |||
                API is responding with {{ $labels.reason }} in {{ printf "%.2f" $value }}% of cases.
              |||,
            },
          },
          {
            alert: 'AgentManagementInvalidAPIResponses',
            expr: |||
              100 * sum(rate(agent_remote_config_invalid_total{reason=~".+"}[10m])) by (%(group_by_cluster)s)
                /
              sum(rate(agent_remote_config_fetches_total[10m])) by (%(group_by_cluster)s)
                > 10
            ||| % _config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                API is responding with {{ $labels.reason }} in {{ printf "%.2f" $value }}% of cases.
              |||,
            },
          },
          {
            alert: 'AgentManagementFailureToReload',
            expr: |||
              avg_over_time(agent_config_last_load_successful[10m]) < 0.9
            |||,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: |||
                Instance {{ $labels.instance }} failed to successfully reload the config.
              |||,
            },
          },
          {
            alert: 'AgentManagementFailureToReload',
            expr: |||
              avg_over_time(agent_config_last_load_successful[10m]) < 0.9
            |||,
            'for': '30m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                Instance {{ $labels.instance }} failed to successfully reload the config.
              |||,
            },
          },
        ],
      },
    ],
  },
}