Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
aos
GitHub Repository: aos/grafana-agent
Path: blob/main/example/k3d/smoke/monitoring/prometheus_monitoring.libsonnet
5340 views
local agent_prometheus = import 'grafana-agent/v1/lib/metrics.libsonnet';

{
  // Metrics configuration: a single global scrape interval plus the
  // scrape configs exposed by `scrapeInstanceKubernetes` in the imported
  // grafana-agent metrics library.
  config: {
    local kubernetes = agent_prometheus.scrapeInstanceKubernetes,

    scrape_configs: kubernetes.scrape_configs,
    global: { scrape_interval: '1m' },
  },

  rules: {
    groups: [
      // Alerts about the Grafana Agent pods in the "smoke" namespace:
      // presence, liveness, flapping, CPU usage and heap memory per series.
      {
        name: 'GrafanaAgentChecks',
        rules: [
          // Basic sanity checks: ensure that Agents exist, are up,
          // and haven't been flapping.
          {
            // absent() yields a series carrying the equality-matcher labels,
            // so $labels.pod in the summary names the missing pod.
            alert: 'GrafanaAgentMissing',
            expr: |||
              absent(up{ namespace="smoke", pod="grafana-agent-0" })         == 1 or
              absent(up{ namespace="smoke", pod="grafana-agent-cluster-0" }) == 1 or
              absent(up{ namespace="smoke", pod="grafana-agent-cluster-1" }) == 1 or
              absent(up{ namespace="smoke", pod="grafana-agent-cluster-2" }) == 1
            |||,
            'for': '5m',
            annotations: {
              summary: '{{ $labels.pod }} is not running.',
            },
          },
          {
            // The target exists but its scrape is failing (up == 0).
            alert: 'GrafanaAgentDown',
            expr: |||
              up{
                namespace="smoke",
                pod=~"grafana-agent-(0|cluster-0|cluster-1|cluster-2)",
              } == 0
            |||,
            'for': '5m',
            annotations: {
              summary: '{{ $labels.job }} is down',
            },
          },
          {
            // An average of `up` below 1 over 5m means at least one scrape
            // in the window failed; sustained for 15m this counts as flapping.
            alert: 'GrafanaAgentFlapping',
            expr: |||
              avg_over_time(up{
                namespace="smoke",
                pod=~"grafana-agent-(0|cluster-0|cluster-1|cluster-2)",
              }[5m]) < 1
            |||,
            'for': '15m',
            annotations: {
              summary: '{{ $labels.job }} is flapping',
            },
          },

          // Checks that the CPU usage doesn't go too high. This was generated
          // from main where the CPU usage hovered around 2-3% per pod.
          //
          // The rate window is 5m rather than 1m: rate() needs at least two
          // samples inside the window, and with the 1m global scrape_interval
          // set in this file a 1m window rarely contains two, so the
          // expression would return no data and the alert could never fire.
          // (Assumes these series are scraped at that global interval.)
          //
          // TODO: something less guessworky here.
          {
            alert: 'GrafanaAgentCPUHigh',
            expr: |||
              rate(container_cpu_usage_seconds_total{namespace="smoke", pod=~"grafana-agent-.*"}[5m]) > 0.05
            |||,
            'for': '5m',
            annotations: {
              summary: '{{ $labels.pod }} is using more than 5% CPU over the last 5 minutes',
            },
          },

          // We assume roughly ~8KB per series. Check that each deployment
          // doesn't go too far above this.
          //
          // We aggregate the memory of the scraping service together since an individual
          // node with a really small number of active series will throw this metric off.
          //
          // The expression is (heap bytes / active series) / 1e3, i.e. KB per
          // series, compared against a threshold of 10.
          {
            alert: 'GrafanaAgentMemHigh',
            expr: |||
              sum without (pod, instance) (go_memstats_heap_inuse_bytes{job=~"smoke/grafana-agent.*"}) /
              sum without (pod, instance, instance_group_name) (agent_wal_storage_active_series{job=~"smoke/grafana-agent.*"}) / 1e3 > 10
            |||,
            'for': '5m',
            annotations: {
              summary: '{{ $labels.job }} has used more than 10KB per series for more than 5 minutes',
            },
          },
        ],
      },
      // Alerts about the Crow test containers (crow-single / crow-cluster):
      // presence, liveness, flapping, sample production and test failures.
      //
      // All rate() windows are 5m: rate() needs at least two samples inside
      // the window, and with the 1m global scrape_interval set in this file
      // a 1m window rarely contains two, which would make the expressions
      // return no data. (Assumes these series are scraped at that interval.)
      {
        name: 'CrowChecks',
        rules: [
          {
            // absent() keeps the `container` equality matcher as a label,
            // so the summary can name the missing container.
            alert: 'CrowMissing',
            expr: |||
              absent(up{container="crow-single"})  == 1 or
              absent(up{container="crow-cluster"}) == 1
            |||,
            'for': '5m',
            annotations: {
              summary: '{{ $labels.container }} is not running.',
            },
          },
          {
            alert: 'CrowDown',
            expr: |||
              up{job=~"smoke/crow-.*"} == 0
            |||,
            'for': '5m',
            annotations: {
              summary: 'Crow {{ $labels.job }} is down.',
            },
          },
          {
            // Any failed scrape within 5m pulls the average of `up` below 1.
            alert: 'CrowFlapping',
            expr: |||
              avg_over_time(up{job=~"smoke/crow-.*"}[5m]) < 1
            |||,
            'for': '15m',
            annotations: {
              summary: 'Crow {{ $labels.job }} is flapping.',
            },
          },
          {
            // The test-sample counter has stopped increasing.
            alert: 'CrowNotScraped',
            expr: |||
              rate(crow_test_samples_total[5m]) == 0
            |||,
            'for': '5m',
            annotations: {
              summary: 'Crow {{ $labels.job }} is not being scraped.',
            },
          },
          {
            // Success ratio: successful results divided by all test samples.
            // `ignoring(result)` lets the two sides match even though only
            // the numerator carries the `result` label. Both sides use the
            // same window so the ratio is computed over the same period.
            alert: 'CrowFailures',
            expr: |||
              (
                rate(crow_test_sample_results_total{result="success"}[5m])
                / ignoring(result) rate(crow_test_samples_total[5m])
              ) < 1
            |||,
            'for': '5m',
            annotations: {
              summary: 'Crow {{ $labels.job }} has had failures for at least 5m',
            },
          },
        ],
      },
      // Alerts about the Vulture container: presence, liveness, flapping,
      // trace production and error ratio.
      {
        name: 'VultureChecks',
        rules: [
          {
            // absent() keeps the `container` equality matcher as a label,
            // so the summary can name the missing container.
            alert: 'VultureMissing',
            expr: |||
              absent(up{container="vulture"})  == 1
            |||,
            'for': '5m',
            annotations: {
              summary: '{{ $labels.container }} is not running.',
            },
          },
          {
            alert: 'VultureDown',
            expr: |||
              up{job=~"smoke/vulture"} == 0
            |||,
            'for': '5m',
            annotations: {
              summary: 'Vulture {{ $labels.job }} is down.',
            },
          },
          {
            // Any failed scrape within 5m pulls the average of `up` below 1.
            alert: 'VultureFlapping',
            expr: |||
              avg_over_time(up{job=~"smoke/vulture"}[5m]) < 1
            |||,
            'for': '15m',
            annotations: {
              summary: 'Vulture {{ $labels.job }} is flapping.',
            },
          },
          {
            // The trace counter has stopped increasing.
            //
            // The window is 5m (matching VultureFailures below) rather than
            // 1m: rate() needs at least two samples inside the window, and
            // with the 1m global scrape_interval set in this file a 1m window
            // rarely contains two, so the expression would return no data.
            // (Assumes this series is scraped at that global interval.)
            alert: 'VultureNotScraped',
            expr: |||
              rate(tempo_vulture_trace_total[5m]) == 0
            |||,
            'for': '5m',
            annotations: {
              summary: 'Vulture {{ $labels.job }} is not being scraped.',
            },
          },
          {
            // Error ratio: errors divided by traces over 5m, alerting
            // above 0.3.
            alert: 'VultureFailures',
            expr: |||
              (rate(tempo_vulture_error_total[5m]) / rate(tempo_vulture_trace_total[5m])) > 0.3 
            |||,
            'for': '5m',
            annotations: {
              summary: 'Vulture {{ $labels.job }} has had failures for at least 5m',
            },
          },
        ],
      },
      // Alerts about the Loki canary container: presence, liveness, entry
      // production, and missing or unexpected log entries.
      {
        name: 'CanaryChecks',
        rules: [
          {
            // absent() keeps the `container` equality matcher as a label,
            // so the summary can name the missing container.
            alert: 'CanaryMissing',
            expr: |||
              absent(up{container="loki-canary"})  == 1
            |||,
            'for': '5m',
            annotations: {
              summary: '{{ $labels.container }} is not running.',
            },
          },
          {
            alert: 'CanaryDown',
            expr: |||
              up{job=~"smoke/loki-canary"} == 0
            |||,
            'for': '5m',
            annotations: {
              // Stray leading space removed from the message.
              summary: 'Canary is down.',
            },
          },
          {
            // The entries counter has stopped increasing.
            //
            // The window is 5m rather than 1m: rate() needs at least two
            // samples inside the window, and with the 1m global
            // scrape_interval set in this file a 1m window rarely contains
            // two, so the expression would return no data.
            // (Assumes this series is scraped at that global interval.)
            alert: 'CanaryNotScraped',
            expr: |||
              rate(loki_canary_entries_total[5m]) == 0
            |||,
            'for': '5m',
            annotations: {
              summary: 'Canary is not being scraped.',
            },
          },
          // The four alerts below fire when the corresponding canary counter
          // increases at all.
          //
          // NOTE(review): at the 1m global scrape_interval a 2m window holds
          // only about two samples; confirm this is robust enough, or widen
          // the windows (and adjust the summaries) accordingly.
          {
            alert: 'CanaryMissingEntries',
            expr: |||
              (rate(loki_canary_missing_entries_total[2m])) > 0 
            |||,
            'for': '2m',
            annotations: {
              summary: 'Canary has had missing entries for at least 2m',
            },
          },
          {
            alert: 'CanarySpotChecksMissingEntries',
            expr: |||
              (rate(loki_canary_spot_check_missing_entries_total[2m])) > 0 
            |||,
            'for': '2m',
            annotations: {
              summary: 'Canary has had missing spot check entries for at least 2m',
            },
          },
          {
            alert: 'CanaryWebsocketMissingEntries',
            expr: |||
              (rate(loki_canary_websocket_missing_entries_total[2m])) > 0 
            |||,
            'for': '2m',
            annotations: {
              summary: 'Canary has had missing websocket entries for at least 2m',
            },
          },
          {
            alert: 'CanaryUnexpectedEntries',
            expr: |||
              (rate(loki_canary_unexpected_entries_total[2m])) > 0 
            |||,
            'for': '2m',
            annotations: {
              summary: 'Canary has had unexpected entries for at least 2m',
            },
          },
        ],
      },
    ],
  },
}