Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
aos
GitHub Repository: aos/grafana-agent
Path: blob/main/example/k3d/scripts/smoke-test.bash
5283 views
1
#!/usr/bin/env bash
2
#
3
# Usage:
4
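# smoke-test.bash [-h] [-i] [-d] [-s] [-t <duration>]
#
# -h print usage and exit
# -i import local images into the k3d cluster
# -d delete the k3d cluster instead of running the test
# -s skip creating the k3d cluster
# -t test duration in seconds (default: 10800)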
5
#
6
# Dependencies:
7
# k3d >=3.0
8
# Tanka
9
# jq
10
#
11
# smoke-test.bash performs smoke tests that can be used to validate a release.
12
# It only validates correctness, and does not attempt to do significant load
13
# testing or performance benchmarks.
14
#
15
# It works by deploying a k3d cluster with two pairs of Agent deployments: one
16
# using the scraping service, and one using host filtering. Each Agent deployment
17
# monitors a unique set of correctness tools (Loki Canary, Tempo Vulture,
18
# Cortex text-exporter). These tools expose correctness metrics, generated by
19
# querying the backends that the Grafana Agents are configured to send
20
# telemetry data to.
21
#
22
# Grafana and Prometheus are deployed to the cluster and are responsible for
23
# internal monitoring. Prometheus is configured with a set of alert rules that serve
24
# as test cases for the smoke tests; alerts generated within the span of the testing
25
# period are treated as failures.
26
#
27
# After the smoke-test duration period (defaulting to 3h), alerts will
28
# be checked, and the script will end.
29
#
30
# The k3d cluster is kept alive after the test for analysis. To clean up assets created
31
# by the script, re-run the script with the -d flag.
32
33
# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

# Constants

# Top-level directory of the git checkout this script is run from. Assigned
# separately from `readonly` so a failing git command is not masked.
ROOT=$(git rev-parse --show-toplevel)
readonly ROOT
# Name of the k3d cluster created, used, and deleted by this script.
readonly K3D_CLUSTER_NAME="agent-smoke-test"

# Variables (changed by the command-line flags)

# Which function will be called
ENTRYPOINT="run"
# How long the environment is left running before results are collected,
# in seconds (-t).
TEST_DURATION="10800"
# Non-empty when local images should be imported into the cluster (-i).
IMPORT_IMAGES=""
# Non-empty when cluster creation should be skipped (-s).
SKIP_CREATE=""
47
# usage prints the command-line synopsis to stdout.
usage() {
  echo "Usage: $0 [-i] [-d] [-s] [-t <duration>]"
}

# parse_args applies the command-line flags to the script's settings.
#
# Arguments: the script's command-line arguments
# Globals written: ENTRYPOINT (-d), TEST_DURATION (-t), IMPORT_IMAGES (-i),
#                  SKIP_CREATE (-s)
# Exits 0 after printing usage for -h; exits 1 (usage on stderr) for an
# unknown flag or a flag that is missing its argument.
parse_args() {
  # OPTIND is reset locally so the function parses from the first argument
  # every time it is called.
  local opt OPTARG OPTIND=1

  while getopts "dt:ish" opt; do
    case "$opt" in
      d) ENTRYPOINT="cleanup" ;;
      t) TEST_DURATION="$OPTARG" ;;
      i) IMPORT_IMAGES="yes" ;;
      s) SKIP_CREATE="yes" ;;
      h)
        usage
        exit 0
        ;;
      *)
        usage >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
63
64
# run deploys the smoke-test environment, leaves it running for
# $TEST_DURATION, and then collects the results.
#
# Globals read: SKIP_CREATE, IMPORT_IMAGES, K3D_CLUSTER_NAME, ROOT,
#               TEST_DURATION
# Ends by calling get_results.
run() {
  # Every kubectl call is pinned to the smoke-test cluster so that it does not
  # depend on the current kubeconfig context (which is only switched when this
  # script creates the cluster itself).
  local kube_context="k3d-${K3D_CLUSTER_NAME}"

  if [[ -z "$SKIP_CREATE" ]]; then
    echo "--- Creating k3d cluster $K3D_CLUSTER_NAME"
    k3d cluster create "$K3D_CLUSTER_NAME" \
      --port 50080:80@loadbalancer \
      --api-port 50443 \
      --kubeconfig-update-default=true \
      --kubeconfig-switch-context=true \
      --wait >/dev/null
  fi

  # Give the cluster a little bit of time to settle before
  # applying the environment
  echo "--- Waiting for cluster to warm up"
  sleep 10

  if [[ -n "$IMPORT_IMAGES" ]]; then
    echo "--- Importing local images"

    k3d image import -c "$K3D_CLUSTER_NAME" \
      grafana/agent:main \
      grafana/agentctl:main \
      us.gcr.io/kubernetes-dev/grafana/agent-crow:main \
      us.gcr.io/kubernetes-dev/grafana/agent-smoke:main
  fi

  # Install the jsonnet dependencies in a subshell so the caller's working
  # directory is left unchanged, then apply the Tanka environment.
  (cd "$ROOT/example/k3d" && jb install)
  tk apply "$ROOT/example/k3d/smoke" --dangerous-auto-approve

  # Immediately create a job to sync configs so our two Agent deployments
  # are synced up as closely as possible.
  kubectl --context="$kube_context" --namespace=smoke \
    create job --from=cronjob/grafana-agent-syncer \
    grafana-agent-syncer-startup

  echo "Your environment is now running for the next $TEST_DURATION seconds."
  echo "Grafana URL: http://grafana.k3d.localhost:50080"
  echo "Prometheus URL: http://prometheus.k3d.localhost:50080"
  echo "Check smoke test logs: "
  echo " kubectl logs --namespace=smoke -f deployment/smoke-test"
  sleep "$TEST_DURATION"

  # Scale the smoke-test deployment down to zero replicas before the alerts
  # are inspected.
  kubectl --context="$kube_context" --namespace=smoke \
    scale --replicas=0 deployment/smoke-test

  echo "Smoke tests complete!"
  echo "Grafana URL: http://grafana.k3d.localhost:50080"
  echo "Prometheus URL: http://prometheus.k3d.localhost:50080"
  echo ""
  echo "Getting results..."

  get_results
}
117
118
# get_results asks Prometheus how many alerts fired during the test window
# and exits the script with the verdict: 0 when no alert fired, 1 when alerts
# fired or when the query could not be evaluated.
#
# Globals read: TEST_DURATION
get_results() {
  # TEST_DURATION is normally a bare number of seconds. Append an explicit
  # unit for the PromQL range selector; a value that already carries a unit
  # (e.g. "3h") is passed through untouched.
  local range="$TEST_DURATION"
  if [[ "$range" =~ ^[0-9]+$ ]]; then
    range="${range}s"
  fi

  local query="count_over_time(ALERTS{alertstate=\"firing\"}[${range}])"
  local response num_alerts

  if ! response=$(curl -s -G \
    -H "Host: prometheus.k3d.localhost" \
    --data-urlencode "query=${query}" \
    'http://localhost:50080/api/v1/query'); then
    echo "FAIL: unable to query Prometheus at http://localhost:50080" >&2
    exit 1
  fi

  # Only trust the count when Prometheus reports success: an error response
  # carries no .data.result, and its length would otherwise evaluate to 0 and
  # be reported as a passing run.
  if ! num_alerts=$(jq -r '
      if .status == "success"
      then (.data.result | length)
      else error(.error // "query failed")
      end' <<<"$response") || [[ ! "$num_alerts" =~ ^[0-9]+$ ]]; then
    echo "FAIL: unexpected response from Prometheus for query: $query" >&2
    exit 1
  fi

  if [[ "$num_alerts" != "0" ]]; then
    echo "FAIL: $num_alerts alerts found over the last $TEST_DURATION seconds."
    echo "More information: http://prometheus.k3d.localhost:50080/graph?g0.expr=count_over_time(ALERTS{alertstate%3D%22firing%22}[${range}])"

    exit 1
  else
    echo "PASS: 0 alerts found over the last $TEST_DURATION seconds. You're good to go!"

    exit 0
  fi
}
136
137
# cleanup deletes the k3d cluster named by $K3D_CLUSTER_NAME.
#
# Globals read: K3D_CLUSTER_NAME
cleanup() {
  echo "--- Deleting k3d cluster $K3D_CLUSTER_NAME"
  # k3d's stdout is discarded; its stderr is left visible.
  k3d cluster delete "$K3D_CLUSTER_NAME" >/dev/null
}
141
142
# Invoke the function selected by the command-line flags ("run" by default,
# "cleanup" with -d). Quoted so the value is used as a single command name.
"$ENTRYPOINT"
143
144