Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/samples/bpf/hbm.c
25924 views
1
// SPDX-License-Identifier: GPL-2.0
2
/* Copyright (c) 2019 Facebook
3
*
4
* This program is free software; you can redistribute it and/or
5
* modify it under the terms of version 2 of the GNU General Public
6
* License as published by the Free Software Foundation.
7
*
8
* Example program for Host Bandwidth Management
9
*
10
* This program loads a cgroup skb BPF program to enforce cgroup output
11
* (egress) or input (ingress) bandwidth limits.
12
*
13
* USAGE: hbm [-d] [-l] [-n <id>] [--no_cn] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog]
14
* Where:
15
* -d Print BPF trace debug buffer
16
* -l Also limit flows doing loopback
17
* -n <#> To create cgroup \"/hbm#\" and attach prog
18
* Default is /hbm1
19
* --no_cn Do not return cn notifications
20
* -r <rate> Rate limit in Mbps
21
* -s Get HBM stats (marked, dropped, etc.)
22
* -t <time> Exit after specified seconds (default is 1)
23
* -w Work conserving flag. cgroup can increase its bandwidth
24
* beyond the rate limit specified while there is available
25
* bandwidth. Current implementation assumes there is only
26
* one NIC (eth0), but can be extended to support multiple NICs.
27
* Currently only supported for egress.
28
* -h Print this info
29
* prog BPF program file name. Name defaults to hbm_out_kern.o
30
*/
31
32
#define _GNU_SOURCE
33
34
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <limits.h>
#include <sys/time.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/unistd.h>
#include <linux/compiler.h>

#include <linux/bpf.h>
#include <bpf/bpf.h>
#include <getopt.h>

#include "cgroup_helpers.h"
#include "hbm.h"
#include "bpf_util.h"
#include <bpf/libbpf.h>
52
53
/* Run-time configuration, filled in from the command line by main(). */

/* Rates, in Mbps, and run time, in seconds. */
int minRate = 1000;	/* cgroup rate limit in Mbps */
int rate = 1000;	/* can grow if rate conserving is enabled */
int dur = 1;		/* passed to sleep() / bounds the -w loop */

/*
 * Direction selector: when false, run_bpf_prog() switches the program to
 * the ingress attach type.  Nothing in this file clears it.
 */
bool outFlag = true;

/* Feature switches; all start out disabled. */
bool stats_flag = false;		/* -s */
bool loopback_flag = false;		/* -l */
bool debugFlag = false;			/* -d */
bool work_conserving_flag = false;	/* -w */
bool no_cn_flag = false;		/* --no_cn */
bool edt_flag = false;			/* --edt */
63
64
/* Directory prefix under which the "trace_pipe" file is opened. */
#define TRACEFS "/sys/kernel/tracing/"

/* Helpers defined further down in this file. */
static void do_error(char *msg, bool errno_flag);
static void read_trace_pipe2(void);
static void Usage(void);

/* Handles set up by prog_load() and used by run_bpf_prog(). */
static struct bpf_object *obj;		/* opened/loaded BPF object */
static struct bpf_program *bpf_prog;	/* program found in "cgroup_skb/egress" */
static int queue_stats_fd;		/* fd of the "queue_stats" map */
73
74
static void read_trace_pipe2(void)
75
{
76
int trace_fd;
77
FILE *outf;
78
char *outFname = "hbm_out.log";
79
80
trace_fd = open(TRACEFS "trace_pipe", O_RDONLY, 0);
81
if (trace_fd < 0) {
82
printf("Error opening trace_pipe\n");
83
return;
84
}
85
86
// Future support of ingress
87
// if (!outFlag)
88
// outFname = "hbm_in.log";
89
outf = fopen(outFname, "w");
90
91
if (outf == NULL)
92
printf("Error creating %s\n", outFname);
93
94
while (1) {
95
static char buf[4097];
96
ssize_t sz;
97
98
sz = read(trace_fd, buf, sizeof(buf) - 1);
99
if (sz > 0) {
100
buf[sz] = 0;
101
puts(buf);
102
if (outf != NULL) {
103
fprintf(outf, "%s\n", buf);
104
fflush(outf);
105
}
106
}
107
}
108
}
109
110
/*
 * Report a fatal error on stdout and terminate the process with exit
 * status 1.
 *
 * @msg:        text printed after the "ERROR: " prefix
 * @errno_flag: when true, the current errno value is appended
 *
 * Does not return.
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag)
		printf("ERROR: %s\n", msg);
	else
		printf("ERROR: %s, errno: %d\n", msg, errno);

	exit(1);
}
118
119
static int prog_load(char *prog)
120
{
121
struct bpf_program *pos;
122
const char *sec_name;
123
124
obj = bpf_object__open_file(prog, NULL);
125
if (libbpf_get_error(obj)) {
126
printf("ERROR: opening BPF object file failed\n");
127
return 1;
128
}
129
130
/* load BPF program */
131
if (bpf_object__load(obj)) {
132
printf("ERROR: loading BPF object file failed\n");
133
goto err;
134
}
135
136
bpf_object__for_each_program(pos, obj) {
137
sec_name = bpf_program__section_name(pos);
138
if (sec_name && !strcmp(sec_name, "cgroup_skb/egress")) {
139
bpf_prog = pos;
140
break;
141
}
142
}
143
if (!bpf_prog) {
144
printf("ERROR: finding a prog in obj file failed\n");
145
goto err;
146
}
147
148
queue_stats_fd = bpf_object__find_map_fd_by_name(obj, "queue_stats");
149
if (queue_stats_fd < 0) {
150
printf("ERROR: finding a map in obj file failed\n");
151
goto err;
152
}
153
154
return 0;
155
156
err:
157
bpf_object__close(obj);
158
return 1;
159
}
160
161
static int run_bpf_prog(char *prog, int cg_id)
162
{
163
struct hbm_queue_stats qstats = {0};
164
char cg_dir[100], cg_pin_path[100];
165
struct bpf_link *link = NULL;
166
int key = 0;
167
int cg1 = 0;
168
int rc = 0;
169
170
sprintf(cg_dir, "/hbm%d", cg_id);
171
rc = prog_load(prog);
172
if (rc != 0)
173
return rc;
174
175
if (setup_cgroup_environment()) {
176
printf("ERROR: setting cgroup environment\n");
177
goto err;
178
}
179
cg1 = create_and_get_cgroup(cg_dir);
180
if (!cg1) {
181
printf("ERROR: create_and_get_cgroup\n");
182
goto err;
183
}
184
if (join_cgroup(cg_dir)) {
185
printf("ERROR: join_cgroup\n");
186
goto err;
187
}
188
189
qstats.rate = rate;
190
qstats.stats = stats_flag ? 1 : 0;
191
qstats.loopback = loopback_flag ? 1 : 0;
192
qstats.no_cn = no_cn_flag ? 1 : 0;
193
if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) {
194
printf("ERROR: Could not update map element\n");
195
goto err;
196
}
197
198
if (!outFlag)
199
bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS);
200
201
link = bpf_program__attach_cgroup(bpf_prog, cg1);
202
if (libbpf_get_error(link)) {
203
fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n");
204
goto err;
205
}
206
207
sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id);
208
rc = bpf_link__pin(link, cg_pin_path);
209
if (rc < 0) {
210
printf("ERROR: bpf_link__pin failed: %d\n", rc);
211
goto err;
212
}
213
214
if (work_conserving_flag) {
215
struct timeval t0, t_last, t_new;
216
FILE *fin;
217
unsigned long long last_eth_tx_bytes, new_eth_tx_bytes;
218
signed long long last_cg_tx_bytes, new_cg_tx_bytes;
219
signed long long delta_time, delta_bytes, delta_rate;
220
int delta_ms;
221
#define DELTA_RATE_CHECK 10000 /* in us */
222
#define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */
223
224
bpf_map_lookup_elem(queue_stats_fd, &key, &qstats);
225
if (gettimeofday(&t0, NULL) < 0)
226
do_error("gettimeofday failed", true);
227
t_last = t0;
228
fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r");
229
if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1)
230
do_error("fscanf fails", false);
231
fclose(fin);
232
last_cg_tx_bytes = qstats.bytes_total;
233
while (true) {
234
usleep(DELTA_RATE_CHECK);
235
if (gettimeofday(&t_new, NULL) < 0)
236
do_error("gettimeofday failed", true);
237
delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 +
238
(t_new.tv_usec - t0.tv_usec)/1000;
239
if (delta_ms > dur * 1000)
240
break;
241
delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 +
242
(t_new.tv_usec - t_last.tv_usec);
243
if (delta_time == 0)
244
continue;
245
t_last = t_new;
246
fin = fopen("/sys/class/net/eth0/statistics/tx_bytes",
247
"r");
248
if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1)
249
do_error("fscanf fails", false);
250
fclose(fin);
251
printf(" new_eth_tx_bytes:%llu\n",
252
new_eth_tx_bytes);
253
bpf_map_lookup_elem(queue_stats_fd, &key, &qstats);
254
new_cg_tx_bytes = qstats.bytes_total;
255
delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes;
256
last_eth_tx_bytes = new_eth_tx_bytes;
257
delta_rate = (delta_bytes * 8000000) / delta_time;
258
printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps",
259
delta_ms, delta_rate/1000000000.0,
260
rate/1000.0);
261
if (delta_rate < RATE_THRESHOLD) {
262
/* can increase cgroup rate limit, but first
263
* check if we are using the current limit.
264
* Currently increasing by 6.25%, unknown
265
* if that is the optimal rate.
266
*/
267
int rate_diff100;
268
269
delta_bytes = new_cg_tx_bytes -
270
last_cg_tx_bytes;
271
last_cg_tx_bytes = new_cg_tx_bytes;
272
delta_rate = (delta_bytes * 8000000) /
273
delta_time;
274
printf(" rate:%.3fGbps",
275
delta_rate/1000000000.0);
276
rate_diff100 = (((long long)rate)*1000000 -
277
delta_rate) * 100 /
278
(((long long) rate) * 1000000);
279
printf(" rdiff:%d", rate_diff100);
280
if (rate_diff100 <= 3) {
281
rate += (rate >> 4);
282
if (rate > RATE_THRESHOLD / 1000000)
283
rate = RATE_THRESHOLD / 1000000;
284
qstats.rate = rate;
285
printf(" INC\n");
286
} else {
287
printf("\n");
288
}
289
} else {
290
/* Need to decrease cgroup rate limit.
291
* Currently decreasing by 12.5%, unknown
292
* if that is optimal
293
*/
294
printf(" DEC\n");
295
rate -= (rate >> 3);
296
if (rate < minRate)
297
rate = minRate;
298
qstats.rate = rate;
299
}
300
if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY))
301
do_error("update map element fails", false);
302
}
303
} else {
304
sleep(dur);
305
}
306
// Get stats!
307
if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats)) {
308
char fname[100];
309
FILE *fout;
310
311
if (!outFlag)
312
sprintf(fname, "hbm.%d.in", cg_id);
313
else
314
sprintf(fname, "hbm.%d.out", cg_id);
315
fout = fopen(fname, "w");
316
fprintf(fout, "id:%d\n", cg_id);
317
fprintf(fout, "ERROR: Could not lookup queue_stats\n");
318
fclose(fout);
319
} else if (stats_flag && qstats.lastPacketTime >
320
qstats.firstPacketTime) {
321
long long delta_us = (qstats.lastPacketTime -
322
qstats.firstPacketTime)/1000;
323
unsigned int rate_mbps = ((qstats.bytes_total -
324
qstats.bytes_dropped) * 8 /
325
delta_us);
326
double percent_pkts, percent_bytes;
327
char fname[100];
328
FILE *fout;
329
int k;
330
static const char *returnValNames[] = {
331
"DROP_PKT",
332
"ALLOW_PKT",
333
"DROP_PKT_CWR",
334
"ALLOW_PKT_CWR"
335
};
336
#define RET_VAL_COUNT 4
337
338
// Future support of ingress
339
// if (!outFlag)
340
// sprintf(fname, "hbm.%d.in", cg_id);
341
// else
342
sprintf(fname, "hbm.%d.out", cg_id);
343
fout = fopen(fname, "w");
344
fprintf(fout, "id:%d\n", cg_id);
345
fprintf(fout, "rate_mbps:%d\n", rate_mbps);
346
fprintf(fout, "duration:%.1f secs\n",
347
(qstats.lastPacketTime - qstats.firstPacketTime) /
348
1000000000.0);
349
fprintf(fout, "packets:%d\n", (int)qstats.pkts_total);
350
fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total /
351
1000000));
352
fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped);
353
fprintf(fout, "bytes_dropped_MB:%d\n",
354
(int)(qstats.bytes_dropped /
355
1000000));
356
// Marked Pkts and Bytes
357
percent_pkts = (qstats.pkts_marked * 100.0) /
358
(qstats.pkts_total + 1);
359
percent_bytes = (qstats.bytes_marked * 100.0) /
360
(qstats.bytes_total + 1);
361
fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts);
362
fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes);
363
364
// Dropped Pkts and Bytes
365
percent_pkts = (qstats.pkts_dropped * 100.0) /
366
(qstats.pkts_total + 1);
367
percent_bytes = (qstats.bytes_dropped * 100.0) /
368
(qstats.bytes_total + 1);
369
fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
370
fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
371
372
// ECN CE markings
373
percent_pkts = (qstats.pkts_ecn_ce * 100.0) /
374
(qstats.pkts_total + 1);
375
fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts,
376
(int)qstats.pkts_ecn_ce);
377
378
// Average cwnd
379
fprintf(fout, "avg cwnd:%d\n",
380
(int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1)));
381
// Average rtt
382
fprintf(fout, "avg rtt:%d\n",
383
(int)(qstats.sum_rtt / (qstats.pkts_total + 1)));
384
// Average credit
385
if (edt_flag)
386
fprintf(fout, "avg credit_ms:%.03f\n",
387
(qstats.sum_credit /
388
(qstats.pkts_total + 1.0)) / 1000000.0);
389
else
390
fprintf(fout, "avg credit:%d\n",
391
(int)(qstats.sum_credit /
392
(1500 * ((int)qstats.pkts_total ) + 1)));
393
394
// Return values stats
395
for (k = 0; k < RET_VAL_COUNT; k++) {
396
percent_pkts = (qstats.returnValCount[k] * 100.0) /
397
(qstats.pkts_total + 1);
398
fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k],
399
percent_pkts, (int)qstats.returnValCount[k]);
400
}
401
fclose(fout);
402
}
403
404
if (debugFlag)
405
read_trace_pipe2();
406
goto cleanup;
407
408
err:
409
rc = 1;
410
411
cleanup:
412
bpf_link__destroy(link);
413
bpf_object__close(obj);
414
415
if (cg1 != -1)
416
close(cg1);
417
418
if (rc != 0)
419
cleanup_cgroup_environment();
420
return rc;
421
}
422
423
/*
 * Print the command-line help text to stdout.
 *
 * The documented default for -t matches the initial value of `dur` (1),
 * and the synopsis lists --edt, which main() accepts.
 */
static void Usage(void)
{
	printf("This program loads a cgroup skb BPF program to enforce\n"
	       "cgroup output (egress) bandwidth limits.\n\n"
	       "USAGE: hbm [-o] [-d] [--edt] [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
	       " [-s] [-t <secs>] [-w] [-h] [prog]\n"
	       " Where:\n"
	       " -o indicates egress direction (default)\n"
	       " -d print BPF trace debug buffer\n"
	       " --edt use fq's Earliest Departure Time\n"
	       " -l also limit flows using loopback\n"
	       " -n <#> to create cgroup \"/hbm#\" and attach prog\n"
	       " Default is /hbm1\n"
	       " --no_cn disable CN notifications\n"
	       " -r <rate> Rate in Mbps\n"
	       " -s Update HBM stats\n"
	       " -t <time> Exit after specified seconds (default is 1)\n"
	       " -w Work conserving flag. cgroup can increase\n"
	       " bandwidth beyond the rate limit specified\n"
	       " while there is available bandwidth. Current\n"
	       " implementation assumes there is only eth0\n"
	       " but can be extended to support multiple NICs\n"
	       " -h print this info\n"
	       " prog BPF program file name. Name defaults to\n"
	       " hbm_out_kern.o\n");
}
449
450
int main(int argc, char **argv)
451
{
452
char *prog = "hbm_out_kern.o";
453
int k;
454
int cg_id = 1;
455
char *optstring = "iodln:r:st:wh";
456
struct option loptions[] = {
457
{"no_cn", 0, NULL, 1},
458
{"edt", 0, NULL, 2},
459
{NULL, 0, NULL, 0}
460
};
461
462
while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) {
463
switch (k) {
464
case 1:
465
no_cn_flag = true;
466
break;
467
case 2:
468
prog = "hbm_edt_kern.o";
469
edt_flag = true;
470
break;
471
case'o':
472
break;
473
case 'd':
474
debugFlag = true;
475
break;
476
case 'l':
477
loopback_flag = true;
478
break;
479
case 'n':
480
cg_id = atoi(optarg);
481
break;
482
case 'r':
483
minRate = atoi(optarg) * 1.024;
484
rate = minRate;
485
break;
486
case 's':
487
stats_flag = true;
488
break;
489
case 't':
490
dur = atoi(optarg);
491
break;
492
case 'w':
493
work_conserving_flag = true;
494
break;
495
case '?':
496
if (optopt == 'n' || optopt == 'r' || optopt == 't')
497
fprintf(stderr,
498
"Option -%c requires an argument.\n\n",
499
optopt);
500
case 'h':
501
default:
502
Usage();
503
return 0;
504
}
505
}
506
507
if (optind < argc)
508
prog = argv[optind];
509
printf("HBM prog: %s\n", prog != NULL ? prog : "NULL");
510
511
/* Use libbpf 1.0 API mode */
512
libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
513
514
return run_bpf_prog(prog, cg_id);
515
}
516
517