// SPDX-License-Identifier: GPL-2.01/* Copyright (c) 2019 Facebook2*3* This program is free software; you can redistribute it and/or4* modify it under the terms of version 2 of the GNU General Public5* License as published by the Free Software Foundation.6*7* Sample Host Bandwidth Manager (HBM) BPF program.8*9* A cgroup skb BPF egress program to limit cgroup output bandwidth.10* It uses a modified virtual token bucket queue to limit average11* egress bandwidth. The implementation uses credits instead of tokens.12* Negative credits imply that queueing would have happened (this is13* a virtual queue, so no queueing is done by it. However, queueing may14* occur at the actual qdisc (which is not used for rate limiting).15*16* This implementation uses 3 thresholds, one to start marking packets and17* the other two to drop packets:18* CREDIT19* - <--------------------------|------------------------> +20* | | | 021* | Large pkt |22* | drop thresh |23* Small pkt drop Mark threshold24* thresh25*26* The effect of marking depends on the type of packet:27* a) If the packet is ECN enabled and it is a TCP packet, then the packet28* is ECN marked.29* b) If the packet is a TCP packet, then we probabilistically call tcp_cwr30* to reduce the congestion window. The current implementation uses a linear31* distribution (0% probability at marking threshold, 100% probability32* at drop threshold).33* c) If the packet is not a TCP packet, then it is dropped.34*35* If the credit is below the drop threshold, the packet is dropped. If it36* is a TCP packet, then it also calls tcp_cwr since packets dropped by37* by a cgroup skb BPF program do not automatically trigger a call to38* tcp_cwr in the current kernel code.39*40* This BPF program actually uses 2 drop thresholds, one threshold41* for larger packets (>= 120 bytes) and another for smaller packets. This42* protects smaller packets such as SYNs, ACKs, etc.43*44* The default bandwidth limit is set at 1Gbps but this can be changed by45* a user program through a shared BPF map. In addition, by default this BPF46* program does not limit connections using loopback. This behavior can be47* overwritten by the user program. There is also an option to calculate48* some statistics, such as percent of packets marked or dropped, which49* the user program can access.50*51* A latter patch provides such a program (hbm.c)52*/5354#include "hbm_kern.h"5556SEC("cgroup_skb/egress")57int _hbm_out_cg(struct __sk_buff *skb)58{59struct hbm_pkt_info pkti;60int len = skb->len;61unsigned int queue_index = 0;62unsigned long long curtime;63int credit;64signed long long delta = 0, new_credit;65int max_credit = MAX_CREDIT;66bool congestion_flag = false;67bool drop_flag = false;68bool cwr_flag = false;69bool ecn_ce_flag = false;70struct hbm_vqueue *qdp;71struct hbm_queue_stats *qsp = NULL;72int rv = ALLOW_PKT;7374qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);75if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))76return ALLOW_PKT;7778hbm_get_pkt_info(skb, &pkti);7980// We may want to account for the length of headers in len81// calculation, like ETH header + overhead, specially if it82// is a gso packet. But I am not doing it right now.8384qdp = bpf_get_local_storage(&queue_state, 0);85if (!qdp)86return ALLOW_PKT;87else if (qdp->lasttime == 0)88hbm_init_vqueue(qdp, 1024);8990curtime = bpf_ktime_get_ns();9192// Begin critical section93bpf_spin_lock(&qdp->lock);94credit = qdp->credit;95delta = curtime - qdp->lasttime;96/* delta < 0 implies that another process with a curtime greater97* than ours beat us to the critical section and already added98* the new credit, so we should not add it ourselves99*/100if (delta > 0) {101qdp->lasttime = curtime;102new_credit = credit + CREDIT_PER_NS(delta, qdp->rate);103if (new_credit > MAX_CREDIT)104credit = MAX_CREDIT;105else106credit = new_credit;107}108credit -= len;109qdp->credit = credit;110bpf_spin_unlock(&qdp->lock);111// End critical section112113// Check if we should update rate114if (qsp != NULL && (qsp->rate * 128) != qdp->rate) {115qdp->rate = qsp->rate * 128;116bpf_printk("Updating rate: %d (1sec:%llu bits)\n",117(int)qdp->rate,118CREDIT_PER_NS(1000000000, qdp->rate) * 8);119}120121// Set flags (drop, congestion, cwr)122// Dropping => we are congested, so ignore congestion flag123if (credit < -DROP_THRESH ||124(len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) {125// Very congested, set drop packet126drop_flag = true;127if (pkti.ecn)128congestion_flag = true;129else if (pkti.is_tcp)130cwr_flag = true;131} else if (credit < 0) {132// Congested, set congestion flag133if (pkti.ecn || pkti.is_tcp) {134if (credit < -MARK_THRESH)135congestion_flag = true;136else137congestion_flag = false;138} else {139congestion_flag = true;140}141}142143if (congestion_flag) {144if (bpf_skb_ecn_set_ce(skb)) {145ecn_ce_flag = true;146} else {147if (pkti.is_tcp) {148unsigned int rand = bpf_get_prandom_u32();149150if (-credit >= MARK_THRESH +151(rand % MARK_REGION_SIZE)) {152// Do congestion control153cwr_flag = true;154}155} else if (len > LARGE_PKT_THRESH) {156// Problem if too many small packets?157drop_flag = true;158}159}160}161162if (qsp != NULL)163if (qsp->no_cn)164cwr_flag = false;165166hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,167cwr_flag, ecn_ce_flag, &pkti, credit);168169if (drop_flag) {170__sync_add_and_fetch(&(qdp->credit), len);171rv = DROP_PKT;172}173174if (cwr_flag)175rv |= 2;176return rv;177}178char _license[] SEC("license") = "GPL";179180181