/*-1* Copyright (C) 1997-20032* Sony Computer Science Laboratories Inc. All rights reserved.3*4* Redistribution and use in source and binary forms, with or without5* modification, are permitted provided that the following conditions6* are met:7* 1. Redistributions of source code must retain the above copyright8* notice, this list of conditions and the following disclaimer.9* 2. Redistributions in binary form must reproduce the above copyright10* notice, this list of conditions and the following disclaimer in the11* documentation and/or other materials provided with the distribution.12*13* THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND14* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE15* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE16* ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE17* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL18* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS19* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)20* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT21* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY22* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF23* SUCH DAMAGE.24*25*/26/*-27* Copyright (c) 1990-1994 Regents of the University of California.28* All rights reserved.29*30* Redistribution and use in source and binary forms, with or without31* modification, are permitted provided that the following conditions32* are met:33* 1. Redistributions of source code must retain the above copyright34* notice, this list of conditions and the following disclaimer.35* 2. Redistributions in binary form must reproduce the above copyright36* notice, this list of conditions and the following disclaimer in the37* documentation and/or other materials provided with the distribution.38* 3. All advertising materials mentioning features or use of this software39* must display the following acknowledgement:40* This product includes software developed by the Computer Systems41* Engineering Group at Lawrence Berkeley Laboratory.42* 4. Neither the name of the University nor of the Laboratory may be used43* to endorse or promote products derived from this software without44* specific prior written permission.45*46* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND47* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE48* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE49* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE50* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL51* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS52* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)53* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT54* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY55* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF56* SUCH DAMAGE.57*58* $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $59*/6061#include "opt_altq.h"62#include "opt_inet.h"63#include "opt_inet6.h"64#ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */6566#include <sys/param.h>67#include <sys/malloc.h>68#include <sys/mbuf.h>69#include <sys/socket.h>70#include <sys/systm.h>71#include <sys/errno.h>72#if 1 /* ALTQ3_COMPAT */73#include <sys/sockio.h>74#include <sys/proc.h>75#include <sys/kernel.h>76#ifdef ALTQ_FLOWVALVE77#include <sys/queue.h>78#include <sys/time.h>79#endif80#endif /* ALTQ3_COMPAT */8182#include <net/if.h>83#include <net/if_var.h>8485#include <netinet/in.h>86#include <netinet/in_systm.h>87#include <netinet/ip.h>88#ifdef INET689#include <netinet/ip6.h>90#endif9192#include <netpfil/pf/pf.h>93#include <netpfil/pf/pf_altq.h>94#include <netpfil/pf/pf_mtag.h>95#include <net/altq/altq.h>96#include <net/altq/altq_red.h>9798/*99* ALTQ/RED (Random Early Detection) implementation using 32-bit100* fixed-point calculation.101*102* written by kjc using the ns code as a reference.103* you can learn more about red and ns from Sally's home page at104* http://www-nrg.ee.lbl.gov/floyd/105*106* most of the red parameter values are fixed in this implementation107* to prevent fixed-point overflow/underflow.108* if you change the parameters, watch out for overflow/underflow!109*110* the parameters used are recommended values by Sally.111* the corresponding ns config looks:112* q_weight=0.00195113* minthresh=5 maxthresh=15 queue-size=60114* linterm=30115* dropmech=drop-tail116* bytes=false (can't be handled by 32-bit fixed-point)117* doubleq=false dqthresh=false118* wait=true119*/120/*121* alternative red parameters for a slow link.122*123* assume the queue length becomes from zero to L and keeps L, it takes124* N packets for q_avg to reach 63% of L.125* when q_weight is 0.002, N is about 500 packets.126* for a slow link like dial-up, 500 packets takes more than 1 minute!127* when q_weight is 0.008, N is about 127 packets.128* when q_weight is 0.016, N is about 63 packets.129* bursts of 50 packets are allowed for 0.002, bursts of 25 packets130* are allowed for 0.016.131* see Sally's paper for more details.132*/133/* normal red parameters */134#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */135/* q_weight = 0.00195 */136137/* red parameters for a slow link */138#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */139/* q_weight = 0.0078125 */140141/* red parameters for a very slow link (e.g., dialup) */142#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */143/* q_weight = 0.015625 */144145/* fixed-point uses 12-bit decimal places */146#define FP_SHIFT 12 /* fixed-point shift */147148/* red parameters for drop probability */149#define INV_P_MAX 10 /* inverse of max drop probability */150#define TH_MIN 5 /* min threshold */151#define TH_MAX 15 /* max threshold */152153#define RED_LIMIT 60 /* default max queue length */154#define RED_STATS /* collect statistics */155156/*157* our default policy for forced-drop is drop-tail.158* (in altq-1.1.2 or earlier, the default was random-drop.159* but it makes more sense to punish the cause of the surge.)160* to switch to the random-drop policy, define "RED_RANDOM_DROP".161*/162163/* default red parameter values */164static int default_th_min = TH_MIN;165static int default_th_max = TH_MAX;166static int default_inv_pmax = INV_P_MAX;167168/*169* red support routines170*/171red_t *172red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,173int pkttime)174{175red_t *rp;176int w, i;177int npkts_per_sec;178179rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO);180if (rp == NULL)181return (NULL);182183if (weight == 0)184rp->red_weight = W_WEIGHT;185else186rp->red_weight = weight;187188/* allocate weight table */189rp->red_wtab = wtab_alloc(rp->red_weight);190if (rp->red_wtab == NULL) {191free(rp, M_DEVBUF);192return (NULL);193}194195rp->red_avg = 0;196rp->red_idle = 1;197198if (inv_pmax == 0)199rp->red_inv_pmax = default_inv_pmax;200else201rp->red_inv_pmax = inv_pmax;202if (th_min == 0)203rp->red_thmin = default_th_min;204else205rp->red_thmin = th_min;206if (th_max == 0)207rp->red_thmax = default_th_max;208else209rp->red_thmax = th_max;210211rp->red_flags = flags;212213if (pkttime == 0)214/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */215rp->red_pkttime = 800;216else217rp->red_pkttime = pkttime;218219if (weight == 0) {220/* when the link is very slow, adjust red parameters */221npkts_per_sec = 1000000 / rp->red_pkttime;222if (npkts_per_sec < 50) {223/* up to about 400Kbps */224rp->red_weight = W_WEIGHT_2;225} else if (npkts_per_sec < 300) {226/* up to about 2.4Mbps */227rp->red_weight = W_WEIGHT_1;228}229}230231/* calculate wshift. weight must be power of 2 */232w = rp->red_weight;233for (i = 0; w > 1; i++)234w = w >> 1;235rp->red_wshift = i;236w = 1 << rp->red_wshift;237if (w != rp->red_weight) {238printf("invalid weight value %d for red! use %d\n",239rp->red_weight, w);240rp->red_weight = w;241}242243/*244* thmin_s and thmax_s are scaled versions of th_min and th_max245* to be compared with avg.246*/247rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);248rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);249250/*251* precompute probability denominator252* probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point253*/254rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)255* rp->red_inv_pmax) << FP_SHIFT;256257microtime(&rp->red_last);258return (rp);259}260261void262red_destroy(red_t *rp)263{264wtab_destroy(rp->red_wtab);265free(rp, M_DEVBUF);266}267268void269red_getstats(red_t *rp, struct redstats *sp)270{271sp->q_avg = rp->red_avg >> rp->red_wshift;272sp->xmit_cnt = rp->red_stats.xmit_cnt;273sp->drop_cnt = rp->red_stats.drop_cnt;274sp->drop_forced = rp->red_stats.drop_forced;275sp->drop_unforced = rp->red_stats.drop_unforced;276sp->marked_packets = rp->red_stats.marked_packets;277}278279int280red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,281struct altq_pktattr *pktattr)282{283int avg, droptype;284int n;285286avg = rp->red_avg;287288/*289* if we were idle, we pretend that n packets arrived during290* the idle period.291*/292if (rp->red_idle) {293struct timeval now;294int t;295296rp->red_idle = 0;297microtime(&now);298t = (now.tv_sec - rp->red_last.tv_sec);299if (t > 60) {300/*301* being idle for more than 1 minute, set avg to zero.302* this prevents t from overflow.303*/304avg = 0;305} else {306t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);307n = t / rp->red_pkttime - 1;308309/* the following line does (avg = (1 - Wq)^n * avg) */310if (n > 0)311avg = (avg >> FP_SHIFT) *312pow_w(rp->red_wtab, n);313}314}315316/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */317avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);318rp->red_avg = avg; /* save the new value */319320/*321* red_count keeps a tally of arriving traffic that has not322* been dropped.323*/324rp->red_count++;325326/* see if we drop early */327droptype = DTYPE_NODROP;328if (avg >= rp->red_thmin_s && qlen(q) > 1) {329if (avg >= rp->red_thmax_s) {330/* avg >= th_max: forced drop */331droptype = DTYPE_FORCED;332} else if (rp->red_old == 0) {333/* first exceeds th_min */334rp->red_count = 1;335rp->red_old = 1;336} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,337rp->red_probd, rp->red_count)) {338/* mark or drop by red */339if ((rp->red_flags & REDF_ECN) &&340mark_ecn(m, pktattr, rp->red_flags)) {341/* successfully marked. do not drop. */342rp->red_count = 0;343#ifdef RED_STATS344rp->red_stats.marked_packets++;345#endif346} else {347/* unforced drop by red */348droptype = DTYPE_EARLY;349}350}351} else {352/* avg < th_min */353rp->red_old = 0;354}355356/*357* if the queue length hits the hard limit, it's a forced drop.358*/359if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))360droptype = DTYPE_FORCED;361362#ifdef RED_RANDOM_DROP363/* if successful or forced drop, enqueue this packet. */364if (droptype != DTYPE_EARLY)365_addq(q, m);366#else367/* if successful, enqueue this packet. */368if (droptype == DTYPE_NODROP)369_addq(q, m);370#endif371if (droptype != DTYPE_NODROP) {372if (droptype == DTYPE_EARLY) {373/* drop the incoming packet */374#ifdef RED_STATS375rp->red_stats.drop_unforced++;376#endif377} else {378/* forced drop, select a victim packet in the queue. */379#ifdef RED_RANDOM_DROP380m = _getq_random(q);381#endif382#ifdef RED_STATS383rp->red_stats.drop_forced++;384#endif385}386#ifdef RED_STATS387PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));388#endif389rp->red_count = 0;390m_freem(m);391return (-1);392}393/* successfully queued */394#ifdef RED_STATS395PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));396#endif397return (0);398}399400/*401* early-drop probability is calculated as follows:402* prob = p_max * (avg - th_min) / (th_max - th_min)403* prob_a = prob / (2 - count*prob)404* = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))405* here prob_a increases as successive undrop count increases.406* (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),407* becomes 1 when (count >= (2 / prob))).408*/409int410drop_early(int fp_len, int fp_probd, int count)411{412int d; /* denominator of drop-probability */413414d = fp_probd - count * fp_len;415if (d <= 0)416/* count exceeds the hard limit: drop or mark */417return (1);418419/*420* now the range of d is [1..600] in fixed-point. (when421* th_max-th_min=10 and p_max=1/30)422* drop probability = (avg - TH_MIN) / d423*/424425if ((arc4random() % d) < fp_len) {426/* drop or mark */427return (1);428}429/* no drop/mark */430return (0);431}432433/*434* try to mark CE bit to the packet.435* returns 1 if successfully marked, 0 otherwise.436*/437int438mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)439{440struct mbuf *m0;441struct pf_mtag *at;442void *hdr;443444at = pf_find_mtag(m);445if (at != NULL) {446hdr = at->hdr;447} else448return (0);449450/* verify that pattr_hdr is within the mbuf data */451for (m0 = m; m0 != NULL; m0 = m0->m_next)452if (((caddr_t)hdr >= m0->m_data) &&453((caddr_t)hdr < m0->m_data + m0->m_len))454break;455if (m0 == NULL) {456/* ick, tag info is stale */457return (0);458}459460switch (((struct ip *)hdr)->ip_v) {461case IPVERSION:462if (flags & REDF_ECN4) {463struct ip *ip = hdr;464u_int8_t otos;465int sum;466467if (ip->ip_v != 4)468return (0); /* version mismatch! */469470if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)471return (0); /* not-ECT */472if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)473return (1); /* already marked */474475/*476* ecn-capable but not marked,477* mark CE and update checksum478*/479otos = ip->ip_tos;480ip->ip_tos |= IPTOS_ECN_CE;481/*482* update checksum (from RFC1624)483* HC' = ~(~HC + ~m + m')484*/485sum = ~ntohs(ip->ip_sum) & 0xffff;486sum += (~otos & 0xffff) + ip->ip_tos;487sum = (sum >> 16) + (sum & 0xffff);488sum += (sum >> 16); /* add carry */489ip->ip_sum = htons(~sum & 0xffff);490return (1);491}492break;493#ifdef INET6494case (IPV6_VERSION >> 4):495if (flags & REDF_ECN6) {496struct ip6_hdr *ip6 = hdr;497u_int32_t flowlabel;498499flowlabel = ntohl(ip6->ip6_flow);500if ((flowlabel >> 28) != 6)501return (0); /* version mismatch! */502if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==503(IPTOS_ECN_NOTECT << 20))504return (0); /* not-ECT */505if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==506(IPTOS_ECN_CE << 20))507return (1); /* already marked */508/*509* ecn-capable but not marked, mark CE510*/511flowlabel |= (IPTOS_ECN_CE << 20);512ip6->ip6_flow = htonl(flowlabel);513return (1);514}515break;516#endif /* INET6 */517}518519/* not marked */520return (0);521}522523struct mbuf *524red_getq(red_t *rp, class_queue_t *q)525{526struct mbuf *m;527528if ((m = _getq(q)) == NULL) {529if (rp->red_idle == 0) {530rp->red_idle = 1;531microtime(&rp->red_last);532}533return NULL;534}535536rp->red_idle = 0;537return (m);538}539540/*541* helper routine to calibrate avg during idle.542* pow_w(wtab, n) returns (1 - Wq)^n in fixed-point543* here Wq = 1/weight and the code assumes Wq is close to zero.544*545* w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.546*/547static struct wtab *wtab_list = NULL; /* pointer to wtab list */548549struct wtab *550wtab_alloc(int weight)551{552struct wtab *w;553int i;554555for (w = wtab_list; w != NULL; w = w->w_next)556if (w->w_weight == weight) {557w->w_refcount++;558return (w);559}560561w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO);562if (w == NULL)563return (NULL);564w->w_weight = weight;565w->w_refcount = 1;566w->w_next = wtab_list;567wtab_list = w;568569/* initialize the weight table */570w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;571for (i = 1; i < 32; i++) {572w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;573if (w->w_tab[i] == 0 && w->w_param_max == 0)574w->w_param_max = 1 << i;575}576577return (w);578}579580int581wtab_destroy(struct wtab *w)582{583struct wtab *prev;584585if (--w->w_refcount > 0)586return (0);587588if (wtab_list == w)589wtab_list = w->w_next;590else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)591if (prev->w_next == w) {592prev->w_next = w->w_next;593break;594}595596free(w, M_DEVBUF);597return (0);598}599600int32_t601pow_w(struct wtab *w, int n)602{603int i, bit;604int32_t val;605606if (n >= w->w_param_max)607return (0);608609val = 1 << FP_SHIFT;610if (n <= 0)611return (val);612613bit = 1;614i = 0;615while (n) {616if (n & bit) {617val = (val * w->w_tab[i]) >> FP_SHIFT;618n &= ~bit;619}620i++;621bit <<= 1;622}623return (val);624}625626#endif /* ALTQ_RED */627628629