Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/net/altq/altq_red.c
39507 views
1
/*-
2
* Copyright (C) 1997-2003
3
* Sony Computer Science Laboratories Inc. All rights reserved.
4
*
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
7
* are met:
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
13
*
14
* THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
15
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17
* ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
18
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24
* SUCH DAMAGE.
25
*
26
*/
27
/*-
28
* Copyright (c) 1990-1994 Regents of the University of California.
29
* All rights reserved.
30
*
31
* Redistribution and use in source and binary forms, with or without
32
* modification, are permitted provided that the following conditions
33
* are met:
34
* 1. Redistributions of source code must retain the above copyright
35
* notice, this list of conditions and the following disclaimer.
36
* 2. Redistributions in binary form must reproduce the above copyright
37
* notice, this list of conditions and the following disclaimer in the
38
* documentation and/or other materials provided with the distribution.
39
* 3. All advertising materials mentioning features or use of this software
40
* must display the following acknowledgement:
41
* This product includes software developed by the Computer Systems
42
* Engineering Group at Lawrence Berkeley Laboratory.
43
* 4. Neither the name of the University nor of the Laboratory may be used
44
* to endorse or promote products derived from this software without
45
* specific prior written permission.
46
*
47
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57
* SUCH DAMAGE.
58
*
59
* $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $
60
*/
61
62
#include "opt_altq.h"
63
#include "opt_inet.h"
64
#include "opt_inet6.h"
65
#ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */
66
67
#include <sys/param.h>
68
#include <sys/malloc.h>
69
#include <sys/mbuf.h>
70
#include <sys/socket.h>
71
#include <sys/systm.h>
72
#include <sys/errno.h>
73
#if 1 /* ALTQ3_COMPAT */
74
#include <sys/sockio.h>
75
#include <sys/proc.h>
76
#include <sys/kernel.h>
77
#ifdef ALTQ_FLOWVALVE
78
#include <sys/queue.h>
79
#include <sys/time.h>
80
#endif
81
#endif /* ALTQ3_COMPAT */
82
83
#include <net/if.h>
84
#include <net/if_var.h>
85
86
#include <netinet/in.h>
87
#include <netinet/in_systm.h>
88
#include <netinet/ip.h>
89
#ifdef INET6
90
#include <netinet/ip6.h>
91
#endif
92
93
#include <netpfil/pf/pf.h>
94
#include <netpfil/pf/pf_altq.h>
95
#include <netpfil/pf/pf_mtag.h>
96
#include <net/altq/altq.h>
97
#include <net/altq/altq_red.h>
98
99
/*
100
* ALTQ/RED (Random Early Detection) implementation using 32-bit
101
* fixed-point calculation.
102
*
103
* written by kjc using the ns code as a reference.
104
* you can learn more about red and ns from Sally's home page at
105
* http://www-nrg.ee.lbl.gov/floyd/
106
*
107
* most of the red parameter values are fixed in this implementation
108
* to prevent fixed-point overflow/underflow.
109
* if you change the parameters, watch out for overflow/underflow!
110
*
111
* the parameters used are recommended values by Sally.
112
* the corresponding ns config looks:
113
* q_weight=0.00195
114
* minthresh=5 maxthresh=15 queue-size=60
115
* linterm=30
116
* dropmech=drop-tail
117
* bytes=false (can't be handled by 32-bit fixed-point)
118
* doubleq=false dqthresh=false
119
* wait=true
120
*/
121
/*
 * alternative red parameters for a slow link.
 *
 * suppose the queue length jumps from zero to L and then stays at L;
 * it takes N packets for q_avg to reach 63% of L.
 *	when q_weight is 0.002, N is about 500 packets.
 *	for a slow link like dial-up, 500 packets take more than 1 minute!
 *	when q_weight is 0.008, N is about 127 packets.
 *	when q_weight is 0.016, N is about 63 packets.
 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
 * are allowed for 0.016.
 * see Sally's paper for more details.
 */
134
/* normal red parameters */
135
#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
136
/* q_weight = 0.00195 */
137
138
/* red parameters for a slow link */
139
#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
140
/* q_weight = 0.0078125 */
141
142
/* red parameters for a very slow link (e.g., dialup) */
143
#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
144
/* q_weight = 0.015625 */
145
146
/* fixed-point uses 12-bit decimal places */
147
#define FP_SHIFT 12 /* fixed-point shift */
148
149
/* red parameters for drop probability */
150
#define INV_P_MAX 10 /* inverse of max drop probability */
151
#define TH_MIN 5 /* min threshold */
152
#define TH_MAX 15 /* max threshold */
153
154
#define RED_LIMIT 60 /* default max queue length */
155
#define RED_STATS /* collect statistics */
156
157
/*
158
* our default policy for forced-drop is drop-tail.
159
* (in altq-1.1.2 or earlier, the default was random-drop.
160
* but it makes more sense to punish the cause of the surge.)
161
* to switch to the random-drop policy, define "RED_RANDOM_DROP".
162
*/
163
164
/* default red parameter values */
165
static int default_th_min = TH_MIN;
166
static int default_th_max = TH_MAX;
167
static int default_inv_pmax = INV_P_MAX;
168
169
/*
170
* red support routines
171
*/
172
red_t *
173
red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
174
int pkttime)
175
{
176
red_t *rp;
177
int w, i;
178
int npkts_per_sec;
179
180
rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO);
181
if (rp == NULL)
182
return (NULL);
183
184
if (weight == 0)
185
rp->red_weight = W_WEIGHT;
186
else
187
rp->red_weight = weight;
188
189
/* allocate weight table */
190
rp->red_wtab = wtab_alloc(rp->red_weight);
191
if (rp->red_wtab == NULL) {
192
free(rp, M_DEVBUF);
193
return (NULL);
194
}
195
196
rp->red_avg = 0;
197
rp->red_idle = 1;
198
199
if (inv_pmax == 0)
200
rp->red_inv_pmax = default_inv_pmax;
201
else
202
rp->red_inv_pmax = inv_pmax;
203
if (th_min == 0)
204
rp->red_thmin = default_th_min;
205
else
206
rp->red_thmin = th_min;
207
if (th_max == 0)
208
rp->red_thmax = default_th_max;
209
else
210
rp->red_thmax = th_max;
211
212
rp->red_flags = flags;
213
214
if (pkttime == 0)
215
/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
216
rp->red_pkttime = 800;
217
else
218
rp->red_pkttime = pkttime;
219
220
if (weight == 0) {
221
/* when the link is very slow, adjust red parameters */
222
npkts_per_sec = 1000000 / rp->red_pkttime;
223
if (npkts_per_sec < 50) {
224
/* up to about 400Kbps */
225
rp->red_weight = W_WEIGHT_2;
226
} else if (npkts_per_sec < 300) {
227
/* up to about 2.4Mbps */
228
rp->red_weight = W_WEIGHT_1;
229
}
230
}
231
232
/* calculate wshift. weight must be power of 2 */
233
w = rp->red_weight;
234
for (i = 0; w > 1; i++)
235
w = w >> 1;
236
rp->red_wshift = i;
237
w = 1 << rp->red_wshift;
238
if (w != rp->red_weight) {
239
printf("invalid weight value %d for red! use %d\n",
240
rp->red_weight, w);
241
rp->red_weight = w;
242
}
243
244
/*
245
* thmin_s and thmax_s are scaled versions of th_min and th_max
246
* to be compared with avg.
247
*/
248
rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
249
rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
250
251
/*
252
* precompute probability denominator
253
* probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
254
*/
255
rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
256
* rp->red_inv_pmax) << FP_SHIFT;
257
258
microtime(&rp->red_last);
259
return (rp);
260
}
261
262
void
263
red_destroy(red_t *rp)
264
{
265
wtab_destroy(rp->red_wtab);
266
free(rp, M_DEVBUF);
267
}
268
269
void
270
red_getstats(red_t *rp, struct redstats *sp)
271
{
272
sp->q_avg = rp->red_avg >> rp->red_wshift;
273
sp->xmit_cnt = rp->red_stats.xmit_cnt;
274
sp->drop_cnt = rp->red_stats.drop_cnt;
275
sp->drop_forced = rp->red_stats.drop_forced;
276
sp->drop_unforced = rp->red_stats.drop_unforced;
277
sp->marked_packets = rp->red_stats.marked_packets;
278
}
279
280
int
281
red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
282
struct altq_pktattr *pktattr)
283
{
284
int avg, droptype;
285
int n;
286
287
avg = rp->red_avg;
288
289
/*
290
* if we were idle, we pretend that n packets arrived during
291
* the idle period.
292
*/
293
if (rp->red_idle) {
294
struct timeval now;
295
int t;
296
297
rp->red_idle = 0;
298
microtime(&now);
299
t = (now.tv_sec - rp->red_last.tv_sec);
300
if (t > 60) {
301
/*
302
* being idle for more than 1 minute, set avg to zero.
303
* this prevents t from overflow.
304
*/
305
avg = 0;
306
} else {
307
t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
308
n = t / rp->red_pkttime - 1;
309
310
/* the following line does (avg = (1 - Wq)^n * avg) */
311
if (n > 0)
312
avg = (avg >> FP_SHIFT) *
313
pow_w(rp->red_wtab, n);
314
}
315
}
316
317
/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
318
avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
319
rp->red_avg = avg; /* save the new value */
320
321
/*
322
* red_count keeps a tally of arriving traffic that has not
323
* been dropped.
324
*/
325
rp->red_count++;
326
327
/* see if we drop early */
328
droptype = DTYPE_NODROP;
329
if (avg >= rp->red_thmin_s && qlen(q) > 1) {
330
if (avg >= rp->red_thmax_s) {
331
/* avg >= th_max: forced drop */
332
droptype = DTYPE_FORCED;
333
} else if (rp->red_old == 0) {
334
/* first exceeds th_min */
335
rp->red_count = 1;
336
rp->red_old = 1;
337
} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
338
rp->red_probd, rp->red_count)) {
339
/* mark or drop by red */
340
if ((rp->red_flags & REDF_ECN) &&
341
mark_ecn(m, pktattr, rp->red_flags)) {
342
/* successfully marked. do not drop. */
343
rp->red_count = 0;
344
#ifdef RED_STATS
345
rp->red_stats.marked_packets++;
346
#endif
347
} else {
348
/* unforced drop by red */
349
droptype = DTYPE_EARLY;
350
}
351
}
352
} else {
353
/* avg < th_min */
354
rp->red_old = 0;
355
}
356
357
/*
358
* if the queue length hits the hard limit, it's a forced drop.
359
*/
360
if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
361
droptype = DTYPE_FORCED;
362
363
#ifdef RED_RANDOM_DROP
364
/* if successful or forced drop, enqueue this packet. */
365
if (droptype != DTYPE_EARLY)
366
_addq(q, m);
367
#else
368
/* if successful, enqueue this packet. */
369
if (droptype == DTYPE_NODROP)
370
_addq(q, m);
371
#endif
372
if (droptype != DTYPE_NODROP) {
373
if (droptype == DTYPE_EARLY) {
374
/* drop the incoming packet */
375
#ifdef RED_STATS
376
rp->red_stats.drop_unforced++;
377
#endif
378
} else {
379
/* forced drop, select a victim packet in the queue. */
380
#ifdef RED_RANDOM_DROP
381
m = _getq_random(q);
382
#endif
383
#ifdef RED_STATS
384
rp->red_stats.drop_forced++;
385
#endif
386
}
387
#ifdef RED_STATS
388
PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
389
#endif
390
rp->red_count = 0;
391
m_freem(m);
392
return (-1);
393
}
394
/* successfully queued */
395
#ifdef RED_STATS
396
PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
397
#endif
398
return (0);
399
}
400
401
/*
402
* early-drop probability is calculated as follows:
403
* prob = p_max * (avg - th_min) / (th_max - th_min)
404
* prob_a = prob / (2 - count*prob)
405
* = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
406
* here prob_a increases as successive undrop count increases.
407
* (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
408
* becomes 1 when (count >= (2 / prob))).
409
*/
410
int
411
drop_early(int fp_len, int fp_probd, int count)
412
{
413
int d; /* denominator of drop-probability */
414
415
d = fp_probd - count * fp_len;
416
if (d <= 0)
417
/* count exceeds the hard limit: drop or mark */
418
return (1);
419
420
/*
421
* now the range of d is [1..600] in fixed-point. (when
422
* th_max-th_min=10 and p_max=1/30)
423
* drop probability = (avg - TH_MIN) / d
424
*/
425
426
if ((arc4random() % d) < fp_len) {
427
/* drop or mark */
428
return (1);
429
}
430
/* no drop/mark */
431
return (0);
432
}
433
434
/*
435
* try to mark CE bit to the packet.
436
* returns 1 if successfully marked, 0 otherwise.
437
*/
438
int
439
mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
440
{
441
struct mbuf *m0;
442
struct pf_mtag *at;
443
void *hdr;
444
445
at = pf_find_mtag(m);
446
if (at != NULL) {
447
hdr = at->hdr;
448
} else
449
return (0);
450
451
/* verify that pattr_hdr is within the mbuf data */
452
for (m0 = m; m0 != NULL; m0 = m0->m_next)
453
if (((caddr_t)hdr >= m0->m_data) &&
454
((caddr_t)hdr < m0->m_data + m0->m_len))
455
break;
456
if (m0 == NULL) {
457
/* ick, tag info is stale */
458
return (0);
459
}
460
461
switch (((struct ip *)hdr)->ip_v) {
462
case IPVERSION:
463
if (flags & REDF_ECN4) {
464
struct ip *ip = hdr;
465
u_int8_t otos;
466
int sum;
467
468
if (ip->ip_v != 4)
469
return (0); /* version mismatch! */
470
471
if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
472
return (0); /* not-ECT */
473
if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
474
return (1); /* already marked */
475
476
/*
477
* ecn-capable but not marked,
478
* mark CE and update checksum
479
*/
480
otos = ip->ip_tos;
481
ip->ip_tos |= IPTOS_ECN_CE;
482
/*
483
* update checksum (from RFC1624)
484
* HC' = ~(~HC + ~m + m')
485
*/
486
sum = ~ntohs(ip->ip_sum) & 0xffff;
487
sum += (~otos & 0xffff) + ip->ip_tos;
488
sum = (sum >> 16) + (sum & 0xffff);
489
sum += (sum >> 16); /* add carry */
490
ip->ip_sum = htons(~sum & 0xffff);
491
return (1);
492
}
493
break;
494
#ifdef INET6
495
case (IPV6_VERSION >> 4):
496
if (flags & REDF_ECN6) {
497
struct ip6_hdr *ip6 = hdr;
498
u_int32_t flowlabel;
499
500
flowlabel = ntohl(ip6->ip6_flow);
501
if ((flowlabel >> 28) != 6)
502
return (0); /* version mismatch! */
503
if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
504
(IPTOS_ECN_NOTECT << 20))
505
return (0); /* not-ECT */
506
if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
507
(IPTOS_ECN_CE << 20))
508
return (1); /* already marked */
509
/*
510
* ecn-capable but not marked, mark CE
511
*/
512
flowlabel |= (IPTOS_ECN_CE << 20);
513
ip6->ip6_flow = htonl(flowlabel);
514
return (1);
515
}
516
break;
517
#endif /* INET6 */
518
}
519
520
/* not marked */
521
return (0);
522
}
523
524
struct mbuf *
525
red_getq(red_t *rp, class_queue_t *q)
526
{
527
struct mbuf *m;
528
529
if ((m = _getq(q)) == NULL) {
530
if (rp->red_idle == 0) {
531
rp->red_idle = 1;
532
microtime(&rp->red_last);
533
}
534
return NULL;
535
}
536
537
rp->red_idle = 0;
538
return (m);
539
}
540
541
/*
542
* helper routine to calibrate avg during idle.
543
* pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
544
* here Wq = 1/weight and the code assumes Wq is close to zero.
545
*
546
* w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
547
*/
548
static struct wtab *wtab_list = NULL; /* pointer to wtab list */
549
550
struct wtab *
551
wtab_alloc(int weight)
552
{
553
struct wtab *w;
554
int i;
555
556
for (w = wtab_list; w != NULL; w = w->w_next)
557
if (w->w_weight == weight) {
558
w->w_refcount++;
559
return (w);
560
}
561
562
w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO);
563
if (w == NULL)
564
return (NULL);
565
w->w_weight = weight;
566
w->w_refcount = 1;
567
w->w_next = wtab_list;
568
wtab_list = w;
569
570
/* initialize the weight table */
571
w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
572
for (i = 1; i < 32; i++) {
573
w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
574
if (w->w_tab[i] == 0 && w->w_param_max == 0)
575
w->w_param_max = 1 << i;
576
}
577
578
return (w);
579
}
580
581
int
582
wtab_destroy(struct wtab *w)
583
{
584
struct wtab *prev;
585
586
if (--w->w_refcount > 0)
587
return (0);
588
589
if (wtab_list == w)
590
wtab_list = w->w_next;
591
else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
592
if (prev->w_next == w) {
593
prev->w_next = w->w_next;
594
break;
595
}
596
597
free(w, M_DEVBUF);
598
return (0);
599
}
600
601
int32_t
602
pow_w(struct wtab *w, int n)
603
{
604
int i, bit;
605
int32_t val;
606
607
if (n >= w->w_param_max)
608
return (0);
609
610
val = 1 << FP_SHIFT;
611
if (n <= 0)
612
return (val);
613
614
bit = 1;
615
i = 0;
616
while (n) {
617
if (n & bit) {
618
val = (val * w->w_tab[i]) >> FP_SHIFT;
619
n &= ~bit;
620
}
621
i++;
622
bit <<= 1;
623
}
624
return (val);
625
}
626
627
#endif /* ALTQ_RED */
628
629