GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/sched/sch_choke.c
/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <[email protected]>
 * Copyright (c) 2011 Eric Dumazet <[email protected]>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/reciprocal_div.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <net/ipv6.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows
   but maintains no flow state. The difference from RED is an additional step
   during the enqueuing process: if the average queue size is over the
   low threshold (qmin), a packet is chosen at random from the queue.
   If both the new and the chosen packet are from the same flow, both
   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
   needs to access packets in the queue randomly. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe: A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, and S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004.

 */
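
/*
 * Example configuration (an illustrative sketch; option names are those
 * documented in tc-choke(8), and the values below are arbitrary
 * placeholders):
 *
 *	tc qdisc add dev eth0 parent 1:1 handle 10: choke \
 *		limit 1000 bandwidth 10mbit ecn
 *
 * With "ecn", packets of responsive flows can be marked rather than dropped
 * once the average queue size passes the thresholds; a new packet that
 * matches the flow of a randomly drawn queued packet is still dropped
 * together with that packet.
 */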

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct tcf_proto *filter_list;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;		/* Drops due to queue limits */
		u32	other;		/* Drops due to drop() calls */
		u32	matched;	/* Drops due to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};

/* deliver a random number between 0 and N - 1 */
static u32 random_N(unsigned int N)
{
	return reciprocal_divide(random32(), N);
}

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	sch->qstats.backlog -= qdisc_pkt_len(skb);
	qdisc_drop(skb, sch);
	qdisc_tree_decrease_qlen(sch, 1);
	--sch->q.qlen;
}

/*
 * Compare the flow of two packets.
 * Returns true only if source and destination address and port match;
 * false for special cases.
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	int off1, off2, poff;
	const u32 *ports1, *ports2;
	u8 ip_proto;
	__u32 hash1;

	if (skb1->protocol != skb2->protocol)
		return false;

	/* Use hash value as quick check
	 * Assumes that __skb_get_rxhash makes IP header and ports linear
	 */
	hash1 = skb_get_rxhash(skb1);
	if (!hash1 || hash1 != skb_get_rxhash(skb2))
		return false;

	/* Probably a match, but be sure to avoid hash collisions */
	off1 = skb_network_offset(skb1);
	off2 = skb_network_offset(skb2);

	switch (skb1->protocol) {
	case __constant_htons(ETH_P_IP): {
		const struct iphdr *ip1, *ip2;

		ip1 = (const struct iphdr *) (skb1->data + off1);
		ip2 = (const struct iphdr *) (skb2->data + off2);

		ip_proto = ip1->protocol;
		if (ip_proto != ip2->protocol ||
		    ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr)
			return false;

		if ((ip1->frag_off | ip2->frag_off) & htons(IP_MF | IP_OFFSET))
			ip_proto = 0;
		off1 += ip1->ihl * 4;
		off2 += ip2->ihl * 4;
		break;
	}

	case __constant_htons(ETH_P_IPV6): {
		const struct ipv6hdr *ip1, *ip2;

		ip1 = (const struct ipv6hdr *) (skb1->data + off1);
		ip2 = (const struct ipv6hdr *) (skb2->data + off2);

		ip_proto = ip1->nexthdr;
		if (ip_proto != ip2->nexthdr ||
		    ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) ||
		    ipv6_addr_cmp(&ip1->daddr, &ip2->daddr))
			return false;
		off1 += 40;
		off2 += 40;
		break;
	}

	default: /* Maybe compare MAC header here? */
		return false;
	}

	poff = proto_ports_offset(ip_proto);
	if (poff < 0)
		return true;

	off1 += poff;
	off2 += poff;

	ports1 = (__force u32 *)(skb1->data + off1);
	ports2 = (__force u32 *)(skb2->data + off2);
	return *ports1 == *ports2;
}

struct choke_skb_cb {
	u16 classid;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(skb->cb) <
		sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
	choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
	return choke_skb_cb(skb)->classid;
}

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
			   struct Qdisc *sch, int *qerr)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return false;
		}
#endif
		choke_set_classid(skb, TC_H_MIN(res.classid));
		return true;
	}

	return false;
}

/*
 * Select a packet at random from the queue.
 * HACK: since the queue can have holes from previous deletions, retry several
 * times to find a random skb, then just give up and return the head.
 * Will return NULL if the queue is empty (q->head == q->tail).
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retrys = 3;

	do {
		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retrys > 0);

	return q->tab[*pidx = q->head];
}

/*
 * Compare the new packet with a random packet in the queue.
 * Returns true if matched and sets *pidx.
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	if (q->filter_list)
		return choke_get_classid(nskb) == choke_get_classid(oskb);

	return choke_match_flow(oskb, nskb);
}

static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct red_parms *p = &q->parms;
	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;

	if (q->filter_list) {
		/* If using external classifiers, get result and record it. */
		if (!choke_classify(skb, sch, &ret))
			goto other_drop;	/* Packet was eaten by filter */
	}

	/* Compute average queue usage (see RED) */
	p->qavg = red_calc_qavg(p, sch->q.qlen);
	if (red_is_idling(p))
		red_end_of_idle_period(p);

	/* Is queue small? */
	if (p->qavg <= p->qth_min)
		p->qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			q->stats.matched++;
			choke_drop_by_idx(sch, idx);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (p->qavg > p->qth_max) {
			p->qcount = -1;

			sch->qstats.overlimits++;
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				q->stats.forced_drop++;
				goto congestion_drop;
			}

			q->stats.forced_mark++;
		} else if (++p->qcount) {
			if (red_mark_probability(p, p->qavg)) {
				p->qcount = 0;
				p->qR = red_random(p);

				sch->qstats.overlimits++;
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					q->stats.prob_drop++;
					goto congestion_drop;
				}

				q->stats.prob_mark++;
			}
		} else
			p->qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		sch->qstats.backlog += qdisc_pkt_len(skb);
		return NET_XMIT_SUCCESS;
	}

	q->stats.pdrop++;
	sch->qstats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;

congestion_drop:
	qdisc_drop(skb, sch);
	return NET_XMIT_CN;

other_drop:
	if (ret & __NET_XMIT_BYPASS)
		sch->qstats.drops++;
	kfree_skb(skb);
	return ret;
}

static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->parms))
			red_start_of_idle_period(&q->parms);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	sch->qstats.backlog -= qdisc_pkt_len(skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

static unsigned int choke_drop(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	len = qdisc_queue_drop(sch);
	if (len > 0)
		q->stats.other++;
	else {
		if (!red_is_idling(&q->parms))
			red_start_of_idle_period(&q->parms);
	}

	return len;
}

static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	red_restart(&q->parms);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
};

static void choke_free(void *addr)
{
	if (addr) {
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}
}

static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
		if (!ntab)
			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;

			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				sch->qstats.backlog -= qdisc_pkt_len(skb);
				--sch->q.qlen;
				qdisc_drop(skb, sch);
			}
			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	q->flags = ctl->flags;
	q->limit = ctl->limit;

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      nla_data(tb[TCA_CHOKE_STAB]));

	if (q->head == q->tail)
		red_end_of_idle_period(&q->parms);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
	return choke_change(sch, opt);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= q->limit,
		.flags		= q->flags,
		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
		.Wlog		= q->parms.Wlog,
		.Plog		= q->parms.Plog,
		.Scell_log	= q->parms.Scell_log,
	};

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= q->stats.prob_drop + q->stats.forced_drop,
		.marked	= q->stats.prob_mark + q->stats.forced_mark,
		.pdrop	= q->stats.pdrop,
		.other	= q->stats.other,
		.matched = q->stats.matched,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	choke_free(q->tab);
}

static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long choke_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static void choke_put(struct Qdisc *q, unsigned long cl)
{
}

static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
				u32 classid)
{
	return 0;
}

static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	if (!arg->stop) {
		if (arg->fn(sch, 1, arg) < 0) {
			arg->stop = 1;
			return;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops choke_class_ops = {
	.leaf		=	choke_leaf,
	.get		=	choke_get,
	.put		=	choke_put,
	.tcf_chain	=	choke_find_tcf,
	.bind_tcf	=	choke_bind,
	.unbind_tcf	=	choke_put,
	.dump		=	choke_dump_class,
	.walk		=	choke_walk,
};

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.drop		=	choke_drop,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");