Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
awilliam
GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/sched/sch_cbq.c
15111 views
1
/*
2
* net/sched/sch_cbq.c Class-Based Queueing discipline.
3
*
4
* This program is free software; you can redistribute it and/or
5
* modify it under the terms of the GNU General Public License
6
* as published by the Free Software Foundation; either version
7
* 2 of the License, or (at your option) any later version.
8
*
9
* Authors: Alexey Kuznetsov, <[email protected]>
10
*
11
*/
12
13
#include <linux/module.h>
14
#include <linux/slab.h>
15
#include <linux/types.h>
16
#include <linux/kernel.h>
17
#include <linux/string.h>
18
#include <linux/errno.h>
19
#include <linux/skbuff.h>
20
#include <net/netlink.h>
21
#include <net/pkt_sched.h>
22
23
24
/* Class-Based Queueing (CBQ) algorithm.
25
=======================================
26
27
Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
28
Management Models for Packet Networks",
29
IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
30
31
[2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
32
33
[3] Sally Floyd, "Notes on Class-Based Queueing: Setting
34
Parameters", 1996
35
36
[4] Sally Floyd and Michael Speer, "Experimental Results
37
for Class-Based Queueing", 1998, not published.
38
39
-----------------------------------------------------------------------
40
41
Algorithm skeleton was taken from NS simulator cbq.cc.
42
If someone wants to check this code against the LBL version,
43
he should take into account that ONLY the skeleton was borrowed,
44
the implementation is different. Particularly:
45
46
--- The WRR algorithm is different. Our version looks more
47
reasonable (I hope) and works when quanta are allowed to be
48
less than MTU, which is always the case when real time classes
49
have small rates. Note, that the statement of [3] is
50
incomplete, delay may actually be estimated even if class
51
per-round allotment is less than MTU. Namely, if per-round
52
allotment is W*r_i, and r_1+...+r_k = r < 1
53
54
delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
55
56
In the worst case we have IntServ estimate with D = W*r+k*MTU
57
and C = MTU*r. The proof (if correct at all) is trivial.
58
59
60
--- It seems that cbq-2.0 is not very accurate. At least, I cannot
61
interpret some places, which look like wrong translations
62
from NS. Anyone is advised to find these differences
63
and explain to me, why I am wrong 8).
64
65
--- Linux has no EOI event, so that we cannot estimate true class
66
idle time. Workaround is to consider the next dequeue event
67
as sign that previous packet is finished. This is wrong because of
68
internal device queueing, but on a permanently loaded link it is true.
69
Moreover, combined with clock integrator, this scheme looks
70
very close to an ideal solution. */
71
72
struct cbq_sched_data;
73
74
75
/* Per-class state of the CBQ discipline: configuration fed in from
 * userspace (priority, rate, link-sharing parameters), tree/ring links,
 * and the run-time variables driven by cbq_update()/cbq_dequeue().
 */
struct cbq_class {
	struct Qdisc_class_common common;
	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	unsigned char		priority;	/* class priority */
	unsigned char		priority2;	/* priority to be used after overlimit */
	unsigned char		ewma_log;	/* time constant for idle time calculation */
	unsigned char		ovl_strategy;	/* which TC_CBQ_OVL_* overlimit action is configured */
#ifdef CONFIG_NET_CLS_ACT
	unsigned char		police;
#endif

	u32			defmap;		/* bitmap of TC_PRIO_* values this class is default for */

	/* Link-sharing scheduler parameters */
	long			maxidle;	/* Class parameters: see below. */
	long			offtime;
	long			minidle;
	u32			avpkt;
	struct qdisc_rate_table	*R_tab;

	/* Overlimit strategy parameters */
	void			(*overlimit)(struct cbq_class *cl);
	psched_tdiff_t		penalty;

	/* General scheduler (WRR) parameters */
	long			allot;
	long			quantum;	/* Allotment per WRR round */
	long			weight;		/* Relative allotment: see below */

	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class	*split;		/* Ptr to split node */
	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
						   parent otherwise */
	struct cbq_class	*sibling;	/* Sibling chain (circular) */
	struct cbq_class	*children;	/* Pointer to children chain */

	struct Qdisc		*q;		/* Elementary queueing discipline */


/* Variables */
	unsigned char		cpriority;	/* Effective priority */
	unsigned char		delayed;	/* non-zero while an overlimit delay is in force */
	unsigned char		level;		/* level of the class in hierarchy:
						   0 for leaf classes, and maximal
						   level of children + 1 for nodes.
						 */

	psched_time_t		last;		/* Last end of service */
	psched_time_t		undertime;	/* time when the class becomes underlimit again */
	long			avgidle;	/* scaled EWMA of idle time, see cbq_update() */
	long			deficit;	/* Saved deficit for WRR */
	psched_time_t		penalized;	/* when a priority penalty expires (see cbq_undelay_prio) */
	struct gnet_stats_basic_packed bstats;
	struct gnet_stats_queue	qstats;
	struct gnet_stats_rate_est rate_est;
	struct tc_cbq_xstats	xstats;

	struct tcf_proto	*filter_list;

	int			refcnt;
	int			filters;	/* number of filters referencing this class */

	struct cbq_class	*defaults[TC_PRIO_MAX + 1];	/* per-TC_PRIO default classes of a split node */
};
143
144
/* Qdisc-wide CBQ state: the class hash, per-priority-band bookkeeping,
 * the cached virtual/real clocks, and the delay/watchdog timers.
 */
struct cbq_sched_data {
	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
	int			nclasses[TC_CBQ_MAXPRIO + 1];	/* number of classes per band (cbq_addprio/rmprio) */
	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];	/* sum of class weights per band */

	struct cbq_class	link;			/* built-in root ("link") class */

	unsigned int		activemask;		/* bitmap of bands with backlogged classes */
	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
								   with backlog (tail pointers
								   of circular rings) */

#ifdef CONFIG_NET_CLS_ACT
	struct cbq_class	*rx_class;		/* class of the packet being enqueued */
#endif
	struct cbq_class	*tx_class;		/* class that emitted the last packet */
	struct cbq_class	*tx_borrowed;		/* class it borrowed bandwidth from */
	int			tx_len;			/* length of that packet */
	psched_time_t		now;			/* Cached timestamp (virtual, integrator-warped) */
	psched_time_t		now_rt;			/* Cached real time */
	unsigned int		pmask;			/* bitmap of bands holding penalized classes */

	struct hrtimer		delay_timer;
	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
						   started when CBQ has
						   backlog, but cannot
						   transmit just now */
	psched_tdiff_t		wd_expires;	/* soonest wakeup wanted by overlimit actions */
	int			toplevel;	/* top-level link-sharing cutoff */
	u32			hgenerator;	/* presumably a generator for automatic class ids — not used in this chunk, verify against the rest of the file */
};
174
175
176
/* Length-to-time: transmission time of a @len byte packet according to
 * the class's configured rate table.
 */
#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
177
178
static inline struct cbq_class *
179
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
180
{
181
struct Qdisc_class_common *clc;
182
183
clc = qdisc_class_find(&q->clhash, classid);
184
if (clc == NULL)
185
return NULL;
186
return container_of(clc, struct cbq_class, common);
187
}
188
189
#ifdef CONFIG_NET_CLS_ACT
190
191
static struct cbq_class *
192
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
193
{
194
struct cbq_class *cl;
195
196
for (cl = this->tparent; cl; cl = cl->tparent) {
197
struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
198
199
if (new != NULL && new != this)
200
return new;
201
}
202
return NULL;
203
}
204
205
#endif
206
207
/* Classify packet. The procedure is pretty complicated, but
208
* it allows us to combine link sharing and priority scheduling
209
* transparently.
210
*
211
* Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
212
* so that it resolves to split nodes. Then packets are classified
213
* by logical priority, or a more specific classifier may be attached
214
* to the split node.
215
*/
216
217
/* Map an skb to a CBQ class.  Tries, in order: a direct classid match in
 * skb->priority, the attached filter chain (repeated down through split
 * nodes), the split node's per-TC_PRIO defaults, and finally the node
 * itself.  Sets *qerr for the bypass/stolen cases.
 */
static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
	struct tcf_result res;

	/*
	 * Step 1. If skb->priority points to one of our classes, use it.
	 */
	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	for (;;) {
		int result = 0;
		defmap = head->defaults;

		/*
		 * Step 2+n. Apply classifier.
		 */
		if (!head->filter_list ||
		    (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
			goto fallback;

		cl = (void *)res.class;
		if (!cl) {
			/* Filter returned no class object: resolve classid,
			 * falling back to the split node's default map.
			 */
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
				cl = defmap[TC_PRIO_BESTEFFORT];

			if (cl == NULL || cl->level >= head->level)
				goto fallback;
		}

#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through -- packet consumed, report no class */
		case TC_ACT_SHOT:
			return NULL;
		case TC_ACT_RECLASSIFY:
			return cbq_reclassify(skb, cl);
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If classifier selected a link sharing class,
		 * apply agency specific classifier.
		 * Repeat this procedure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 */
	if (TC_H_MAJ(prio) == 0 &&
	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}
292
293
/*
294
* A packet has just been enqueued on the empty class.
295
* cbq_activate_class adds it to the tail of active class list
296
* of its priority band.
297
*/
298
299
static inline void cbq_activate_class(struct cbq_class *cl)
300
{
301
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
302
int prio = cl->cpriority;
303
struct cbq_class *cl_tail;
304
305
cl_tail = q->active[prio];
306
q->active[prio] = cl;
307
308
if (cl_tail != NULL) {
309
cl->next_alive = cl_tail->next_alive;
310
cl_tail->next_alive = cl;
311
} else {
312
cl->next_alive = cl;
313
q->activemask |= (1<<prio);
314
}
315
}
316
317
/*
318
* Unlink class from active chain.
319
* Note that this same procedure is done directly in cbq_dequeue*
320
* during round-robin procedure.
321
*/
322
323
/* Unlink @this from its band's circular active list.  Mirrors the inline
 * unlink done in cbq_dequeue_prio(); also retires the band when @this was
 * its only member.
 */
static void cbq_deactivate_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	int prio = this->cpriority;
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];

	/* Walk the ring starting at the tail until we find @this. */
	do {
		cl = cl_prev->next_alive;
		if (cl == this) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;

			if (cl == q->active[prio]) {
				/* @this was the tail pointer: step it back. */
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					/* Ring had one element: band is idle now. */
					q->active[prio] = NULL;
					q->activemask &= ~(1<<prio);
					return;
				}
			}
			return;
		}
	} while ((cl_prev = cl) != q->active[prio]);
}
348
349
/* Lower the top-level link-sharing cutoff to the level of the first
 * class on @cl's borrow chain that is already underlimit, allowing
 * descendants to borrow again.
 */
static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
	int toplevel = q->toplevel;

	if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
		psched_time_t now;
		psched_tdiff_t incr;

		/* Advance the cached virtual clock by real time elapsed. */
		now = psched_get_time();
		incr = now - q->now_rt;
		now = q->now + incr;

		do {
			if (cl->undertime < now) {
				q->toplevel = cl->level;
				return;
			}
		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
	}
}
370
371
/* Qdisc ->enqueue(): classify the skb, hand it to the class's inner
 * qdisc, and on success update qlen, the toplevel cutoff, and the band's
 * active list.  Drop statistics are charged on failure.
 */
static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	int uninitialized_var(ret);
	struct cbq_class *cl = cbq_classify(skb, sch, &ret);

#ifdef CONFIG_NET_CLS_ACT
	/* Remember the class for a possible cbq_reshape_fail() callback. */
	q->rx_class = cl;
#endif
	if (cl == NULL) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}

#ifdef CONFIG_NET_CLS_ACT
	cl->q->__parent = sch;
#endif
	ret = qdisc_enqueue(skb, cl->q);
	if (ret == NET_XMIT_SUCCESS) {
		sch->q.qlen++;
		cbq_mark_toplevel(q, cl);
		/* next_alive == NULL means the class was idle: activate it. */
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return ret;
	}

	if (net_xmit_drop_count(ret)) {
		sch->qstats.drops++;
		cbq_mark_toplevel(q, cl);
		cl->qstats.drops++;
	}
	return ret;
}
407
408
/* Overlimit actions */
409
410
/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
411
412
/* TC_CBQ_OVL_CLASSIC overlimit action: push the class's undertime out by
 * offtime (plus the avgidle debt) and arm the qdisc watchdog for the
 * earliest useful wakeup.
 */
static void cbq_ovl_classic(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (!cl->delayed) {
		delay += cl->offtime;

		/*
		 * Class goes to sleep, so that it will have no
		 * chance to work avgidle. Let's forgive it 8)
		 *
		 * BTW cbq-2.0 has a crap in this
		 * place, apparently they forgot to shift it by cl->ewma_log.
		 */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		if (delay <= 0)
			delay = 1;
		cl->undertime = q->now + delay;

		cl->xstats.overactions++;
		cl->delayed = 1;
	}
	/* Keep the earliest pending expiry for the watchdog. */
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;

	/* Dirty work! We must schedule wakeups based on
	 * real available rate, rather than leaf rate,
	 * which may be tiny (even zero).
	 */
	if (q->toplevel == TC_CBQ_MAXLEVEL) {
		struct cbq_class *b;
		psched_tdiff_t base_delay = q->wd_expires;

		for (b = cl->borrow; b; b = b->borrow) {
			delay = b->undertime - q->now;
			if (delay < base_delay) {
				if (delay <= 0)
					delay = 1;
				base_delay = delay;
			}
		}

		q->wd_expires = base_delay;
	}
}
461
462
/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
463
* they go overlimit
464
*/
465
466
/* TC_CBQ_OVL_RCLASSIC overlimit action: intended to penalize a class on
 * the borrow chain rather than the leaf.
 *
 * NOTE(review): both ways out of the loop leave cl == NULL (the break
 * sets it, and the while condition only exits on NULL), so the penalty
 * appears to always land on the original class — preserved as-is.
 */
static void cbq_ovl_rclassic(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this = cl;

	do {
		if (cl->level > q->toplevel) {
			cl = NULL;
			break;
		}
	} while ((cl = cl->borrow) != NULL);

	if (cl == NULL)
		cl = this;
	cbq_ovl_classic(cl);
}
482
483
/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
484
485
/* TC_CBQ_OVL_DELAY overlimit action: move the class to the penalty band
 * (TC_CBQ_MAXPRIO) until it goes underlimit, waking it via delay_timer
 * (see cbq_undelay()).  Falls back to the watchdog path when no positive
 * delay results.
 */
static void cbq_ovl_delay(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	/* Nothing to schedule while the device is being deactivated. */
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(cl->qdisc)->state))
		return;

	if (!cl->delayed) {
		psched_time_t sched = q->now;
		ktime_t expires;

		delay += cl->offtime;
		/* Same avgidle-debt adjustment as cbq_ovl_classic(). */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		cl->undertime = q->now + delay;

		if (delay > 0) {
			sched += delay + cl->penalty;
			cl->penalized = sched;
			cl->cpriority = TC_CBQ_MAXPRIO;
			q->pmask |= (1<<TC_CBQ_MAXPRIO);

			expires = ktime_set(0, 0);
			expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched));
			/* Only pull the timer earlier, never push it out. */
			if (hrtimer_try_to_cancel(&q->delay_timer) &&
			    ktime_to_ns(ktime_sub(
					hrtimer_get_expires(&q->delay_timer),
					expires)) > 0)
				hrtimer_set_expires(&q->delay_timer, expires);
			hrtimer_restart(&q->delay_timer);
			cl->delayed = 1;
			cl->xstats.overactions++;
			return;
		}
		delay = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;
}
528
529
/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
530
531
static void cbq_ovl_lowprio(struct cbq_class *cl)
532
{
533
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
534
535
cl->penalized = q->now + cl->penalty;
536
537
if (cl->cpriority != cl->priority2) {
538
cl->cpriority = cl->priority2;
539
q->pmask |= (1<<cl->cpriority);
540
cl->xstats.overactions++;
541
}
542
cbq_ovl_classic(cl);
543
}
544
545
/* TC_CBQ_OVL_DROP: penalize class by dropping */
546
547
static void cbq_ovl_drop(struct cbq_class *cl)
548
{
549
if (cl->q->ops->drop)
550
if (cl->q->ops->drop(cl->q))
551
cl->qdisc->q.qlen--;
552
cl->xstats.overactions++;
553
cbq_ovl_classic(cl);
554
}
555
556
/* Scan one penalty band: re-activate every class whose penalty has
 * expired (restoring its normal priority), and return how long until the
 * earliest still-pending penalty in this band expires (0 if none).
 */
static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
				       psched_time_t now)
{
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];
	psched_time_t sched = now;

	if (cl_prev == NULL)
		return 0;

	do {
		cl = cl_prev->next_alive;
		if (now - cl->penalized > 0) {
			/* Penalty expired: unlink from the penalty ring and
			 * put the class back on its own band.
			 */
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;
			cl->cpriority = cl->priority;
			cl->delayed = 0;
			cbq_activate_class(cl);

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					/* Band emptied out. */
					q->active[prio] = NULL;
					return 0;
				}
			}

			cl = cl_prev->next_alive;
		} else if (sched - cl->penalized > 0)
			/* Track the earliest future expiry. */
			sched = cl->penalized;
	} while ((cl_prev = cl) != q->active[prio]);

	return sched - now;
}
590
591
/* delay_timer callback: release expired penalties in every penalized
 * band, re-arm the timer for the earliest remaining expiry, and kick the
 * qdisc so dequeue runs again.
 */
static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
{
	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
						delay_timer);
	struct Qdisc *sch = q->watchdog.qdisc;
	psched_time_t now;
	psched_tdiff_t delay = 0;
	unsigned int pmask;

	now = psched_get_time();

	/* Take ownership of the pending set; bands with remaining
	 * penalties are added back below.
	 */
	pmask = q->pmask;
	q->pmask = 0;

	while (pmask) {
		int prio = ffz(~pmask);
		psched_tdiff_t tmp;

		pmask &= ~(1<<prio);

		tmp = cbq_undelay_prio(q, prio, now);
		if (tmp > 0) {
			q->pmask |= 1<<prio;
			if (tmp < delay || delay == 0)
				delay = tmp;
		}
	}

	if (delay) {
		ktime_t time;

		time = ktime_set(0, 0);
		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
	}

	qdisc_unthrottled(sch);
	__netif_schedule(qdisc_root(sch));
	return HRTIMER_NORESTART;
}
631
632
#ifdef CONFIG_NET_CLS_ACT
633
/* Called by a child qdisc when policing bounced a packet: try to
 * reclassify it into an ancestor's default class and re-enqueue there.
 * Returns 0 when the packet was consumed (enqueued or dropped with stats
 * charged), -1 when the caller should drop it.
 */
static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
{
	struct Qdisc *sch = child->__parent;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = q->rx_class;

	q->rx_class = NULL;

	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
		int ret;

		cbq_mark_toplevel(q, cl);

		q->rx_class = cl;
		cl->q->__parent = sch;

		ret = qdisc_enqueue(skb, cl->q);
		if (ret == NET_XMIT_SUCCESS) {
			sch->q.qlen++;
			if (!cl->next_alive)
				cbq_activate_class(cl);
			return 0;
		}
		if (net_xmit_drop_count(ret))
			sch->qstats.drops++;
		return 0;
	}

	sch->qstats.drops++;
	return -1;
}
664
#endif
665
666
/*
667
* It is mission critical procedure.
668
*
669
* We "regenerate" toplevel cutoff, if transmitting class
670
* has backlog and it is not regulated. It is not part of
671
* original CBQ description, but looks more reasonable.
672
* Probably, it is wrong. This question needs further investigation.
673
*/
674
675
/* "Regenerate" the toplevel cutoff: if the transmitting class still has
 * backlog and some class on the borrow chain is unregulated
 * (undertime == PSCHED_PASTPERFECT), lower the cutoff to that level.
 * Not part of the original CBQ description — see the comment above.
 */
static inline void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
		    struct cbq_class *borrowed)
{
	if (cl && q->toplevel >= borrowed->level) {
		if (cl->q->q.qlen > 1) {
			do {
				if (borrowed->undertime == PSCHED_PASTPERFECT) {
					q->toplevel = borrowed->level;
					return;
				}
			} while ((borrowed = borrowed->borrow) != NULL);
		}
#if 0
	/* It is not necessary now. Uncommenting it
	   will save CPU cycles, but decrease fairness.
	 */
		q->toplevel = TC_CBQ_MAXLEVEL;
#endif
	}
}
696
697
/* Core estimator: after a packet from q->tx_class finished, walk the
 * link-sharing chain (->share), update each class's EWMA of idle time
 * (avgidle) and compute undertime — the moment the class will be allowed
 * to send again if it is at/over limit.
 */
static void
cbq_update(struct cbq_sched_data *q)
{
	struct cbq_class *this = q->tx_class;
	struct cbq_class *cl = this;
	int len = q->tx_len;

	q->tx_class = NULL;

	for ( ; cl; cl = cl->share) {
		long avgidle = cl->avgidle;
		long idle;

		cl->bstats.packets++;
		cl->bstats.bytes += len;

		/*
		 * (now - last) is total time between packet right edges.
		 * (last_pktlen/rate) is "virtual" busy time, so that
		 *
		 * idle = (now - last) - last_pktlen/rate
		 */

		idle = q->now - cl->last;
		if ((unsigned long)idle > 128*1024*1024) {
			/* Idle for a very long time: saturate the estimate. */
			avgidle = cl->maxidle;
		} else {
			idle -= L2T(cl, len);

		/* true_avgidle := (1-W)*true_avgidle + W*idle,
		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
		 * cl->avgidle == true_avgidle/W,
		 * hence:
		 */
			avgidle += idle - (avgidle>>cl->ewma_log);
		}

		if (avgidle <= 0) {
			/* Overlimit or at-limit */

			if (avgidle < cl->minidle)
				avgidle = cl->minidle;

			cl->avgidle = avgidle;

			/* Calculate expected time, when this class
			 * will be allowed to send.
			 * It will occur, when:
			 * (1-W)*true_avgidle + W*delay = 0, i.e.
			 * idle = (1/W - 1)*(-true_avgidle)
			 * or
			 * idle = (1 - W)*(-cl->avgidle);
			 */
			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

			/*
			 * That is not all.
			 * To maintain the rate allocated to the class,
			 * we add to undertime virtual clock,
			 * necessary to complete transmitted packet.
			 * (len/phys_bandwidth has been already passed
			 * to the moment of cbq_update)
			 */

			idle -= L2T(&q->link, len);
			idle += L2T(cl, len);

			cl->undertime = q->now + idle;
		} else {
			/* Underlimit */

			cl->undertime = PSCHED_PASTPERFECT;
			if (avgidle > cl->maxidle)
				cl->avgidle = cl->maxidle;
			else
				cl->avgidle = avgidle;
		}
		cl->last = q->now;
	}

	cbq_update_toplevel(q, this, q->tx_borrowed);
}
779
780
/* Return the class that may actually transmit for @cl: @cl itself when
 * underlimit, an underlimit ancestor on the borrow chain, or NULL when
 * no bandwidth can be borrowed (the leaf's overlimit action is invoked
 * in that case).
 */
static inline struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this_cl = cl;

	/* The root class is never limited. */
	if (cl->tparent == NULL)
		return cl;

	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
		cl->delayed = 0;
		return cl;
	}

	do {
		/* It is very suspicious place. Now overlimit
		 * action is generated for not bounded classes
		 * only if link is completely congested.
		 * Though it is in agree with ancestor-only paradigm,
		 * it looks very stupid. Particularly,
		 * it means that this chunk of code will either
		 * never be called or result in strong amplification
		 * of burstiness. Dangerous, silly, and, however,
		 * no another solution exists.
		 */
		cl = cl->borrow;
		if (!cl) {
			this_cl->qstats.overlimits++;
			this_cl->overlimit(this_cl);
			return NULL;
		}
		if (cl->level > q->toplevel)
			return NULL;
	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);

	cl->delayed = 0;
	return cl;
}
818
819
/* Run one (or more) WRR rounds over the circular ring of backlogged
 * classes in @prio, returning the first dequeued packet.  Classes that
 * are empty, penalized, or cannot borrow are skipped (and possibly
 * unlinked); a round is replayed while some class merely exhausted its
 * quantum (deficit).
 */
static inline struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl_tail, *cl_prev, *cl;
	struct sk_buff *skb;
	int deficit;

	cl_tail = cl_prev = q->active[prio];
	cl = cl_prev->next_alive;

	do {
		deficit = 0;

		/* Start round */
		do {
			struct cbq_class *borrow = cl;

			if (cl->q->q.qlen &&
			    (borrow = cbq_under_limit(cl)) == NULL)
				goto skip_class;

			if (cl->deficit <= 0) {
				/* Class exhausted its allotment per
				 * this round. Switch to the next one.
				 */
				deficit = 1;
				cl->deficit += cl->quantum;
				goto next_class;
			}

			skb = cl->q->dequeue(cl->q);

			/* Class did not give us any skb :-(
			 * It could occur even if cl->q->q.qlen != 0
			 * f.e. if cl->q == "tbf"
			 */
			if (skb == NULL)
				goto skip_class;

			cl->deficit -= qdisc_pkt_len(skb);
			/* Remember who transmitted (and who lent bandwidth)
			 * for the next cbq_update() pass.
			 */
			q->tx_class = cl;
			q->tx_borrowed = borrow;
			if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
				borrow->xstats.borrows++;
				cl->xstats.borrows++;
#else
				borrow->xstats.borrows += qdisc_pkt_len(skb);
				cl->xstats.borrows += qdisc_pkt_len(skb);
#endif
			}
			q->tx_len = qdisc_pkt_len(skb);

			if (cl->deficit <= 0) {
				/* Quantum used up: rotate the ring so the
				 * next class starts the following call.
				 */
				q->active[prio] = cl;
				cl = cl->next_alive;
				cl->deficit += cl->quantum;
			}
			return skb;

skip_class:
			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
				/* Class is empty or penalized.
				 * Unlink it from active chain.
				 */
				cl_prev->next_alive = cl->next_alive;
				cl->next_alive = NULL;

				/* Did cl_tail point to it? */
				if (cl == cl_tail) {
					/* Repair it! */
					cl_tail = cl_prev;

					/* Was it the last class in this band? */
					if (cl == cl_tail) {
						/* Kill the band! */
						q->active[prio] = NULL;
						q->activemask &= ~(1<<prio);
						if (cl->q->q.qlen)
							cbq_activate_class(cl);
						return NULL;
					}

					q->active[prio] = cl_tail;
				}
				/* A penalized class with backlog re-enters on
				 * its (new) effective priority band.
				 */
				if (cl->q->q.qlen)
					cbq_activate_class(cl);

				cl = cl_prev;
			}

next_class:
			cl_prev = cl;
			cl = cl->next_alive;
		} while (cl_prev != cl_tail);
	} while (deficit);

	q->active[prio] = cl_prev;

	return NULL;
}
921
922
static inline struct sk_buff *
923
cbq_dequeue_1(struct Qdisc *sch)
924
{
925
struct cbq_sched_data *q = qdisc_priv(sch);
926
struct sk_buff *skb;
927
unsigned int activemask;
928
929
activemask = q->activemask & 0xFF;
930
while (activemask) {
931
int prio = ffz(~activemask);
932
activemask &= ~(1<<prio);
933
skb = cbq_dequeue_prio(sch, prio);
934
if (skb)
935
return skb;
936
}
937
return NULL;
938
}
939
940
/* Qdisc ->dequeue(): advance the integrated virtual clock, account the
 * previous transmission via cbq_update(), then pull a packet.  When
 * everything is overlimit, reset the toplevel cutoff once and retry; if
 * still nothing and backlog remains, arm the watchdog.
 */
static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct cbq_sched_data *q = qdisc_priv(sch);
	psched_time_t now;
	psched_tdiff_t incr;

	now = psched_get_time();
	incr = now - q->now_rt;

	if (q->tx_class) {
		psched_tdiff_t incr2;
		/* Time integrator. We calculate EOS time
		 * by adding expected packet transmission time.
		 * If real time is greater, we warp artificial clock,
		 * so that:
		 *
		 * cbq_time = max(real_time, work);
		 */
		incr2 = L2T(&q->link, q->tx_len);
		q->now += incr2;
		cbq_update(q);
		if ((incr -= incr2) < 0)
			incr = 0;
	}
	q->now += incr;
	q->now_rt = now;

	for (;;) {
		q->wd_expires = 0;

		skb = cbq_dequeue_1(sch);
		if (skb) {
			qdisc_bstats_update(sch, skb);
			sch->q.qlen--;
			qdisc_unthrottled(sch);
			return skb;
		}

		/* All the classes are overlimit.
		 *
		 * It is possible, if:
		 *
		 * 1. Scheduler is empty.
		 * 2. Toplevel cutoff inhibited borrowing.
		 * 3. Root class is overlimit.
		 *
		 * Reset 2d and 3d conditions and retry.
		 *
		 * Note, that NS and cbq-2.0 are buggy, peeking
		 * an arbitrary class is appropriate for ancestor-only
		 * sharing, but not for toplevel algorithm.
		 *
		 * Our version is better, but slower, because it requires
		 * two passes, but it is unavoidable with top-level sharing.
		 */

		if (q->toplevel == TC_CBQ_MAXLEVEL &&
		    q->link.undertime == PSCHED_PASTPERFECT)
			break;

		q->toplevel = TC_CBQ_MAXLEVEL;
		q->link.undertime = PSCHED_PASTPERFECT;
	}

	/* No packets in scheduler or nobody wants to give them to us :-(
	 * Sigh... start watchdog timer in the last case.
	 */

	if (sch->q.qlen) {
		sch->qstats.overlimits++;
		if (q->wd_expires)
			qdisc_watchdog_schedule(&q->watchdog,
						now + q->wd_expires);
	}
	return NULL;
}
1018
1019
/* CBQ class maintenance routines */
1020
1021
static void cbq_adjust_levels(struct cbq_class *this)
1022
{
1023
if (this == NULL)
1024
return;
1025
1026
do {
1027
int level = 0;
1028
struct cbq_class *cl;
1029
1030
cl = this->children;
1031
if (cl) {
1032
do {
1033
if (cl->level > level)
1034
level = cl->level;
1035
} while ((cl = cl->sibling) != this->children);
1036
}
1037
this->level = level + 1;
1038
} while ((this = this->tparent) != NULL);
1039
}
1040
1041
/* Recompute per-round WRR quanta for every class in band @prio from its
 * weight relative to the band's total weight, clamping obviously bad
 * values to a sane default based on the device MTU.
 */
static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
	struct cbq_class *cl;
	struct hlist_node *n;
	unsigned int h;

	if (q->quanta[prio] == 0)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
			/* BUGGGG... Beware! This expression suffer of
			 * arithmetic overflows!
			 */
			if (cl->priority == prio) {
				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
					q->quanta[prio];
			}
			/* Repair nonsensical quanta (zero, negative, or
			 * absurdly larger than the MTU).
			 */
			if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
				pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n",
					   cl->common.classid, cl->quantum);
				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
			}
		}
	}
}
1067
1068
/* Rebuild the per-TC_PRIO default-class table of @cl's split node after
 * @cl's defmap changed: drop stale entries pointing at @cl, then refill
 * empty slots with the lowest-level class claiming each priority.
 */
static void cbq_sync_defmap(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *split = cl->split;
	unsigned int h;
	int i;

	if (split == NULL)
		return;

	/* Remove defaults @cl no longer claims. */
	for (i = 0; i <= TC_PRIO_MAX; i++) {
		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
			split->defaults[i] = NULL;
	}

	for (i = 0; i <= TC_PRIO_MAX; i++) {
		int level = split->level;

		if (split->defaults[i])
			continue;

		/* Pick the lowest-level class under this split that
		 * claims priority i.
		 */
		for (h = 0; h < q->clhash.hashsize; h++) {
			struct hlist_node *n;
			struct cbq_class *c;

			hlist_for_each_entry(c, n, &q->clhash.hash[h],
					     common.hnode) {
				if (c->split == split && c->level < level &&
				    c->defmap & (1<<i)) {
					split->defaults[i] = c;
					level = c->level;
				}
			}
		}
	}
}
1104
1105
/* Change which priorities @cl is default for.  @splitid selects the
 * split node (0 keeps the current one); it must be an ancestor of @cl.
 * Bits of @def selected by @mask replace the corresponding defmap bits.
 */
static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
	struct cbq_class *split = NULL;

	if (splitid == 0) {
		split = cl->split;
		if (!split)
			return;
		splitid = split->common.classid;
	}

	/* Locate the requested split node among @cl's ancestors. */
	if (split == NULL || split->common.classid != splitid) {
		for (split = cl->tparent; split; split = split->tparent)
			if (split->common.classid == splitid)
				break;
	}

	if (split == NULL)
		return;

	if (cl->split != split) {
		/* Moving to another split node: first retract all old
		 * defaults, then attach with the new map.
		 */
		cl->defmap = 0;
		cbq_sync_defmap(cl);
		cl->split = split;
		cl->defmap = def & mask;
	} else
		cl->defmap = (cl->defmap & ~mask) | (def & mask);

	cbq_sync_defmap(cl);
}
1135
1136
/* Remove @this from the class hash and from its parent's circular
 * sibling ring, fixing the parent's children pointer when needed.
 */
static void cbq_unlink_class(struct cbq_class *this)
{
	struct cbq_class *cl, **clp;
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);

	qdisc_class_hash_remove(&q->clhash, &this->common);

	if (this->tparent) {
		/* Walk the ring to find the link pointing at @this and
		 * make it skip over us.
		 */
		clp = &this->sibling;
		cl = *clp;
		do {
			if (cl == this) {
				*clp = cl->sibling;
				break;
			}
			clp = &cl->sibling;
		} while ((cl = *clp) != this->sibling);

		if (this->tparent->children == this) {
			this->tparent->children = this->sibling;
			/* @this was the only child. */
			if (this->sibling == this)
				this->tparent->children = NULL;
		}
	} else {
		/* A root-level class must be a singleton ring. */
		WARN_ON(this->sibling != this);
	}
}
1163
1164
static void cbq_link_class(struct cbq_class *this)
1165
{
1166
struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1167
struct cbq_class *parent = this->tparent;
1168
1169
this->sibling = this;
1170
qdisc_class_hash_insert(&q->clhash, &this->common);
1171
1172
if (parent == NULL)
1173
return;
1174
1175
if (parent->children == NULL) {
1176
parent->children = this;
1177
} else {
1178
this->sibling = parent->children->sibling;
1179
parent->children->sibling = this;
1180
}
1181
}
1182
1183
/* Qdisc ->drop(): drop one packet from the lowest-importance backlogged
 * class (scanning bands from TC_CBQ_MAXPRIO down).  Returns the dropped
 * packet's length, or 0 if nothing could be dropped.
 */
static unsigned int cbq_drop(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl, *cl_head;
	int prio;
	unsigned int len;

	for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
		cl_head = q->active[prio];
		if (!cl_head)
			continue;

		cl = cl_head;
		do {
			if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
				sch->q.qlen--;
				/* Class went empty: take it off the ring. */
				if (!cl->q->q.qlen)
					cbq_deactivate_class(cl);
				return len;
			}
		} while ((cl = cl->next_alive) != cl_head);
	}
	return 0;
}
1207
1208
/* Qdisc ->reset(): cancel timers, clear the active/penalty bookkeeping,
 * reset the clocks, and reset every class (and its inner qdisc) to its
 * configured initial state.
 */
static void
cbq_reset(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	struct hlist_node *n;
	int prio;
	unsigned int h;

	q->activemask = 0;
	q->pmask = 0;
	q->tx_class = NULL;
	q->tx_borrowed = NULL;
	qdisc_watchdog_cancel(&q->watchdog);
	hrtimer_cancel(&q->delay_timer);
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();
	q->now_rt = q->now;

	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
		q->active[prio] = NULL;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
			qdisc_reset(cl->q);

			cl->next_alive = NULL;
			cl->undertime = PSCHED_PASTPERFECT;
			cl->avgidle = cl->maxidle;
			cl->deficit = cl->quantum;
			cl->cpriority = cl->priority;
		}
	}
	sch->q.qlen = 0;
}
1243
1244
1245
/*
 * Apply link-sharing (LSS) parameters from a TCA_CBQ_LSSOPT attribute.
 * Only fields flagged in lss->change are modified.  Always returns 0.
 */
static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
	if (lss->change & TCF_CBQ_LSS_FLAGS) {
		/* ISOLATED: don't lend bandwidth; BOUNDED: don't borrow. */
		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
	}
	if (lss->change & TCF_CBQ_LSS_EWMA)
		cl->ewma_log = lss->ewma_log;
	if (lss->change & TCF_CBQ_LSS_AVPKT)
		cl->avpkt = lss->avpkt;
	if (lss->change & TCF_CBQ_LSS_MINIDLE)
		/* minidle travels as a positive value but is stored negated. */
		cl->minidle = -(long)lss->minidle;
	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
		cl->maxidle = lss->maxidle;
		cl->avgidle = lss->maxidle;
	}
	if (lss->change & TCF_CBQ_LSS_OFFTIME)
		cl->offtime = lss->offtime;
	return 0;
}
static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1267
{
1268
q->nclasses[cl->priority]--;
1269
q->quanta[cl->priority] -= cl->weight;
1270
cbq_normalize_quanta(q, cl->priority);
1271
}
1272
1273
static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1274
{
1275
q->nclasses[cl->priority]++;
1276
q->quanta[cl->priority] += cl->weight;
1277
cbq_normalize_quanta(q, cl->priority);
1278
}
1279
1280
static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1281
{
1282
struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1283
1284
if (wrr->allot)
1285
cl->allot = wrr->allot;
1286
if (wrr->weight)
1287
cl->weight = wrr->weight;
1288
if (wrr->priority) {
1289
cl->priority = wrr->priority - 1;
1290
cl->cpriority = cl->priority;
1291
if (cl->priority >= cl->priority2)
1292
cl->priority2 = TC_CBQ_MAXPRIO - 1;
1293
}
1294
1295
cbq_addprio(q, cl);
1296
return 0;
1297
}
1298
1299
/*
 * Select the class's overlimit action from a TCA_CBQ_OVL_STRATEGY
 * attribute.  Returns 0 on success, -EINVAL for an unknown strategy or
 * an invalid priority2 for the LOWPRIO strategy.
 */
static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
{
	switch (ovl->strategy) {
	case TC_CBQ_OVL_CLASSIC:
		cl->overlimit = cbq_ovl_classic;
		break;
	case TC_CBQ_OVL_DELAY:
		cl->overlimit = cbq_ovl_delay;
		break;
	case TC_CBQ_OVL_LOWPRIO:
		/* priority2 is 1-based on the wire; the demoted priority
		 * must be below TC_CBQ_MAXPRIO and strictly lower (numerically
		 * higher) than the class's own priority.
		 */
		if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
		    ovl->priority2 - 1 <= cl->priority)
			return -EINVAL;
		cl->priority2 = ovl->priority2 - 1;
		cl->overlimit = cbq_ovl_lowprio;
		break;
	case TC_CBQ_OVL_DROP:
		cl->overlimit = cbq_ovl_drop;
		break;
	case TC_CBQ_OVL_RCLASSIC:
		cl->overlimit = cbq_ovl_rclassic;
		break;
	default:
		return -EINVAL;
	}
	cl->penalty = ovl->penalty;
	return 0;
}
#ifdef CONFIG_NET_CLS_ACT
/*
 * Apply the policing mode from a TCA_CBQ_POLICE attribute.  For
 * TC_POLICE_RECLASSIFY, install the reshape_fail hook on the leaf qdisc
 * so over-limit packets can be pushed back for reclassification.
 */
static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
{
	cl->police = p->police;

	if (cl->q->handle) {
		if (p->police == TC_POLICE_RECLASSIFY)
			cl->q->reshape_fail = cbq_reshape_fail;
		else
			cl->q->reshape_fail = NULL;
	}
	return 0;
}
#endif
/* Apply TCA_CBQ_FOPT: repoint the class's defmap at a new split class.
 * Always returns 0.
 */
static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
	return 0;
}
/* Netlink attribute validation policy for CBQ options (minimum lengths
 * enforced by nla_parse_nested()).
 */
static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
};
static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1360
{
1361
struct cbq_sched_data *q = qdisc_priv(sch);
1362
struct nlattr *tb[TCA_CBQ_MAX + 1];
1363
struct tc_ratespec *r;
1364
int err;
1365
1366
err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1367
if (err < 0)
1368
return err;
1369
1370
if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1371
return -EINVAL;
1372
1373
r = nla_data(tb[TCA_CBQ_RATE]);
1374
1375
if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1376
return -EINVAL;
1377
1378
err = qdisc_class_hash_init(&q->clhash);
1379
if (err < 0)
1380
goto put_rtab;
1381
1382
q->link.refcnt = 1;
1383
q->link.sibling = &q->link;
1384
q->link.common.classid = sch->handle;
1385
q->link.qdisc = sch;
1386
q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1387
sch->handle);
1388
if (!q->link.q)
1389
q->link.q = &noop_qdisc;
1390
1391
q->link.priority = TC_CBQ_MAXPRIO - 1;
1392
q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1393
q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1394
q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1395
q->link.overlimit = cbq_ovl_classic;
1396
q->link.allot = psched_mtu(qdisc_dev(sch));
1397
q->link.quantum = q->link.allot;
1398
q->link.weight = q->link.R_tab->rate.rate;
1399
1400
q->link.ewma_log = TC_CBQ_DEF_EWMA;
1401
q->link.avpkt = q->link.allot/2;
1402
q->link.minidle = -0x7FFFFFFF;
1403
1404
qdisc_watchdog_init(&q->watchdog, sch);
1405
hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1406
q->delay_timer.function = cbq_undelay;
1407
q->toplevel = TC_CBQ_MAXLEVEL;
1408
q->now = psched_get_time();
1409
q->now_rt = q->now;
1410
1411
cbq_link_class(&q->link);
1412
1413
if (tb[TCA_CBQ_LSSOPT])
1414
cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1415
1416
cbq_addprio(q, &q->link);
1417
return 0;
1418
1419
put_rtab:
1420
qdisc_put_rtab(q->link.R_tab);
1421
return err;
1422
}
1423
1424
/* Dump the class's rate spec into skb.
 * Returns skb->len on success, -1 if the skb ran out of room.
 */
static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);

	NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
/* Dump link-sharing parameters (inverse of cbq_set_lss()).
 * Returns skb->len on success, -1 if the skb ran out of room.
 */
static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_lssopt opt;

	opt.flags = 0;
	if (cl->borrow == NULL)
		opt.flags |= TCF_CBQ_LSS_BOUNDED;
	if (cl->share == NULL)
		opt.flags |= TCF_CBQ_LSS_ISOLATED;
	opt.ewma_log = cl->ewma_log;
	opt.level = cl->level;
	opt.avpkt = cl->avpkt;
	opt.maxidle = cl->maxidle;
	/* minidle is stored negated; un-negate for the wire format. */
	opt.minidle = (u32)(-cl->minidle);
	opt.offtime = cl->offtime;
	opt.change = ~0;
	NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1462
{
1463
unsigned char *b = skb_tail_pointer(skb);
1464
struct tc_cbq_wrropt opt;
1465
1466
opt.flags = 0;
1467
opt.allot = cl->allot;
1468
opt.priority = cl->priority + 1;
1469
opt.cpriority = cl->cpriority + 1;
1470
opt.weight = cl->weight;
1471
NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1472
return skb->len;
1473
1474
nla_put_failure:
1475
nlmsg_trim(skb, b);
1476
return -1;
1477
}
1478
1479
/* Dump the overlimit strategy (inverse of cbq_set_overlimit()).
 * Returns skb->len on success, -1 if the skb ran out of room.
 */
static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_ovl opt;

	opt.strategy = cl->ovl_strategy;
	/* Internal priority2 is 0-based; wire format is 1-based. */
	opt.priority2 = cl->priority2 + 1;
	opt.pad = 0;
	opt.penalty = cl->penalty;
	NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
/* Dump filter/defmap options; emitted only when a split or defmap is set.
 * Returns skb->len on success, -1 if the skb ran out of room.
 */
static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_fopt opt;

	if (cl->split || cl->defmap) {
		opt.split = cl->split ? cl->split->common.classid : 0;
		opt.defmap = cl->defmap;
		opt.defchange = ~0;
		NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
#ifdef CONFIG_NET_CLS_ACT
/* Dump the policing mode; emitted only when policing is configured.
 * Returns skb->len on success, -1 if the skb ran out of room.
 */
static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_police opt;

	if (cl->police) {
		opt.police = cl->police;
		opt.__res1 = 0;
		opt.__res2 = 0;
		NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
#endif
/* Dump all per-class attributes; returns -1 as soon as one dump helper
 * fails (skb out of room), 0 on success.
 */
static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
	if (cbq_dump_lss(skb, cl) < 0 ||
	    cbq_dump_rate(skb, cl) < 0 ||
	    cbq_dump_wrr(skb, cl) < 0 ||
	    cbq_dump_ovl(skb, cl) < 0 ||
#ifdef CONFIG_NET_CLS_ACT
	    cbq_dump_police(skb, cl) < 0 ||
#endif
	    cbq_dump_fopt(skb, cl) < 0)
		return -1;
	return 0;
}
/* Qdisc ->dump handler: emit the root (link) class's attributes inside
 * a nested TCA_OPTIONS attribute.
 */
static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, &q->link) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
/* Qdisc ->dump_stats handler: export the root class's CBQ-specific
 * extended statistics.
 */
static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	/* Refresh the snapshot before copying it out. */
	q->link.xstats.avgidle = q->link.avgidle;
	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}
/* Class ->dump handler: fill in the tcmsg header fields for one class
 * and emit its attributes inside a nested TCA_OPTIONS attribute.
 */
static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
	       struct sk_buff *skb, struct tcmsg *tcm)
{
	struct cbq_class *cl = (struct cbq_class *)arg;
	struct nlattr *nest;

	if (cl->tparent)
		tcm->tcm_parent = cl->tparent->common.classid;
	else
		tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle = cl->common.classid;
	tcm->tcm_info = cl->q->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, cl) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
/* Class ->dump_stats handler: export basic, rate-estimator, queue and
 * CBQ-specific statistics for one class.
 */
static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
		     struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	cl->qstats.qlen = cl->q->q.qlen;
	cl->xstats.avgidle = cl->avgidle;
	cl->xstats.undertime = 0;

	/* undertime is reported relative to the current scheduler clock. */
	if (cl->undertime != PSCHED_PASTPERFECT)
		cl->xstats.undertime = cl->undertime - q->now;

	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}
/* Class ->graft handler: attach a new leaf qdisc to a class, returning
 * the previous one in *old.  A NULL new qdisc is replaced by a fresh
 * default pfifo.
 */
static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (new == NULL) {
		new = qdisc_create_dflt(sch->dev_queue,
					&pfifo_qdisc_ops, cl->common.classid);
		if (new == NULL)
			return -ENOBUFS;
	} else {
#ifdef CONFIG_NET_CLS_ACT
		/* Preserve the reclassify hook on the replacement leaf. */
		if (cl->police == TC_POLICE_RECLASSIFY)
			new->reshape_fail = cbq_reshape_fail;
#endif
	}
	sch_tree_lock(sch);
	*old = cl->q;
	cl->q = new;
	/* Propagate the queue-length drop up the tree before resetting. */
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}
static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1651
{
1652
struct cbq_class *cl = (struct cbq_class *)arg;
1653
1654
return cl->q;
1655
}
1656
1657
static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1658
{
1659
struct cbq_class *cl = (struct cbq_class *)arg;
1660
1661
if (cl->q->q.qlen == 0)
1662
cbq_deactivate_class(cl);
1663
}
1664
1665
/* Class ->get handler: look up a class by classid and take a reference.
 * Returns the class pointer as an opaque handle, or 0 if not found.
 */
static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl == NULL)
		return 0;

	cl->refcnt++;
	return (unsigned long)cl;
}
/* Free one class: its filter chain, leaf qdisc, rate table and rate
 * estimator.  The embedded root (link) class is not kfree'd since it
 * lives inside the qdisc's private area.
 */
static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	/* All filters must have been unbound by now. */
	WARN_ON(cl->filters);

	tcf_destroy_chain(&cl->filter_list);
	qdisc_destroy(cl->q);
	qdisc_put_rtab(cl->R_tab);
	gen_kill_estimator(&cl->bstats, &cl->rate_est);
	if (cl != &q->link)
		kfree(cl);
}
/* Qdisc ->destroy handler: tear down every class and the class hash. */
static void cbq_destroy(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct hlist_node *n, *next;
	struct cbq_class *cl;
	unsigned int h;

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = NULL;
#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leafs which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
			tcf_destroy_chain(&cl->filter_list);
	}
	/* _safe variant: cbq_destroy_class() unlinks and frees entries. */
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
					  common.hnode)
			cbq_destroy_class(sch, cl);
	}
	qdisc_class_hash_destroy(&q->clhash);
}
/* Class ->put handler: drop a reference; destroy the class when the
 * last reference goes away.
 */
static void cbq_put(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (--cl->refcnt == 0) {
#ifdef CONFIG_NET_CLS_ACT
		spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
		struct cbq_sched_data *q = qdisc_priv(sch);

		/* Clear a possibly dangling rx_class under the root lock. */
		spin_lock_bh(root_lock);
		if (q->rx_class == cl)
			q->rx_class = NULL;
		spin_unlock_bh(root_lock);
#endif

		cbq_destroy_class(sch, cl);
	}
}
/*
 * Class ->change handler: modify an existing class (*arg != 0) or create
 * a new one.  On creation, returns the new class handle through *arg.
 * Configuration changes to live classes are applied under the tree lock.
 */
static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
		 unsigned long *arg)
{
	int err;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)*arg;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct cbq_class *parent;
	struct qdisc_rate_table *rtab = NULL;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
	if (err < 0)
		return err;

	if (cl) {
		/* Check parent: reparenting an existing class is not allowed. */
		if (parentid) {
			if (cl->tparent &&
			    cl->tparent->common.classid != parentid)
				return -EINVAL;
			if (!cl->tparent && parentid != TC_H_ROOT)
				return -EINVAL;
		}

		if (tb[TCA_CBQ_RATE]) {
			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
					      tb[TCA_CBQ_RTAB]);
			if (rtab == NULL)
				return -EINVAL;
		}

		if (tca[TCA_RATE]) {
			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
						    qdisc_root_sleeping_lock(sch),
						    tca[TCA_RATE]);
			if (err) {
				if (rtab)
					qdisc_put_rtab(rtab);
				return err;
			}
		}

		/* Change class parameters */
		sch_tree_lock(sch);

		if (cl->next_alive != NULL)
			cbq_deactivate_class(cl);

		if (rtab) {
			qdisc_put_rtab(cl->R_tab);
			cl->R_tab = rtab;
		}

		if (tb[TCA_CBQ_LSSOPT])
			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));

		if (tb[TCA_CBQ_WRROPT]) {
			/* Re-account priority band: remove, change, re-add. */
			cbq_rmprio(q, cl);
			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
		}

		if (tb[TCA_CBQ_OVL_STRATEGY])
			cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));

#ifdef CONFIG_NET_CLS_ACT
		if (tb[TCA_CBQ_POLICE])
			cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
#endif

		if (tb[TCA_CBQ_FOPT])
			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));

		if (cl->q->q.qlen)
			cbq_activate_class(cl);

		sch_tree_unlock(sch);

		return 0;
	}

	/* Creation path: a new class cannot itself be the root. */
	if (parentid == TC_H_ROOT)
		return -EINVAL;

	if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
	    tb[TCA_CBQ_LSSOPT] == NULL)
		return -EINVAL;

	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
	if (rtab == NULL)
		return -EINVAL;

	if (classid) {
		err = -EINVAL;
		/* Must belong to this qdisc's major handle and be unused. */
		if (TC_H_MAJ(classid ^ sch->handle) ||
		    cbq_class_lookup(q, classid))
			goto failure;
	} else {
		/* No classid given: auto-generate one in the 0x8000+ range. */
		int i;
		classid = TC_H_MAKE(sch->handle, 0x8000);

		for (i = 0; i < 0x8000; i++) {
			if (++q->hgenerator >= 0x8000)
				q->hgenerator = 1;
			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
				break;
		}
		err = -ENOSR;
		if (i >= 0x8000)
			goto failure;
		classid = classid|q->hgenerator;
	}

	parent = &q->link;
	if (parentid) {
		parent = cbq_class_lookup(q, parentid);
		err = -EINVAL;
		if (parent == NULL)
			goto failure;
	}

	err = -ENOBUFS;
	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
	if (cl == NULL)
		goto failure;

	if (tca[TCA_RATE]) {
		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
					qdisc_root_sleeping_lock(sch),
					tca[TCA_RATE]);
		if (err) {
			kfree(cl);
			goto failure;
		}
	}

	cl->R_tab = rtab;
	rtab = NULL;	/* ownership transferred to the class */
	cl->refcnt = 1;
	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
	if (!cl->q)
		cl->q = &noop_qdisc;
	cl->common.classid = classid;
	cl->tparent = parent;
	cl->qdisc = sch;
	cl->allot = parent->allot;
	cl->quantum = cl->allot;
	cl->weight = cl->R_tab->rate.rate;

	sch_tree_lock(sch);
	cbq_link_class(cl);
	cl->borrow = cl->tparent;
	if (cl->tparent != &q->link)
		cl->share = cl->tparent;
	cbq_adjust_levels(parent);
	cl->minidle = -0x7FFFFFFF;
	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
	/* Inherit unset parameters from the root (link) class. */
	if (cl->ewma_log == 0)
		cl->ewma_log = q->link.ewma_log;
	if (cl->maxidle == 0)
		cl->maxidle = q->link.maxidle;
	if (cl->avpkt == 0)
		cl->avpkt = q->link.avpkt;
	cl->overlimit = cbq_ovl_classic;
	if (tb[TCA_CBQ_OVL_STRATEGY])
		cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
#ifdef CONFIG_NET_CLS_ACT
	if (tb[TCA_CBQ_POLICE])
		cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
#endif
	if (tb[TCA_CBQ_FOPT])
		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, &q->clhash);

	*arg = (unsigned long)cl;
	return 0;

failure:
	qdisc_put_rtab(rtab);
	return err;
}
/*
 * Class ->delete handler: detach a class from the hierarchy.  Refuses
 * while filters or children still reference it, and never removes the
 * root (link) class.  The actual free happens via cbq_put().
 */
static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;
	unsigned int qlen;

	if (cl->filters || cl->children || cl == &q->link)
		return -EBUSY;

	sch_tree_lock(sch);

	qlen = cl->q->q.qlen;
	qdisc_reset(cl->q);
	qdisc_tree_decrease_qlen(cl->q, qlen);

	if (cl->next_alive)
		cbq_deactivate_class(cl);

	/* Forget any in-flight transmit state pointing at this class. */
	if (q->tx_borrowed == cl)
		q->tx_borrowed = q->tx_class;
	if (q->tx_class == cl) {
		q->tx_class = NULL;
		q->tx_borrowed = NULL;
	}
#ifdef CONFIG_NET_CLS_ACT
	if (q->rx_class == cl)
		q->rx_class = NULL;
#endif

	cbq_unlink_class(cl);
	cbq_adjust_levels(cl->tparent);
	cl->defmap = 0;
	cbq_sync_defmap(cl);

	cbq_rmprio(q, cl);
	sch_tree_unlock(sch);

	BUG_ON(--cl->refcnt == 0);
	/*
	 * This shouldn't happen: we "hold" one cops->get() when called
	 * from tc_ctl_tclass; the destroy method is done from cops->put().
	 */

	return 0;
}
static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1973
{
1974
struct cbq_sched_data *q = qdisc_priv(sch);
1975
struct cbq_class *cl = (struct cbq_class *)arg;
1976
1977
if (cl == NULL)
1978
cl = &q->link;
1979
1980
return &cl->filter_list;
1981
}
1982
1983
static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1984
u32 classid)
1985
{
1986
struct cbq_sched_data *q = qdisc_priv(sch);
1987
struct cbq_class *p = (struct cbq_class *)parent;
1988
struct cbq_class *cl = cbq_class_lookup(q, classid);
1989
1990
if (cl) {
1991
if (p && p->level <= cl->level)
1992
return 0;
1993
cl->filters++;
1994
return (unsigned long)cl;
1995
}
1996
return 0;
1997
}
1998
1999
static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2000
{
2001
struct cbq_class *cl = (struct cbq_class *)arg;
2002
2003
cl->filters--;
2004
}
2005
2006
/* Class ->walk handler: invoke arg->fn for every class, honouring the
 * walker's skip/count/stop protocol.
 */
static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	struct hlist_node *n;
	unsigned int h;

	if (arg->stop)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
			/* Skip the first arg->skip classes. */
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}
/* Per-class operations exported to the qdisc core. */
static const struct Qdisc_class_ops cbq_class_ops = {
	.graft		= cbq_graft,
	.leaf		= cbq_leaf,
	.qlen_notify	= cbq_qlen_notify,
	.get		= cbq_get,
	.put		= cbq_put,
	.change		= cbq_change_class,
	.delete		= cbq_delete,
	.walk		= cbq_walk,
	.tcf_chain	= cbq_find_tcf,
	.bind_tcf	= cbq_bind_filter,
	.unbind_tcf	= cbq_unbind_filter,
	.dump		= cbq_dump_class,
	.dump_stats	= cbq_dump_class_stats,
};
/* Qdisc operations table registered with the packet scheduler core. */
static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
	.next		= NULL,
	.cl_ops		= &cbq_class_ops,
	.id		= "cbq",
	.priv_size	= sizeof(struct cbq_sched_data),
	.enqueue	= cbq_enqueue,
	.dequeue	= cbq_dequeue,
	.peek		= qdisc_peek_dequeued,
	.drop		= cbq_drop,
	.init		= cbq_init,
	.reset		= cbq_reset,
	.destroy	= cbq_destroy,
	.change		= NULL,
	.dump		= cbq_dump,
	.dump_stats	= cbq_dump_stats,
	.owner		= THIS_MODULE,
};
/* Module entry point: register the "cbq" qdisc type. */
static int __init cbq_module_init(void)
{
	return register_qdisc(&cbq_qdisc_ops);
}
/* Module exit point: unregister the "cbq" qdisc type. */
static void __exit cbq_module_exit(void)
{
	unregister_qdisc(&cbq_qdisc_ops);
}
module_init(cbq_module_init)
module_exit(cbq_module_exit)
MODULE_LICENSE("GPL");