GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/sched/sch_api.c

/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <[email protected]>
 *
 * Fixes:
 *
 * Rani Assaf <[email protected]> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <[email protected]> :990222: kmod support
 * Jamal Hadi Salim <[email protected]>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make some sanity
   checks and do the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but this does not mean that the queue is empty, it just means that
   the discipline does not want to send anything at this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a nonzero error code.
   NET_XMIT_DROP    - this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN      - probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore
   NET_XMIT_POLICED - dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
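
/* Illustrative sketch (not part of the original file): a minimal
 * FIFO-style enqueue/dequeue pair honouring the contract described
 * above.  The "example_" names and the use of the device tx_queue_len
 * as a limit are hypothetical; real implementations live in sch_fifo.c
 * and friends.
 */
#if 0	/* example only, never built */
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (likely(skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len))
		return qdisc_enqueue_tail(skb, sch);	/* 0 == NET_XMIT_SUCCESS */

	return qdisc_reshape_fail(skb, sch);		/* drops: NET_XMIT_DROP */
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	/* Returning NULL here only means "nothing to send right now";
	 * real emptiness is signalled by sch->q.qlen == 0, as explained
	 * in the review above.
	 */
	return qdisc_dequeue_head(sch);
}
#endif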

/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->get && cops->put && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);
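
/* Illustrative sketch (not part of the original file): how a scheduler
 * module typically uses this API from its init/exit hooks.  The
 * "example_qdisc_ops" table (wiring up the enqueue/dequeue sketch above)
 * is hypothetical; see sch_prio.c or sch_tbf.c for real ops tables.
 */
#if 0	/* example only, never built */
static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.peek		= qdisc_peek_head,
	.owner		= THIS_MODULE,
};

static int __init example_module_init(void)
{
	/* fails with -EEXIST if another qdisc already claimed the id */
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}
module_init(example_module_init);
module_exit(example_module_exit);
#endif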

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* We know handle. Find qdisc among all qdisc's attached to device
 * (root qdisc, all its children, children of children etc.)
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}
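
/* Worked example (added note, not part of the original file): a handle
 * is a 32-bit value whose major half names a qdisc and whose minor half
 * names a class inside it.  "tc qdisc add ... handle 1:" creates handle
 * 0x00010000, i.e. TC_H_MAKE(0x10000, 0); class "1:2" is 0x00010002, so
 * TC_H_MAJ(0x00010002) == 0x00010000 identifies the owning qdisc that
 * qdisc_lookup() resolves here.
 */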

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
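
/* Illustrative sketch (not part of the original file): how a shaping
 * qdisc typically obtains and releases a rate table.  The helper
 * signature is simplified and the names are hypothetical; sch_tbf.c
 * shows the real pattern (the rate spec and the TC_RTAB_SIZE table
 * arrive as separate netlink attributes).
 */
#if 0	/* example only, never built */
static int example_set_rate(struct example_sched_data *q,
			    struct nlattr *rate_attr,	/* struct tc_ratespec */
			    struct nlattr *rtab_attr)	/* TC_RTAB_SIZE table */
{
	struct tc_ratespec *r = nla_data(rate_attr);
	struct qdisc_rate_table *rtab;

	rtab = qdisc_get_rtab(r, rtab_attr);	/* shared and refcounted */
	if (rtab == NULL)
		return -EINVAL;

	qdisc_put_rtab(q->rtab);		/* NULL-safe; drops the old ref */
	q->rtab = rtab;
	return 0;
}
#endif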

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
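
/* Worked example (added note, not part of the original file): with
 * overhead = 24, cell_align = -1, cell_log = 6 and size_log = 6 (an
 * ATM-style size table), a 1000-byte skb gives
 * slot = (1000 + 24 - 1) >> 6 = 15; if data[15] == 16 the result is
 * pkt_len = 16 << 6 = 1024 bytes, i.e. the length rounded up to whole
 * 64-byte cells before the scheduler accounts for it.
 */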

void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	qdisc_unthrottled(wd->qdisc);
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	qdisc_throttled(wd->qdisc);
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
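
/* Illustrative sketch (not part of the original file): the usual
 * watchdog pattern in a shaping qdisc's ->dequeue().  The
 * "example_next_send_time" and "watchdog" fields of the hypothetical
 * example_sched_data are assumptions; sch_tbf.c uses the same idiom.
 */
#if 0	/* example only, never built */
static struct sk_buff *example_shaper_dequeue(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);
	psched_time_t now = psched_get_time();

	if (now >= q->example_next_send_time)
		return qdisc_dequeue_head(sch);

	/* Nothing may be sent yet: arm the watchdog, whose hrtimer
	 * callback will unthrottle and reschedule this qdisc. */
	qdisc_watchdog_schedule(&q->watchdog, q->example_next_send_time);
	return NULL;
}
#endif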

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize = size;
	clhash->hashmask = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
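
/* Illustrative sketch (not part of the original file): the lifecycle of
 * the class hash in a classful qdisc.  "struct example_class" embedding
 * Qdisc_class_common and the helper below are hypothetical; sch_htb.c
 * follows this pattern for real.
 */
#if 0	/* example only, never built */
struct example_class {
	struct Qdisc_class_common common;	/* classid + hash node */
	/* per-class scheduling state would follow */
};

static int example_add_class(struct Qdisc *sch,
			     struct Qdisc_class_hash *clhash,
			     struct example_class *cl, u32 classid)
{
	cl->common.classid = classid;

	sch_tree_lock(sch);
	qdisc_class_hash_insert(clhash, &cl->common);
	sch_tree_unlock(sch);

	/* doubles the table (and rehashes) once load factor passes 0.75 */
	qdisc_class_hash_grow(sch, clhash);
	return 0;
}
#endif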

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
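
/* Worked example (added note, not part of the original file): if a leaf
 * qdisc under class 1:10 drops 3 packets internally (say, a "tc change"
 * shrank its limit), it calls qdisc_tree_decrease_qlen(sch, 3); the loop
 * then walks up through each classful ancestor, decrementing its cached
 * q.qlen by 3 and invoking qlen_notify so a class whose leaf became
 * empty can be deactivated.
 */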

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);

			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];

		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			rcu_assign_pointer(sch->stab, stab);
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because the change cannot be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know that some child q is already
				 * attached to this parent and have a choice:
				 * either to change it or to create/graft a
				 * new one.
				 *
				 * 1. We are allowed to create/graft only
				 * if both the CREATE and REPLACE flags are set.
				 *
				 * 2. If EXCL is set, the requestor meant that
				 * qdisc tcm_handle is not expected to exist,
				 * so we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * Alas, it is a sort of hole in the API: we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft if the
				 * user gave a KIND which does not match the
				 * existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/


static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */
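
	/* Worked example (added note, not part of the original file):
	 * "tc class add dev eth0 parent 1:2 classid :3 ..." arrives with
	 * parent == 1:2 and handle == 0:3; Step 1 below takes qid from
	 * the parent's major (1:0), and the class being created then
	 * resolves to 1:3.
	 */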

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent with
		 * both parent and child.
		 *
		 * TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans classifier chain attached
 * to this qdisc, (optionally) tests for protocol and asks
 * specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err;

	for (; tp; tp = tp->next) {
		if (tp->protocol != protocol &&
		    tp->protocol != htons(ETH_P_ALL))
			continue;
		err = tp->classify(skb, tp, res);

		if (err >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			if (net_ratelimit())
				pr_notice("%s: packet reclassify loop"
					  " rule prio %u protocol %02x\n",
					  tp->q->ops->id,
					  tp->prio & 0xffff,
					  ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
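
/* Illustrative sketch (not part of the original file): how a classful
 * qdisc's ->enqueue() typically drives tc_classify() to pick a class.
 * The example_find_class() helper and the overall shape are hypothetical;
 * sch_htb.c and sch_prio.c show real versions.
 */
#if 0	/* example only, never built */
static struct example_class *example_classify(struct sk_buff *skb,
					      struct Qdisc *sch,
					      struct tcf_proto *filter_list)
{
	struct tcf_result res;
	int result = tc_classify(skb, filter_list, &res);

	if (result < 0)
		return NULL;			/* no filter matched */
#ifdef CONFIG_NET_CLS_ACT
	switch (result) {
	case TC_ACT_SHOT:
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		return NULL;			/* packet consumed by an action */
	}
#endif
	return example_find_class(sch, res.classid);
}
#endif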

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / (u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}
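
/* Added note (not part of the original file): the four hex fields of
 * /proc/net/psched are, roughly: nanoseconds per microsecond, nanoseconds
 * per psched tick, the advertised clock resolution (the 1000000 compat
 * constant), and the hrtimer resolution expressed in Hz.  Userspace tc
 * parses them to convert between its time units and kernel ticks.
 */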

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_net_fops_create(net, "psched", 0, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	proc_net_remove(net, "psched");
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);