Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/net/core/dev.c
49039 views
1
// SPDX-License-Identifier: GPL-2.0-or-later
2
/*
3
* NET3 Protocol independent device support routines.
4
*
5
* Derived from the non IP parts of dev.c 1.0.19
6
* Authors: Ross Biro
7
* Fred N. van Kempen, <[email protected]>
8
* Mark Evans, <[email protected]>
9
*
10
* Additional Authors:
11
* Florian la Roche <[email protected]>
12
* Alan Cox <[email protected]>
13
* David Hinds <[email protected]>
14
* Alexey Kuznetsov <[email protected]>
15
* Adam Sulmicki <[email protected]>
16
* Pekka Riikonen <[email protected]>
17
*
18
* Changes:
19
* D.J. Barrow : Fixed bug where dev->refcnt gets set
20
* to 2 if register_netdev gets called
21
* before net_dev_init & also removed a
22
* few lines of code in the process.
23
* Alan Cox : device private ioctl copies fields back.
24
* Alan Cox : Transmit queue code does relevant
25
* stunts to keep the queue safe.
26
* Alan Cox : Fixed double lock.
27
* Alan Cox : Fixed promisc NULL pointer trap
28
* ???????? : Support the full private ioctl range
29
* Alan Cox : Moved ioctl permission check into
30
* drivers
31
* Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
32
* Alan Cox : 100 backlog just doesn't cut it when
33
* you start doing multicast video 8)
34
* Alan Cox : Rewrote net_bh and list manager.
35
* Alan Cox : Fix ETH_P_ALL echoback lengths.
36
* Alan Cox : Took out transmit every packet pass
37
* Saved a few bytes in the ioctl handler
38
* Alan Cox : Network driver sets packet type before
39
* calling netif_rx. Saves a function
40
* call a packet.
41
* Alan Cox : Hashed net_bh()
42
* Richard Kooijman: Timestamp fixes.
43
* Alan Cox : Wrong field in SIOCGIFDSTADDR
44
* Alan Cox : Device lock protection.
45
* Alan Cox : Fixed nasty side effect of device close
46
* changes.
47
* Rudi Cilibrasi : Pass the right thing to
48
* set_mac_address()
49
* Dave Miller : 32bit quantity for the device lock to
50
* make it work out on a Sparc.
51
* Bjorn Ekwall : Added KERNELD hack.
52
* Alan Cox : Cleaned up the backlog initialise.
53
* Craig Metz : SIOCGIFCONF fix if space for under
54
* 1 device.
55
* Thomas Bogendoerfer : Return ENODEV for dev_open, if there
56
* is no device open function.
57
* Andi Kleen : Fix error reporting for SIOCGIFCONF
58
* Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
59
* Cyrus Durgin : Cleaned for KMOD
60
* Adam Sulmicki : Bug Fix : Network Device Unload
61
* A network device unload needs to purge
62
* the backlog queue.
63
* Paul Rusty Russell : SIOCSIFNAME
64
* Pekka Riikonen : Netdev boot-time settings code
65
* Andrew Morton : Make unregister_netdevice wait
66
* indefinitely on dev->refcnt
67
* J Hadi Salim : - Backlog queue sampling
68
* - netif_rx() feedback
69
*/
70
71
#include <linux/uaccess.h>
72
#include <linux/bitmap.h>
73
#include <linux/capability.h>
74
#include <linux/cpu.h>
75
#include <linux/types.h>
76
#include <linux/kernel.h>
77
#include <linux/hash.h>
78
#include <linux/slab.h>
79
#include <linux/sched.h>
80
#include <linux/sched/isolation.h>
81
#include <linux/sched/mm.h>
82
#include <linux/smpboot.h>
83
#include <linux/mutex.h>
84
#include <linux/rwsem.h>
85
#include <linux/string.h>
86
#include <linux/mm.h>
87
#include <linux/socket.h>
88
#include <linux/sockios.h>
89
#include <linux/errno.h>
90
#include <linux/interrupt.h>
91
#include <linux/if_ether.h>
92
#include <linux/netdevice.h>
93
#include <linux/etherdevice.h>
94
#include <linux/ethtool.h>
95
#include <linux/ethtool_netlink.h>
96
#include <linux/skbuff.h>
97
#include <linux/kthread.h>
98
#include <linux/bpf.h>
99
#include <linux/bpf_trace.h>
100
#include <net/net_namespace.h>
101
#include <net/sock.h>
102
#include <net/busy_poll.h>
103
#include <linux/rtnetlink.h>
104
#include <linux/stat.h>
105
#include <net/dsa.h>
106
#include <net/dst.h>
107
#include <net/dst_metadata.h>
108
#include <net/gro.h>
109
#include <net/netdev_queues.h>
110
#include <net/pkt_sched.h>
111
#include <net/pkt_cls.h>
112
#include <net/checksum.h>
113
#include <net/xfrm.h>
114
#include <net/tcx.h>
115
#include <linux/highmem.h>
116
#include <linux/init.h>
117
#include <linux/module.h>
118
#include <linux/netpoll.h>
119
#include <linux/rcupdate.h>
120
#include <linux/delay.h>
121
#include <net/iw_handler.h>
122
#include <asm/current.h>
123
#include <linux/audit.h>
124
#include <linux/dmaengine.h>
125
#include <linux/err.h>
126
#include <linux/ctype.h>
127
#include <linux/if_arp.h>
128
#include <linux/if_vlan.h>
129
#include <linux/ip.h>
130
#include <net/ip.h>
131
#include <net/mpls.h>
132
#include <linux/ipv6.h>
133
#include <linux/in.h>
134
#include <linux/jhash.h>
135
#include <linux/random.h>
136
#include <trace/events/napi.h>
137
#include <trace/events/net.h>
138
#include <trace/events/skb.h>
139
#include <trace/events/qdisc.h>
140
#include <trace/events/xdp.h>
141
#include <linux/inetdevice.h>
142
#include <linux/cpu_rmap.h>
143
#include <linux/static_key.h>
144
#include <linux/hashtable.h>
145
#include <linux/vmalloc.h>
146
#include <linux/if_macvlan.h>
147
#include <linux/errqueue.h>
148
#include <linux/hrtimer.h>
149
#include <linux/netfilter_netdev.h>
150
#include <linux/crash_dump.h>
151
#include <linux/sctp.h>
152
#include <net/udp_tunnel.h>
153
#include <linux/net_namespace.h>
154
#include <linux/indirect_call_wrapper.h>
155
#include <net/devlink.h>
156
#include <linux/pm_runtime.h>
157
#include <linux/prandom.h>
158
#include <linux/once_lite.h>
159
#include <net/netdev_lock.h>
160
#include <net/netdev_rx_queue.h>
161
#include <net/page_pool/types.h>
162
#include <net/page_pool/helpers.h>
163
#include <net/page_pool/memory_provider.h>
164
#include <net/rps.h>
165
#include <linux/phy_link_topology.h>
166
167
#include "dev.h"
168
#include "devmem.h"
169
#include "net-sysfs.h"
170
171
static DEFINE_SPINLOCK(ptype_lock);
172
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
173
174
static int netif_rx_internal(struct sk_buff *skb);
175
static int call_netdevice_notifiers_extack(unsigned long val,
176
struct net_device *dev,
177
struct netlink_ext_ack *extack);
178
179
static DEFINE_MUTEX(ifalias_mutex);
180
181
/* protects napi_hash addition/deletion and napi_gen_id */
182
static DEFINE_SPINLOCK(napi_hash_lock);
183
184
static unsigned int napi_gen_id = NR_CPUS;
185
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
186
187
static inline void dev_base_seq_inc(struct net *net)
188
{
189
unsigned int val = net->dev_base_seq + 1;
190
191
WRITE_ONCE(net->dev_base_seq, val ?: 1);
192
}
193
194
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
195
{
196
unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
197
198
return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
199
}
200
201
static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
202
{
203
return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
204
}
205
206
#ifndef CONFIG_PREEMPT_RT
207
208
static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
209
210
static int __init setup_backlog_napi_threads(char *arg)
211
{
212
static_branch_enable(&use_backlog_threads_key);
213
return 0;
214
}
215
early_param("thread_backlog_napi", setup_backlog_napi_threads);
216
217
static bool use_backlog_threads(void)
218
{
219
return static_branch_unlikely(&use_backlog_threads_key);
220
}
221
222
#else
223
224
static bool use_backlog_threads(void)
225
{
226
return true;
227
}
228
229
#endif
230
231
static inline void backlog_lock_irq_save(struct softnet_data *sd,
232
unsigned long *flags)
233
{
234
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
235
spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
236
else
237
local_irq_save(*flags);
238
}
239
240
static inline void backlog_lock_irq_disable(struct softnet_data *sd)
241
{
242
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
243
spin_lock_irq(&sd->input_pkt_queue.lock);
244
else
245
local_irq_disable();
246
}
247
248
static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
249
unsigned long *flags)
250
{
251
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
252
spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
253
else
254
local_irq_restore(*flags);
255
}
256
257
static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
258
{
259
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
260
spin_unlock_irq(&sd->input_pkt_queue.lock);
261
else
262
local_irq_enable();
263
}
264
265
static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
266
const char *name)
267
{
268
struct netdev_name_node *name_node;
269
270
name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
271
if (!name_node)
272
return NULL;
273
INIT_HLIST_NODE(&name_node->hlist);
274
name_node->dev = dev;
275
name_node->name = name;
276
return name_node;
277
}
278
279
static struct netdev_name_node *
280
netdev_name_node_head_alloc(struct net_device *dev)
281
{
282
struct netdev_name_node *name_node;
283
284
name_node = netdev_name_node_alloc(dev, dev->name);
285
if (!name_node)
286
return NULL;
287
INIT_LIST_HEAD(&name_node->list);
288
return name_node;
289
}
290
291
static void netdev_name_node_free(struct netdev_name_node *name_node)
292
{
293
kfree(name_node);
294
}
295
296
static void netdev_name_node_add(struct net *net,
297
struct netdev_name_node *name_node)
298
{
299
hlist_add_head_rcu(&name_node->hlist,
300
dev_name_hash(net, name_node->name));
301
}
302
303
static void netdev_name_node_del(struct netdev_name_node *name_node)
304
{
305
hlist_del_rcu(&name_node->hlist);
306
}
307
308
static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
309
const char *name)
310
{
311
struct hlist_head *head = dev_name_hash(net, name);
312
struct netdev_name_node *name_node;
313
314
hlist_for_each_entry(name_node, head, hlist)
315
if (!strcmp(name_node->name, name))
316
return name_node;
317
return NULL;
318
}
319
320
static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
321
const char *name)
322
{
323
struct hlist_head *head = dev_name_hash(net, name);
324
struct netdev_name_node *name_node;
325
326
hlist_for_each_entry_rcu(name_node, head, hlist)
327
if (!strcmp(name_node->name, name))
328
return name_node;
329
return NULL;
330
}
331
332
bool netdev_name_in_use(struct net *net, const char *name)
333
{
334
return netdev_name_node_lookup(net, name);
335
}
336
EXPORT_SYMBOL(netdev_name_in_use);
337
338
int netdev_name_node_alt_create(struct net_device *dev, const char *name)
339
{
340
struct netdev_name_node *name_node;
341
struct net *net = dev_net(dev);
342
343
name_node = netdev_name_node_lookup(net, name);
344
if (name_node)
345
return -EEXIST;
346
name_node = netdev_name_node_alloc(dev, name);
347
if (!name_node)
348
return -ENOMEM;
349
netdev_name_node_add(net, name_node);
350
/* The node that holds dev->name acts as a head of per-device list. */
351
list_add_tail_rcu(&name_node->list, &dev->name_node->list);
352
353
return 0;
354
}
355
356
static void netdev_name_node_alt_free(struct rcu_head *head)
357
{
358
struct netdev_name_node *name_node =
359
container_of(head, struct netdev_name_node, rcu);
360
361
kfree(name_node->name);
362
netdev_name_node_free(name_node);
363
}
364
365
static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
366
{
367
netdev_name_node_del(name_node);
368
list_del(&name_node->list);
369
call_rcu(&name_node->rcu, netdev_name_node_alt_free);
370
}
371
372
int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
373
{
374
struct netdev_name_node *name_node;
375
struct net *net = dev_net(dev);
376
377
name_node = netdev_name_node_lookup(net, name);
378
if (!name_node)
379
return -ENOENT;
380
/* lookup might have found our primary name or a name belonging
381
* to another device.
382
*/
383
if (name_node == dev->name_node || name_node->dev != dev)
384
return -EINVAL;
385
386
__netdev_name_node_alt_destroy(name_node);
387
return 0;
388
}
389
390
static void netdev_name_node_alt_flush(struct net_device *dev)
391
{
392
struct netdev_name_node *name_node, *tmp;
393
394
list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
395
list_del(&name_node->list);
396
netdev_name_node_alt_free(&name_node->rcu);
397
}
398
}
399
400
/* Device list insertion */
401
static void list_netdevice(struct net_device *dev)
402
{
403
struct netdev_name_node *name_node;
404
struct net *net = dev_net(dev);
405
406
ASSERT_RTNL();
407
408
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
409
netdev_name_node_add(net, dev->name_node);
410
hlist_add_head_rcu(&dev->index_hlist,
411
dev_index_hash(net, dev->ifindex));
412
413
netdev_for_each_altname(dev, name_node)
414
netdev_name_node_add(net, name_node);
415
416
/* We reserved the ifindex, this can't fail */
417
WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
418
419
dev_base_seq_inc(net);
420
}
421
422
/* Device list removal
423
* caller must respect a RCU grace period before freeing/reusing dev
424
*/
425
static void unlist_netdevice(struct net_device *dev)
426
{
427
struct netdev_name_node *name_node;
428
struct net *net = dev_net(dev);
429
430
ASSERT_RTNL();
431
432
xa_erase(&net->dev_by_index, dev->ifindex);
433
434
netdev_for_each_altname(dev, name_node)
435
netdev_name_node_del(name_node);
436
437
/* Unlink dev from the device chain */
438
list_del_rcu(&dev->dev_list);
439
netdev_name_node_del(dev->name_node);
440
hlist_del_rcu(&dev->index_hlist);
441
442
dev_base_seq_inc(dev_net(dev));
443
}
444
445
/*
446
* Our notifier list
447
*/
448
449
static RAW_NOTIFIER_HEAD(netdev_chain);
450
451
/*
452
* Device drivers call our routines to queue packets here. We empty the
453
* queue in the local softnet handler.
454
*/
455
456
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
457
.process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
458
};
459
EXPORT_PER_CPU_SYMBOL(softnet_data);
460
461
/* Page_pool has a lockless array/stack to alloc/recycle pages.
462
* PP consumers must pay attention to run APIs in the appropriate context
463
* (e.g. NAPI context).
464
*/
465
DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = {
466
.bh_lock = INIT_LOCAL_LOCK(bh_lock),
467
};
468
469
#ifdef CONFIG_LOCKDEP
470
/*
471
* register_netdevice() inits txq->_xmit_lock and sets lockdep class
472
* according to dev->type
473
*/
474
static const unsigned short netdev_lock_type[] = {
475
ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
476
ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
477
ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
478
ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
479
ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
480
ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
481
ARPHRD_CAN, ARPHRD_MCTP,
482
ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
483
ARPHRD_RAWHDLC, ARPHRD_RAWIP,
484
ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
485
ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
486
ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
487
ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
488
ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
489
ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
490
ARPHRD_IEEE80211_RADIOTAP,
491
ARPHRD_IEEE802154, ARPHRD_IEEE802154_MONITOR,
492
ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
493
ARPHRD_CAIF, ARPHRD_IP6GRE, ARPHRD_NETLINK, ARPHRD_6LOWPAN,
494
ARPHRD_VSOCKMON,
495
ARPHRD_VOID, ARPHRD_NONE};
496
497
static const char *const netdev_lock_name[] = {
498
"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
499
"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
500
"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
501
"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
502
"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
503
"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
504
"_xmit_CAN", "_xmit_MCTP",
505
"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
506
"_xmit_RAWHDLC", "_xmit_RAWIP",
507
"_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
508
"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
509
"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
510
"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
511
"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
512
"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
513
"_xmit_IEEE80211_RADIOTAP",
514
"_xmit_IEEE802154", "_xmit_IEEE802154_MONITOR",
515
"_xmit_PHONET", "_xmit_PHONET_PIPE",
516
"_xmit_CAIF", "_xmit_IP6GRE", "_xmit_NETLINK", "_xmit_6LOWPAN",
517
"_xmit_VSOCKMON",
518
"_xmit_VOID", "_xmit_NONE"};
519
520
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
521
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
522
523
static inline unsigned short netdev_lock_pos(unsigned short dev_type)
524
{
525
int i;
526
527
for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
528
if (netdev_lock_type[i] == dev_type)
529
return i;
530
/* the last key is used by default */
531
WARN_ONCE(1, "netdev_lock_pos() could not find dev_type=%u\n", dev_type);
532
return ARRAY_SIZE(netdev_lock_type) - 1;
533
}
534
535
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
536
unsigned short dev_type)
537
{
538
int i;
539
540
i = netdev_lock_pos(dev_type);
541
lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
542
netdev_lock_name[i]);
543
}
544
545
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
546
{
547
int i;
548
549
i = netdev_lock_pos(dev->type);
550
lockdep_set_class_and_name(&dev->addr_list_lock,
551
&netdev_addr_lock_key[i],
552
netdev_lock_name[i]);
553
}
554
#else
555
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
556
unsigned short dev_type)
557
{
558
}
559
560
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
561
{
562
}
563
#endif
564
565
/*******************************************************************************
566
*
567
* Protocol management and registration routines
568
*
569
*******************************************************************************/
570
571
572
/*
573
* Add a protocol ID to the list. Now that the input handler is
574
* smarter we can dispense with all the messy stuff that used to be
575
* here.
576
*
577
* BEWARE!!! Protocol handlers, mangling input packets,
578
* MUST BE last in hash buckets and checking protocol handlers
579
* MUST start from promiscuous ptype_all chain in net_bh.
580
* It is true now, do not change it.
581
* Explanation follows: if protocol handler, mangling packet, will
582
* be the first on list, it is not able to sense, that packet
583
* is cloned and should be copied-on-write, so that it will
584
* change it and subsequent readers will get broken packet.
585
* --ANK (980803)
586
*/
587
588
static inline struct list_head *ptype_head(const struct packet_type *pt)
589
{
590
if (pt->type == htons(ETH_P_ALL)) {
591
if (!pt->af_packet_net && !pt->dev)
592
return NULL;
593
594
return pt->dev ? &pt->dev->ptype_all :
595
&pt->af_packet_net->ptype_all;
596
}
597
598
if (pt->dev)
599
return &pt->dev->ptype_specific;
600
601
return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
602
&ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
603
}
604
605
/**
606
* dev_add_pack - add packet handler
607
* @pt: packet type declaration
608
*
609
* Add a protocol handler to the networking stack. The passed &packet_type
610
* is linked into kernel lists and may not be freed until it has been
611
* removed from the kernel lists.
612
*
613
* This call does not sleep therefore it can not
614
* guarantee all CPU's that are in middle of receiving packets
615
* will see the new packet type (until the next received packet).
616
*/
617
618
void dev_add_pack(struct packet_type *pt)
619
{
620
struct list_head *head = ptype_head(pt);
621
622
if (WARN_ON_ONCE(!head))
623
return;
624
625
spin_lock(&ptype_lock);
626
list_add_rcu(&pt->list, head);
627
spin_unlock(&ptype_lock);
628
}
629
EXPORT_SYMBOL(dev_add_pack);
630
631
/**
632
* __dev_remove_pack - remove packet handler
633
* @pt: packet type declaration
634
*
635
* Remove a protocol handler that was previously added to the kernel
636
* protocol handlers by dev_add_pack(). The passed &packet_type is removed
637
* from the kernel lists and can be freed or reused once this function
638
* returns.
639
*
640
* The packet type might still be in use by receivers
641
* and must not be freed until after all the CPU's have gone
642
* through a quiescent state.
643
*/
644
void __dev_remove_pack(struct packet_type *pt)
645
{
646
struct list_head *head = ptype_head(pt);
647
struct packet_type *pt1;
648
649
if (!head)
650
return;
651
652
spin_lock(&ptype_lock);
653
654
list_for_each_entry(pt1, head, list) {
655
if (pt == pt1) {
656
list_del_rcu(&pt->list);
657
goto out;
658
}
659
}
660
661
pr_warn("dev_remove_pack: %p not found\n", pt);
662
out:
663
spin_unlock(&ptype_lock);
664
}
665
EXPORT_SYMBOL(__dev_remove_pack);
666
667
/**
668
* dev_remove_pack - remove packet handler
669
* @pt: packet type declaration
670
*
671
* Remove a protocol handler that was previously added to the kernel
672
* protocol handlers by dev_add_pack(). The passed &packet_type is removed
673
* from the kernel lists and can be freed or reused once this function
674
* returns.
675
*
676
* This call sleeps to guarantee that no CPU is looking at the packet
677
* type after return.
678
*/
679
void dev_remove_pack(struct packet_type *pt)
680
{
681
__dev_remove_pack(pt);
682
683
synchronize_net();
684
}
685
EXPORT_SYMBOL(dev_remove_pack);
686
687
688
/*******************************************************************************
689
*
690
* Device Interface Subroutines
691
*
692
*******************************************************************************/
693
694
/**
695
* dev_get_iflink - get 'iflink' value of a interface
696
* @dev: targeted interface
697
*
698
* Indicates the ifindex the interface is linked to.
699
* Physical interfaces have the same 'ifindex' and 'iflink' values.
700
*/
701
702
int dev_get_iflink(const struct net_device *dev)
703
{
704
if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
705
return dev->netdev_ops->ndo_get_iflink(dev);
706
707
return READ_ONCE(dev->ifindex);
708
}
709
EXPORT_SYMBOL(dev_get_iflink);
710
711
/**
712
* dev_fill_metadata_dst - Retrieve tunnel egress information.
713
* @dev: targeted interface
714
* @skb: The packet.
715
*
716
* For better visibility of tunnel traffic OVS needs to retrieve
717
* egress tunnel information for a packet. Following API allows
718
* user to get this info.
719
*/
720
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
721
{
722
struct ip_tunnel_info *info;
723
724
if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
725
return -EINVAL;
726
727
info = skb_tunnel_info_unclone(skb);
728
if (!info)
729
return -ENOMEM;
730
if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
731
return -EINVAL;
732
733
return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
734
}
735
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
736
737
static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
738
{
739
int k = stack->num_paths++;
740
741
if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
742
return NULL;
743
744
return &stack->path[k];
745
}
746
747
int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
748
struct net_device_path_stack *stack)
749
{
750
const struct net_device *last_dev;
751
struct net_device_path_ctx ctx = {
752
.dev = dev,
753
};
754
struct net_device_path *path;
755
int ret = 0;
756
757
memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
758
stack->num_paths = 0;
759
while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
760
last_dev = ctx.dev;
761
path = dev_fwd_path(stack);
762
if (!path)
763
return -1;
764
765
memset(path, 0, sizeof(struct net_device_path));
766
ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
767
if (ret < 0)
768
return -1;
769
770
if (WARN_ON_ONCE(last_dev == ctx.dev))
771
return -1;
772
}
773
774
if (!ctx.dev)
775
return ret;
776
777
path = dev_fwd_path(stack);
778
if (!path)
779
return -1;
780
path->type = DEV_PATH_ETHERNET;
781
path->dev = ctx.dev;
782
783
return ret;
784
}
785
EXPORT_SYMBOL_GPL(dev_fill_forward_path);
786
787
/* must be called under rcu_read_lock(), as we dont take a reference */
788
static struct napi_struct *napi_by_id(unsigned int napi_id)
789
{
790
unsigned int hash = napi_id % HASH_SIZE(napi_hash);
791
struct napi_struct *napi;
792
793
hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
794
if (napi->napi_id == napi_id)
795
return napi;
796
797
return NULL;
798
}
799
800
/* must be called under rcu_read_lock(), as we dont take a reference */
801
static struct napi_struct *
802
netdev_napi_by_id(struct net *net, unsigned int napi_id)
803
{
804
struct napi_struct *napi;
805
806
napi = napi_by_id(napi_id);
807
if (!napi)
808
return NULL;
809
810
if (WARN_ON_ONCE(!napi->dev))
811
return NULL;
812
if (!net_eq(net, dev_net(napi->dev)))
813
return NULL;
814
815
return napi;
816
}
817
818
/**
819
* netdev_napi_by_id_lock() - find a device by NAPI ID and lock it
820
* @net: the applicable net namespace
821
* @napi_id: ID of a NAPI of a target device
822
*
823
* Find a NAPI instance with @napi_id. Lock its device.
824
* The device must be in %NETREG_REGISTERED state for lookup to succeed.
825
* netdev_unlock() must be called to release it.
826
*
827
* Return: pointer to NAPI, its device with lock held, NULL if not found.
828
*/
829
struct napi_struct *
830
netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
831
{
832
struct napi_struct *napi;
833
struct net_device *dev;
834
835
rcu_read_lock();
836
napi = netdev_napi_by_id(net, napi_id);
837
if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
838
rcu_read_unlock();
839
return NULL;
840
}
841
842
dev = napi->dev;
843
dev_hold(dev);
844
rcu_read_unlock();
845
846
dev = __netdev_put_lock(dev, net);
847
if (!dev)
848
return NULL;
849
850
rcu_read_lock();
851
napi = netdev_napi_by_id(net, napi_id);
852
if (napi && napi->dev != dev)
853
napi = NULL;
854
rcu_read_unlock();
855
856
if (!napi)
857
netdev_unlock(dev);
858
return napi;
859
}
860
861
/**
862
* __dev_get_by_name - find a device by its name
863
* @net: the applicable net namespace
864
* @name: name to find
865
*
866
* Find an interface by name. Must be called under RTNL semaphore.
867
* If the name is found a pointer to the device is returned.
868
* If the name is not found then %NULL is returned. The
869
* reference counters are not incremented so the caller must be
870
* careful with locks.
871
*/
872
873
struct net_device *__dev_get_by_name(struct net *net, const char *name)
874
{
875
struct netdev_name_node *node_name;
876
877
node_name = netdev_name_node_lookup(net, name);
878
return node_name ? node_name->dev : NULL;
879
}
880
EXPORT_SYMBOL(__dev_get_by_name);
881
882
/**
883
* dev_get_by_name_rcu - find a device by its name
884
* @net: the applicable net namespace
885
* @name: name to find
886
*
887
* Find an interface by name.
888
* If the name is found a pointer to the device is returned.
889
* If the name is not found then %NULL is returned.
890
* The reference counters are not incremented so the caller must be
891
* careful with locks. The caller must hold RCU lock.
892
*/
893
894
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
895
{
896
struct netdev_name_node *node_name;
897
898
node_name = netdev_name_node_lookup_rcu(net, name);
899
return node_name ? node_name->dev : NULL;
900
}
901
EXPORT_SYMBOL(dev_get_by_name_rcu);
902
903
/* Deprecated for new users, call netdev_get_by_name() instead */
904
struct net_device *dev_get_by_name(struct net *net, const char *name)
905
{
906
struct net_device *dev;
907
908
rcu_read_lock();
909
dev = dev_get_by_name_rcu(net, name);
910
dev_hold(dev);
911
rcu_read_unlock();
912
return dev;
913
}
914
EXPORT_SYMBOL(dev_get_by_name);
915
916
/**
917
* netdev_get_by_name() - find a device by its name
918
* @net: the applicable net namespace
919
* @name: name to find
920
* @tracker: tracking object for the acquired reference
921
* @gfp: allocation flags for the tracker
922
*
923
* Find an interface by name. This can be called from any
924
* context and does its own locking. The returned handle has
925
* the usage count incremented and the caller must use netdev_put() to
926
* release it when it is no longer needed. %NULL is returned if no
927
* matching device is found.
928
*/
929
struct net_device *netdev_get_by_name(struct net *net, const char *name,
930
netdevice_tracker *tracker, gfp_t gfp)
931
{
932
struct net_device *dev;
933
934
dev = dev_get_by_name(net, name);
935
if (dev)
936
netdev_tracker_alloc(dev, tracker, gfp);
937
return dev;
938
}
939
EXPORT_SYMBOL(netdev_get_by_name);
940
941
/**
942
* __dev_get_by_index - find a device by its ifindex
943
* @net: the applicable net namespace
944
* @ifindex: index of device
945
*
946
* Search for an interface by index. Returns %NULL if the device
947
* is not found or a pointer to the device. The device has not
948
* had its reference counter increased so the caller must be careful
949
* about locking. The caller must hold the RTNL semaphore.
950
*/
951
952
struct net_device *__dev_get_by_index(struct net *net, int ifindex)
953
{
954
struct net_device *dev;
955
struct hlist_head *head = dev_index_hash(net, ifindex);
956
957
hlist_for_each_entry(dev, head, index_hlist)
958
if (dev->ifindex == ifindex)
959
return dev;
960
961
return NULL;
962
}
963
EXPORT_SYMBOL(__dev_get_by_index);
964
965
/**
966
* dev_get_by_index_rcu - find a device by its ifindex
967
* @net: the applicable net namespace
968
* @ifindex: index of device
969
*
970
* Search for an interface by index. Returns %NULL if the device
971
* is not found or a pointer to the device. The device has not
972
* had its reference counter increased so the caller must be careful
973
* about locking. The caller must hold RCU lock.
974
*/
975
976
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
977
{
978
struct net_device *dev;
979
struct hlist_head *head = dev_index_hash(net, ifindex);
980
981
hlist_for_each_entry_rcu(dev, head, index_hlist)
982
if (dev->ifindex == ifindex)
983
return dev;
984
985
return NULL;
986
}
987
EXPORT_SYMBOL(dev_get_by_index_rcu);
988
989
/* Deprecated for new users, call netdev_get_by_index() instead */
990
struct net_device *dev_get_by_index(struct net *net, int ifindex)
991
{
992
struct net_device *dev;
993
994
rcu_read_lock();
995
dev = dev_get_by_index_rcu(net, ifindex);
996
dev_hold(dev);
997
rcu_read_unlock();
998
return dev;
999
}
1000
EXPORT_SYMBOL(dev_get_by_index);
1001
1002
/**
1003
* netdev_get_by_index() - find a device by its ifindex
1004
* @net: the applicable net namespace
1005
* @ifindex: index of device
1006
* @tracker: tracking object for the acquired reference
1007
* @gfp: allocation flags for the tracker
1008
*
1009
* Search for an interface by index. Returns NULL if the device
1010
* is not found or a pointer to the device. The device returned has
1011
* had a reference added and the pointer is safe until the user calls
1012
* netdev_put() to indicate they have finished with it.
1013
*/
1014
struct net_device *netdev_get_by_index(struct net *net, int ifindex,
1015
netdevice_tracker *tracker, gfp_t gfp)
1016
{
1017
struct net_device *dev;
1018
1019
dev = dev_get_by_index(net, ifindex);
1020
if (dev)
1021
netdev_tracker_alloc(dev, tracker, gfp);
1022
return dev;
1023
}
1024
EXPORT_SYMBOL(netdev_get_by_index);
1025
1026
/**
1027
* dev_get_by_napi_id - find a device by napi_id
1028
* @napi_id: ID of the NAPI struct
1029
*
1030
* Search for an interface by NAPI ID. Returns %NULL if the device
1031
* is not found or a pointer to the device. The device has not had
1032
* its reference counter increased so the caller must be careful
1033
* about locking. The caller must hold RCU lock.
1034
*/
1035
struct net_device *dev_get_by_napi_id(unsigned int napi_id)
1036
{
1037
struct napi_struct *napi;
1038
1039
WARN_ON_ONCE(!rcu_read_lock_held());
1040
1041
if (!napi_id_valid(napi_id))
1042
return NULL;
1043
1044
napi = napi_by_id(napi_id);
1045
1046
return napi ? napi->dev : NULL;
1047
}
1048
1049
/* Release the held reference on the net_device, and if the net_device
1050
* is still registered try to lock the instance lock. If device is being
1051
* unregistered NULL will be returned (but the reference has been released,
1052
* either way!)
1053
*
1054
* This helper is intended for locking net_device after it has been looked up
1055
* using a lockless lookup helper. Lock prevents the instance from going away.
1056
*/
1057
struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net)
1058
{
1059
netdev_lock(dev);
1060
if (dev->reg_state > NETREG_REGISTERED ||
1061
dev->moving_ns || !net_eq(dev_net(dev), net)) {
1062
netdev_unlock(dev);
1063
dev_put(dev);
1064
return NULL;
1065
}
1066
dev_put(dev);
1067
return dev;
1068
}
1069
1070
static struct net_device *
1071
__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net)
1072
{
1073
netdev_lock_ops_compat(dev);
1074
if (dev->reg_state > NETREG_REGISTERED ||
1075
dev->moving_ns || !net_eq(dev_net(dev), net)) {
1076
netdev_unlock_ops_compat(dev);
1077
dev_put(dev);
1078
return NULL;
1079
}
1080
dev_put(dev);
1081
return dev;
1082
}
1083
1084
/**
1085
* netdev_get_by_index_lock() - find a device by its ifindex
1086
* @net: the applicable net namespace
1087
* @ifindex: index of device
1088
*
1089
* Search for an interface by index. If a valid device
1090
* with @ifindex is found it will be returned with netdev->lock held.
1091
* netdev_unlock() must be called to release it.
1092
*
1093
* Return: pointer to a device with lock held, NULL if not found.
1094
*/
1095
struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
1096
{
1097
struct net_device *dev;
1098
1099
dev = dev_get_by_index(net, ifindex);
1100
if (!dev)
1101
return NULL;
1102
1103
return __netdev_put_lock(dev, net);
1104
}
1105
1106
struct net_device *
1107
netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
1108
{
1109
struct net_device *dev;
1110
1111
dev = dev_get_by_index(net, ifindex);
1112
if (!dev)
1113
return NULL;
1114
1115
return __netdev_put_lock_ops_compat(dev, net);
1116
}
1117
1118
struct net_device *
1119
netdev_xa_find_lock(struct net *net, struct net_device *dev,
1120
unsigned long *index)
1121
{
1122
if (dev)
1123
netdev_unlock(dev);
1124
1125
do {
1126
rcu_read_lock();
1127
dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
1128
if (!dev) {
1129
rcu_read_unlock();
1130
return NULL;
1131
}
1132
dev_hold(dev);
1133
rcu_read_unlock();
1134
1135
dev = __netdev_put_lock(dev, net);
1136
if (dev)
1137
return dev;
1138
1139
(*index)++;
1140
} while (true);
1141
}
1142
1143
struct net_device *
1144
netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
1145
unsigned long *index)
1146
{
1147
if (dev)
1148
netdev_unlock_ops_compat(dev);
1149
1150
do {
1151
rcu_read_lock();
1152
dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
1153
if (!dev) {
1154
rcu_read_unlock();
1155
return NULL;
1156
}
1157
dev_hold(dev);
1158
rcu_read_unlock();
1159
1160
dev = __netdev_put_lock_ops_compat(dev, net);
1161
if (dev)
1162
return dev;
1163
1164
(*index)++;
1165
} while (true);
1166
}
1167
1168
static DEFINE_SEQLOCK(netdev_rename_lock);
1169
1170
void netdev_copy_name(struct net_device *dev, char *name)
1171
{
1172
unsigned int seq;
1173
1174
do {
1175
seq = read_seqbegin(&netdev_rename_lock);
1176
strscpy(name, dev->name, IFNAMSIZ);
1177
} while (read_seqretry(&netdev_rename_lock, seq));
1178
}
1179
EXPORT_IPV6_MOD_GPL(netdev_copy_name);
1180
1181
/**
1182
* netdev_get_name - get a netdevice name, knowing its ifindex.
1183
* @net: network namespace
1184
* @name: a pointer to the buffer where the name will be stored.
1185
* @ifindex: the ifindex of the interface to get the name from.
1186
*/
1187
int netdev_get_name(struct net *net, char *name, int ifindex)
1188
{
1189
struct net_device *dev;
1190
int ret;
1191
1192
rcu_read_lock();
1193
1194
dev = dev_get_by_index_rcu(net, ifindex);
1195
if (!dev) {
1196
ret = -ENODEV;
1197
goto out;
1198
}
1199
1200
netdev_copy_name(dev, name);
1201
1202
ret = 0;
1203
out:
1204
rcu_read_unlock();
1205
return ret;
1206
}
1207
1208
static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
1209
const char *ha)
1210
{
1211
return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
1212
}
1213
1214
/**
1215
* dev_getbyhwaddr_rcu - find a device by its hardware address
1216
* @net: the applicable net namespace
1217
* @type: media type of device
1218
* @ha: hardware address
1219
*
1220
* Search for an interface by MAC address. Returns NULL if the device
1221
* is not found or a pointer to the device.
1222
* The caller must hold RCU.
1223
* The returned device has not had its ref count increased
1224
* and the caller must therefore be careful about locking
1225
*
1226
*/
1227
1228
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
1229
const char *ha)
1230
{
1231
struct net_device *dev;
1232
1233
for_each_netdev_rcu(net, dev)
1234
if (dev_addr_cmp(dev, type, ha))
1235
return dev;
1236
1237
return NULL;
1238
}
1239
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
1240
1241
/**
1242
* dev_getbyhwaddr() - find a device by its hardware address
1243
* @net: the applicable net namespace
1244
* @type: media type of device
1245
* @ha: hardware address
1246
*
1247
* Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold
1248
* rtnl_lock.
1249
*
1250
* Context: rtnl_lock() must be held.
1251
* Return: pointer to the net_device, or NULL if not found
1252
*/
1253
struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
1254
const char *ha)
1255
{
1256
struct net_device *dev;
1257
1258
ASSERT_RTNL();
1259
for_each_netdev(net, dev)
1260
if (dev_addr_cmp(dev, type, ha))
1261
return dev;
1262
1263
return NULL;
1264
}
1265
EXPORT_SYMBOL(dev_getbyhwaddr);
1266
1267
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1268
{
1269
struct net_device *dev, *ret = NULL;
1270
1271
rcu_read_lock();
1272
for_each_netdev_rcu(net, dev)
1273
if (dev->type == type) {
1274
dev_hold(dev);
1275
ret = dev;
1276
break;
1277
}
1278
rcu_read_unlock();
1279
return ret;
1280
}
1281
EXPORT_SYMBOL(dev_getfirstbyhwtype);
1282
1283
/**
1284
* netdev_get_by_flags_rcu - find any device with given flags
1285
* @net: the applicable net namespace
1286
* @tracker: tracking object for the acquired reference
1287
* @if_flags: IFF_* values
1288
* @mask: bitmask of bits in if_flags to check
1289
*
1290
* Search for any interface with the given flags.
1291
*
1292
* Context: rcu_read_lock() must be held.
1293
* Returns: NULL if a device is not found or a pointer to the device.
1294
*/
1295
struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker,
1296
unsigned short if_flags, unsigned short mask)
1297
{
1298
struct net_device *dev;
1299
1300
for_each_netdev_rcu(net, dev) {
1301
if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) {
1302
netdev_hold(dev, tracker, GFP_ATOMIC);
1303
return dev;
1304
}
1305
}
1306
1307
return NULL;
1308
}
1309
EXPORT_IPV6_MOD(netdev_get_by_flags_rcu);
1310
1311
/**
1312
* dev_valid_name - check if name is okay for network device
1313
* @name: name string
1314
*
1315
* Network device names need to be valid file names to
1316
* allow sysfs to work. We also disallow any kind of
1317
* whitespace.
1318
*/
1319
bool dev_valid_name(const char *name)
1320
{
1321
if (*name == '\0')
1322
return false;
1323
if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1324
return false;
1325
if (!strcmp(name, ".") || !strcmp(name, ".."))
1326
return false;
1327
1328
while (*name) {
1329
if (*name == '/' || *name == ':' || isspace(*name))
1330
return false;
1331
name++;
1332
}
1333
return true;
1334
}
1335
EXPORT_SYMBOL(dev_valid_name);
1336
1337
/**
1338
* __dev_alloc_name - allocate a name for a device
1339
* @net: network namespace to allocate the device name in
1340
* @name: name format string
1341
* @res: result name string
1342
*
1343
* Passed a format string - eg "lt%d" it will try and find a suitable
1344
* id. It scans list of devices to build up a free map, then chooses
1345
* the first empty slot. The caller must hold the dev_base or rtnl lock
1346
* while allocating the name and adding the device in order to avoid
1347
* duplicates.
1348
* Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1349
* Returns the number of the unit assigned or a negative errno code.
1350
*/
1351
1352
static int __dev_alloc_name(struct net *net, const char *name, char *res)
1353
{
1354
int i = 0;
1355
const char *p;
1356
const int max_netdevices = 8*PAGE_SIZE;
1357
unsigned long *inuse;
1358
struct net_device *d;
1359
char buf[IFNAMSIZ];
1360
1361
/* Verify the string as this thing may have come from the user.
1362
* There must be one "%d" and no other "%" characters.
1363
*/
1364
p = strchr(name, '%');
1365
if (!p || p[1] != 'd' || strchr(p + 2, '%'))
1366
return -EINVAL;
1367
1368
/* Use one page as a bit array of possible slots */
1369
inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
1370
if (!inuse)
1371
return -ENOMEM;
1372
1373
for_each_netdev(net, d) {
1374
struct netdev_name_node *name_node;
1375
1376
netdev_for_each_altname(d, name_node) {
1377
if (!sscanf(name_node->name, name, &i))
1378
continue;
1379
if (i < 0 || i >= max_netdevices)
1380
continue;
1381
1382
/* avoid cases where sscanf is not exact inverse of printf */
1383
snprintf(buf, IFNAMSIZ, name, i);
1384
if (!strncmp(buf, name_node->name, IFNAMSIZ))
1385
__set_bit(i, inuse);
1386
}
1387
if (!sscanf(d->name, name, &i))
1388
continue;
1389
if (i < 0 || i >= max_netdevices)
1390
continue;
1391
1392
/* avoid cases where sscanf is not exact inverse of printf */
1393
snprintf(buf, IFNAMSIZ, name, i);
1394
if (!strncmp(buf, d->name, IFNAMSIZ))
1395
__set_bit(i, inuse);
1396
}
1397
1398
i = find_first_zero_bit(inuse, max_netdevices);
1399
bitmap_free(inuse);
1400
if (i == max_netdevices)
1401
return -ENFILE;
1402
1403
/* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
1404
strscpy(buf, name, IFNAMSIZ);
1405
snprintf(res, IFNAMSIZ, buf, i);
1406
return i;
1407
}
1408
1409
/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
1410
static int dev_prep_valid_name(struct net *net, struct net_device *dev,
1411
const char *want_name, char *out_name,
1412
int dup_errno)
1413
{
1414
if (!dev_valid_name(want_name))
1415
return -EINVAL;
1416
1417
if (strchr(want_name, '%'))
1418
return __dev_alloc_name(net, want_name, out_name);
1419
1420
if (netdev_name_in_use(net, want_name))
1421
return -dup_errno;
1422
if (out_name != want_name)
1423
strscpy(out_name, want_name, IFNAMSIZ);
1424
return 0;
1425
}
1426
1427
/**
1428
* dev_alloc_name - allocate a name for a device
1429
* @dev: device
1430
* @name: name format string
1431
*
1432
* Passed a format string - eg "lt%d" it will try and find a suitable
1433
* id. It scans list of devices to build up a free map, then chooses
1434
* the first empty slot. The caller must hold the dev_base or rtnl lock
1435
* while allocating the name and adding the device in order to avoid
1436
* duplicates.
1437
* Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1438
* Returns the number of the unit assigned or a negative errno code.
1439
*/
1440
1441
int dev_alloc_name(struct net_device *dev, const char *name)
1442
{
1443
return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
1444
}
1445
EXPORT_SYMBOL(dev_alloc_name);
1446
1447
static int dev_get_valid_name(struct net *net, struct net_device *dev,
1448
const char *name)
1449
{
1450
int ret;
1451
1452
ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
1453
return ret < 0 ? ret : 0;
1454
}
1455
1456
int netif_change_name(struct net_device *dev, const char *newname)
1457
{
1458
struct net *net = dev_net(dev);
1459
unsigned char old_assign_type;
1460
char oldname[IFNAMSIZ];
1461
int err = 0;
1462
int ret;
1463
1464
ASSERT_RTNL_NET(net);
1465
1466
if (!strncmp(newname, dev->name, IFNAMSIZ))
1467
return 0;
1468
1469
memcpy(oldname, dev->name, IFNAMSIZ);
1470
1471
write_seqlock_bh(&netdev_rename_lock);
1472
err = dev_get_valid_name(net, dev, newname);
1473
write_sequnlock_bh(&netdev_rename_lock);
1474
1475
if (err < 0)
1476
return err;
1477
1478
if (oldname[0] && !strchr(oldname, '%'))
1479
netdev_info(dev, "renamed from %s%s\n", oldname,
1480
dev->flags & IFF_UP ? " (while UP)" : "");
1481
1482
old_assign_type = dev->name_assign_type;
1483
WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
1484
1485
rollback:
1486
ret = device_rename(&dev->dev, dev->name);
1487
if (ret) {
1488
write_seqlock_bh(&netdev_rename_lock);
1489
memcpy(dev->name, oldname, IFNAMSIZ);
1490
write_sequnlock_bh(&netdev_rename_lock);
1491
WRITE_ONCE(dev->name_assign_type, old_assign_type);
1492
return ret;
1493
}
1494
1495
netdev_adjacent_rename_links(dev, oldname);
1496
1497
netdev_name_node_del(dev->name_node);
1498
1499
synchronize_net();
1500
1501
netdev_name_node_add(net, dev->name_node);
1502
1503
ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1504
ret = notifier_to_errno(ret);
1505
1506
if (ret) {
1507
/* err >= 0 after dev_alloc_name() or stores the first errno */
1508
if (err >= 0) {
1509
err = ret;
1510
write_seqlock_bh(&netdev_rename_lock);
1511
memcpy(dev->name, oldname, IFNAMSIZ);
1512
write_sequnlock_bh(&netdev_rename_lock);
1513
memcpy(oldname, newname, IFNAMSIZ);
1514
WRITE_ONCE(dev->name_assign_type, old_assign_type);
1515
old_assign_type = NET_NAME_RENAMED;
1516
goto rollback;
1517
} else {
1518
netdev_err(dev, "name change rollback failed: %d\n",
1519
ret);
1520
}
1521
}
1522
1523
return err;
1524
}
1525
1526
int netif_set_alias(struct net_device *dev, const char *alias, size_t len)
1527
{
1528
struct dev_ifalias *new_alias = NULL;
1529
1530
if (len >= IFALIASZ)
1531
return -EINVAL;
1532
1533
if (len) {
1534
new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1535
if (!new_alias)
1536
return -ENOMEM;
1537
1538
memcpy(new_alias->ifalias, alias, len);
1539
new_alias->ifalias[len] = 0;
1540
}
1541
1542
mutex_lock(&ifalias_mutex);
1543
new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1544
mutex_is_locked(&ifalias_mutex));
1545
mutex_unlock(&ifalias_mutex);
1546
1547
if (new_alias)
1548
kfree_rcu(new_alias, rcuhead);
1549
1550
return len;
1551
}
1552
1553
/**
1554
* dev_get_alias - get ifalias of a device
1555
* @dev: device
1556
* @name: buffer to store name of ifalias
1557
* @len: size of buffer
1558
*
1559
* get ifalias for a device. Caller must make sure dev cannot go
1560
* away, e.g. rcu read lock or own a reference count to device.
1561
*/
1562
int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1563
{
1564
const struct dev_ifalias *alias;
1565
int ret = 0;
1566
1567
rcu_read_lock();
1568
alias = rcu_dereference(dev->ifalias);
1569
if (alias)
1570
ret = snprintf(name, len, "%s", alias->ifalias);
1571
rcu_read_unlock();
1572
1573
return ret;
1574
}
1575
1576
/**
1577
* netdev_features_change - device changes features
1578
* @dev: device to cause notification
1579
*
1580
* Called to indicate a device has changed features.
1581
*/
1582
void netdev_features_change(struct net_device *dev)
1583
{
1584
call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1585
}
1586
EXPORT_SYMBOL(netdev_features_change);
1587
1588
void netif_state_change(struct net_device *dev)
1589
{
1590
netdev_ops_assert_locked_or_invisible(dev);
1591
1592
if (dev->flags & IFF_UP) {
1593
struct netdev_notifier_change_info change_info = {
1594
.info.dev = dev,
1595
};
1596
1597
call_netdevice_notifiers_info(NETDEV_CHANGE,
1598
&change_info.info);
1599
rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
1600
}
1601
}
1602
1603
/**
1604
* __netdev_notify_peers - notify network peers about existence of @dev,
1605
* to be called when rtnl lock is already held.
1606
* @dev: network device
1607
*
1608
* Generate traffic such that interested network peers are aware of
1609
* @dev, such as by generating a gratuitous ARP. This may be used when
1610
* a device wants to inform the rest of the network about some sort of
1611
* reconfiguration such as a failover event or virtual machine
1612
* migration.
1613
*/
1614
void __netdev_notify_peers(struct net_device *dev)
1615
{
1616
ASSERT_RTNL();
1617
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1618
call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1619
}
1620
EXPORT_SYMBOL(__netdev_notify_peers);
1621
1622
/**
1623
* netdev_notify_peers - notify network peers about existence of @dev
1624
* @dev: network device
1625
*
1626
* Generate traffic such that interested network peers are aware of
1627
* @dev, such as by generating a gratuitous ARP. This may be used when
1628
* a device wants to inform the rest of the network about some sort of
1629
* reconfiguration such as a failover event or virtual machine
1630
* migration.
1631
*/
1632
void netdev_notify_peers(struct net_device *dev)
1633
{
1634
rtnl_lock();
1635
__netdev_notify_peers(dev);
1636
rtnl_unlock();
1637
}
1638
EXPORT_SYMBOL(netdev_notify_peers);
1639
1640
static int napi_threaded_poll(void *data);
1641
1642
static int napi_kthread_create(struct napi_struct *n)
1643
{
1644
int err = 0;
1645
1646
/* Create and wake up the kthread once to put it in
1647
* TASK_INTERRUPTIBLE mode to avoid the blocked task
1648
* warning and work with loadavg.
1649
*/
1650
n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
1651
n->dev->name, n->napi_id);
1652
if (IS_ERR(n->thread)) {
1653
err = PTR_ERR(n->thread);
1654
pr_err("kthread_run failed with err %d\n", err);
1655
n->thread = NULL;
1656
}
1657
1658
return err;
1659
}
1660
1661
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1662
{
1663
const struct net_device_ops *ops = dev->netdev_ops;
1664
int ret;
1665
1666
ASSERT_RTNL();
1667
dev_addr_check(dev);
1668
1669
if (!netif_device_present(dev)) {
1670
/* may be detached because parent is runtime-suspended */
1671
if (dev->dev.parent)
1672
pm_runtime_resume(dev->dev.parent);
1673
if (!netif_device_present(dev))
1674
return -ENODEV;
1675
}
1676
1677
/* Block netpoll from trying to do any rx path servicing.
1678
* If we don't do this there is a chance ndo_poll_controller
1679
* or ndo_poll may be running while we open the device
1680
*/
1681
netpoll_poll_disable(dev);
1682
1683
ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1684
ret = notifier_to_errno(ret);
1685
if (ret)
1686
return ret;
1687
1688
set_bit(__LINK_STATE_START, &dev->state);
1689
1690
netdev_ops_assert_locked(dev);
1691
1692
if (ops->ndo_validate_addr)
1693
ret = ops->ndo_validate_addr(dev);
1694
1695
if (!ret && ops->ndo_open)
1696
ret = ops->ndo_open(dev);
1697
1698
netpoll_poll_enable(dev);
1699
1700
if (ret)
1701
clear_bit(__LINK_STATE_START, &dev->state);
1702
else {
1703
netif_set_up(dev, true);
1704
dev_set_rx_mode(dev);
1705
dev_activate(dev);
1706
add_device_randomness(dev->dev_addr, dev->addr_len);
1707
}
1708
1709
return ret;
1710
}
1711
1712
int netif_open(struct net_device *dev, struct netlink_ext_ack *extack)
1713
{
1714
int ret;
1715
1716
if (dev->flags & IFF_UP)
1717
return 0;
1718
1719
ret = __dev_open(dev, extack);
1720
if (ret < 0)
1721
return ret;
1722
1723
rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1724
call_netdevice_notifiers(NETDEV_UP, dev);
1725
1726
return ret;
1727
}
1728
1729
static void __dev_close_many(struct list_head *head)
1730
{
1731
struct net_device *dev;
1732
1733
ASSERT_RTNL();
1734
might_sleep();
1735
1736
list_for_each_entry(dev, head, close_list) {
1737
/* Temporarily disable netpoll until the interface is down */
1738
netpoll_poll_disable(dev);
1739
1740
call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1741
1742
clear_bit(__LINK_STATE_START, &dev->state);
1743
1744
/* Synchronize to scheduled poll. We cannot touch poll list, it
1745
* can be even on different cpu. So just clear netif_running().
1746
*
1747
* dev->stop() will invoke napi_disable() on all of it's
1748
* napi_struct instances on this device.
1749
*/
1750
smp_mb__after_atomic(); /* Commit netif_running(). */
1751
}
1752
1753
dev_deactivate_many(head);
1754
1755
list_for_each_entry(dev, head, close_list) {
1756
const struct net_device_ops *ops = dev->netdev_ops;
1757
1758
/*
1759
* Call the device specific close. This cannot fail.
1760
* Only if device is UP
1761
*
1762
* We allow it to be called even after a DETACH hot-plug
1763
* event.
1764
*/
1765
1766
netdev_ops_assert_locked(dev);
1767
1768
if (ops->ndo_stop)
1769
ops->ndo_stop(dev);
1770
1771
netif_set_up(dev, false);
1772
netpoll_poll_enable(dev);
1773
}
1774
}
1775
1776
static void __dev_close(struct net_device *dev)
1777
{
1778
LIST_HEAD(single);
1779
1780
list_add(&dev->close_list, &single);
1781
__dev_close_many(&single);
1782
list_del(&single);
1783
}
1784
1785
void netif_close_many(struct list_head *head, bool unlink)
1786
{
1787
struct net_device *dev, *tmp;
1788
1789
/* Remove the devices that don't need to be closed */
1790
list_for_each_entry_safe(dev, tmp, head, close_list)
1791
if (!(dev->flags & IFF_UP))
1792
list_del_init(&dev->close_list);
1793
1794
__dev_close_many(head);
1795
1796
list_for_each_entry_safe(dev, tmp, head, close_list) {
1797
rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1798
call_netdevice_notifiers(NETDEV_DOWN, dev);
1799
if (unlink)
1800
list_del_init(&dev->close_list);
1801
}
1802
}
1803
EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL");
1804
1805
void netif_close(struct net_device *dev)
1806
{
1807
if (dev->flags & IFF_UP) {
1808
LIST_HEAD(single);
1809
1810
list_add(&dev->close_list, &single);
1811
netif_close_many(&single, true);
1812
list_del(&single);
1813
}
1814
}
1815
EXPORT_SYMBOL(netif_close);
1816
1817
void netif_disable_lro(struct net_device *dev)
1818
{
1819
struct net_device *lower_dev;
1820
struct list_head *iter;
1821
1822
dev->wanted_features &= ~NETIF_F_LRO;
1823
netdev_update_features(dev);
1824
1825
if (unlikely(dev->features & NETIF_F_LRO))
1826
netdev_WARN(dev, "failed to disable LRO!\n");
1827
1828
netdev_for_each_lower_dev(dev, lower_dev, iter) {
1829
netdev_lock_ops(lower_dev);
1830
netif_disable_lro(lower_dev);
1831
netdev_unlock_ops(lower_dev);
1832
}
1833
}
1834
EXPORT_IPV6_MOD(netif_disable_lro);
1835
1836
/**
1837
* dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1838
* @dev: device
1839
*
1840
* Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
1841
* called under RTNL. This is needed if Generic XDP is installed on
1842
* the device.
1843
*/
1844
static void dev_disable_gro_hw(struct net_device *dev)
1845
{
1846
dev->wanted_features &= ~NETIF_F_GRO_HW;
1847
netdev_update_features(dev);
1848
1849
if (unlikely(dev->features & NETIF_F_GRO_HW))
1850
netdev_WARN(dev, "failed to disable GRO_HW!\n");
1851
}
1852
1853
const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1854
{
1855
#define N(val) \
1856
case NETDEV_##val: \
1857
return "NETDEV_" __stringify(val);
1858
switch (cmd) {
1859
N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1860
N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1861
N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1862
N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
1863
N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
1864
N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
1865
N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1866
N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1867
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1868
N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
1869
N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
1870
N(XDP_FEAT_CHANGE)
1871
}
1872
#undef N
1873
return "UNKNOWN_NETDEV_EVENT";
1874
}
1875
EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1876
1877
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1878
struct net_device *dev)
1879
{
1880
struct netdev_notifier_info info = {
1881
.dev = dev,
1882
};
1883
1884
return nb->notifier_call(nb, val, &info);
1885
}
1886
1887
static int call_netdevice_register_notifiers(struct notifier_block *nb,
1888
struct net_device *dev)
1889
{
1890
int err;
1891
1892
err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1893
err = notifier_to_errno(err);
1894
if (err)
1895
return err;
1896
1897
if (!(dev->flags & IFF_UP))
1898
return 0;
1899
1900
call_netdevice_notifier(nb, NETDEV_UP, dev);
1901
return 0;
1902
}
1903
1904
static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1905
struct net_device *dev)
1906
{
1907
if (dev->flags & IFF_UP) {
1908
call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1909
dev);
1910
call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1911
}
1912
call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1913
}
1914
1915
static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1916
struct net *net)
1917
{
1918
struct net_device *dev;
1919
int err;
1920
1921
for_each_netdev(net, dev) {
1922
netdev_lock_ops(dev);
1923
err = call_netdevice_register_notifiers(nb, dev);
1924
netdev_unlock_ops(dev);
1925
if (err)
1926
goto rollback;
1927
}
1928
return 0;
1929
1930
rollback:
1931
for_each_netdev_continue_reverse(net, dev)
1932
call_netdevice_unregister_notifiers(nb, dev);
1933
return err;
1934
}
1935
1936
static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1937
struct net *net)
1938
{
1939
struct net_device *dev;
1940
1941
for_each_netdev(net, dev)
1942
call_netdevice_unregister_notifiers(nb, dev);
1943
}
1944
1945
static int dev_boot_phase = 1;
1946
1947
/**
1948
* register_netdevice_notifier - register a network notifier block
1949
* @nb: notifier
1950
*
1951
* Register a notifier to be called when network device events occur.
1952
* The notifier passed is linked into the kernel structures and must
1953
* not be reused until it has been unregistered. A negative errno code
1954
* is returned on a failure.
1955
*
1956
* When registered, all registration and up events are replayed
1957
* to the new notifier to allow the device to have a race-free
1958
* view of the network device list.
1959
*/
1960
1961
int register_netdevice_notifier(struct notifier_block *nb)
1962
{
1963
struct net *net;
1964
int err;
1965
1966
/* Close race with setup_net() and cleanup_net() */
1967
down_write(&pernet_ops_rwsem);
1968
1969
/* When RTNL is removed, we need protection for netdev_chain. */
1970
rtnl_lock();
1971
1972
err = raw_notifier_chain_register(&netdev_chain, nb);
1973
if (err)
1974
goto unlock;
1975
if (dev_boot_phase)
1976
goto unlock;
1977
for_each_net(net) {
1978
__rtnl_net_lock(net);
1979
err = call_netdevice_register_net_notifiers(nb, net);
1980
__rtnl_net_unlock(net);
1981
if (err)
1982
goto rollback;
1983
}
1984
1985
unlock:
1986
rtnl_unlock();
1987
up_write(&pernet_ops_rwsem);
1988
return err;
1989
1990
rollback:
1991
for_each_net_continue_reverse(net) {
1992
__rtnl_net_lock(net);
1993
call_netdevice_unregister_net_notifiers(nb, net);
1994
__rtnl_net_unlock(net);
1995
}
1996
1997
raw_notifier_chain_unregister(&netdev_chain, nb);
1998
goto unlock;
1999
}
2000
EXPORT_SYMBOL(register_netdevice_notifier);
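/* A minimal usage sketch (not part of this file; ex_netdev_event and
 * ex_netdev_nb are illustrative names): a module registers one notifier_block
 * and, per the replay behaviour documented above, immediately sees
 * NETDEV_REGISTER/NETDEV_UP for devices that already exist.
 *
 *     static int ex_netdev_event(struct notifier_block *nb,
 *                                unsigned long event, void *ptr)
 *     {
 *         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *         if (event == NETDEV_UP)
 *             netdev_info(dev, "interface is up\n");
 *         return NOTIFY_DONE;
 *     }
 *
 *     static struct notifier_block ex_netdev_nb = {
 *         .notifier_call = ex_netdev_event,
 *     };
 *
 *     err = register_netdevice_notifier(&ex_netdev_nb);    // module init
 *     unregister_netdevice_notifier(&ex_netdev_nb);        // module exit
 */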
2001
2002
/**
2003
* unregister_netdevice_notifier - unregister a network notifier block
2004
* @nb: notifier
2005
*
2006
* Unregister a notifier previously registered by
2007
* register_netdevice_notifier(). The notifier is unlinked from the
2008
* kernel structures and may then be reused. A negative errno code
2009
* is returned on a failure.
2010
*
2011
* After unregistering, unregister and down device events are synthesized
2012
* for all devices on the device list to the removed notifier to remove
2013
* the need for special case cleanup code.
2014
*/
2015
2016
int unregister_netdevice_notifier(struct notifier_block *nb)
2017
{
2018
struct net *net;
2019
int err;
2020
2021
/* Close race with setup_net() and cleanup_net() */
2022
down_write(&pernet_ops_rwsem);
2023
rtnl_lock();
2024
err = raw_notifier_chain_unregister(&netdev_chain, nb);
2025
if (err)
2026
goto unlock;
2027
2028
for_each_net(net) {
2029
__rtnl_net_lock(net);
2030
call_netdevice_unregister_net_notifiers(nb, net);
2031
__rtnl_net_unlock(net);
2032
}
2033
2034
unlock:
2035
rtnl_unlock();
2036
up_write(&pernet_ops_rwsem);
2037
return err;
2038
}
2039
EXPORT_SYMBOL(unregister_netdevice_notifier);
2040
2041
static int __register_netdevice_notifier_net(struct net *net,
2042
struct notifier_block *nb,
2043
bool ignore_call_fail)
2044
{
2045
int err;
2046
2047
err = raw_notifier_chain_register(&net->netdev_chain, nb);
2048
if (err)
2049
return err;
2050
if (dev_boot_phase)
2051
return 0;
2052
2053
err = call_netdevice_register_net_notifiers(nb, net);
2054
if (err && !ignore_call_fail)
2055
goto chain_unregister;
2056
2057
return 0;
2058
2059
chain_unregister:
2060
raw_notifier_chain_unregister(&net->netdev_chain, nb);
2061
return err;
2062
}
2063
2064
static int __unregister_netdevice_notifier_net(struct net *net,
2065
struct notifier_block *nb)
2066
{
2067
int err;
2068
2069
err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
2070
if (err)
2071
return err;
2072
2073
call_netdevice_unregister_net_notifiers(nb, net);
2074
return 0;
2075
}
2076
2077
/**
2078
* register_netdevice_notifier_net - register a per-netns network notifier block
2079
* @net: network namespace
2080
* @nb: notifier
2081
*
2082
* Register a notifier to be called when network device events occur.
2083
* The notifier passed is linked into the kernel structures and must
2084
* not be reused until it has been unregistered. A negative errno code
2085
* is returned on a failure.
2086
*
2087
* When registered, all registration and up events are replayed
2088
* to the new notifier to allow the device to have a race-free
2089
* view of the network device list.
2090
*/
2091
2092
int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
2093
{
2094
int err;
2095
2096
rtnl_net_lock(net);
2097
err = __register_netdevice_notifier_net(net, nb, false);
2098
rtnl_net_unlock(net);
2099
2100
return err;
2101
}
2102
EXPORT_SYMBOL(register_netdevice_notifier_net);
2103
2104
/**
2105
* unregister_netdevice_notifier_net - unregister a per-netns
2106
* network notifier block
2107
* @net: network namespace
2108
* @nb: notifier
2109
*
2110
* Unregister a notifier previously registered by
2111
* register_netdevice_notifier_net(). The notifier is unlinked from the
2112
* kernel structures and may then be reused. A negative errno code
2113
* is returned on a failure.
2114
*
2115
* After unregistering, unregister and down device events are synthesized
2116
* for all devices on the device list to the removed notifier to remove
2117
* the need for special case cleanup code.
2118
*/
2119
2120
int unregister_netdevice_notifier_net(struct net *net,
2121
struct notifier_block *nb)
2122
{
2123
int err;
2124
2125
rtnl_net_lock(net);
2126
err = __unregister_netdevice_notifier_net(net, nb);
2127
rtnl_net_unlock(net);
2128
2129
return err;
2130
}
2131
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
2132
2133
static void __move_netdevice_notifier_net(struct net *src_net,
2134
struct net *dst_net,
2135
struct notifier_block *nb)
2136
{
2137
__unregister_netdevice_notifier_net(src_net, nb);
2138
__register_netdevice_notifier_net(dst_net, nb, true);
2139
}
2140
2141
static void rtnl_net_dev_lock(struct net_device *dev)
2142
{
2143
bool again;
2144
2145
do {
2146
struct net *net;
2147
2148
again = false;
2149
2150
/* netns might be being dismantled. */
2151
rcu_read_lock();
2152
net = dev_net_rcu(dev);
2153
net_passive_inc(net);
2154
rcu_read_unlock();
2155
2156
rtnl_net_lock(net);
2157
2158
#ifdef CONFIG_NET_NS
2159
/* dev might have been moved to another netns. */
2160
if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
2161
rtnl_net_unlock(net);
2162
net_passive_dec(net);
2163
again = true;
2164
}
2165
#endif
2166
} while (again);
2167
}
2168
2169
static void rtnl_net_dev_unlock(struct net_device *dev)
2170
{
2171
struct net *net = dev_net(dev);
2172
2173
rtnl_net_unlock(net);
2174
net_passive_dec(net);
2175
}
2176
2177
int register_netdevice_notifier_dev_net(struct net_device *dev,
2178
struct notifier_block *nb,
2179
struct netdev_net_notifier *nn)
2180
{
2181
int err;
2182
2183
rtnl_net_dev_lock(dev);
2184
err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
2185
if (!err) {
2186
nn->nb = nb;
2187
list_add(&nn->list, &dev->net_notifier_list);
2188
}
2189
rtnl_net_dev_unlock(dev);
2190
2191
return err;
2192
}
2193
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
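/* A minimal sketch (illustrative field names) of the per-device variant: the
 * caller embeds a struct netdev_net_notifier in its private data so the
 * notifier can be moved along with the device when it changes network
 * namespace (see move_netdevice_notifiers_dev_net() below).
 *
 *     struct ex_priv {
 *         struct notifier_block nb;
 *         struct netdev_net_notifier nn;
 *     };
 *
 *     err = register_netdevice_notifier_dev_net(netdev, &priv->nb, &priv->nn);
 *     ...
 *     unregister_netdevice_notifier_dev_net(netdev, &priv->nb, &priv->nn);
 */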
2194
2195
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
2196
struct notifier_block *nb,
2197
struct netdev_net_notifier *nn)
2198
{
2199
int err;
2200
2201
rtnl_net_dev_lock(dev);
2202
list_del(&nn->list);
2203
err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
2204
rtnl_net_dev_unlock(dev);
2205
2206
return err;
2207
}
2208
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
2209
2210
static void move_netdevice_notifiers_dev_net(struct net_device *dev,
2211
struct net *net)
2212
{
2213
struct netdev_net_notifier *nn;
2214
2215
list_for_each_entry(nn, &dev->net_notifier_list, list)
2216
__move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
2217
}
2218
2219
/**
2220
* call_netdevice_notifiers_info - call all network notifier blocks
2221
* @val: value passed unmodified to notifier function
2222
* @info: notifier information data
2223
*
2224
* Call all network notifier blocks. Parameters and return value
2225
* are as for raw_notifier_call_chain().
2226
*/
2227
2228
int call_netdevice_notifiers_info(unsigned long val,
2229
struct netdev_notifier_info *info)
2230
{
2231
struct net *net = dev_net(info->dev);
2232
int ret;
2233
2234
ASSERT_RTNL();
2235
2236
/* Run per-netns notifier block chain first, then run the global one.
2237
* Hopefully, one day, the global one is going to be removed after
2238
* all notifier block registrators get converted to be per-netns.
2239
*/
2240
ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
2241
if (ret & NOTIFY_STOP_MASK)
2242
return ret;
2243
return raw_notifier_call_chain(&netdev_chain, val, info);
2244
}
2245
2246
/**
2247
* call_netdevice_notifiers_info_robust - call per-netns notifier blocks
2248
* and roll back on error
2249
* @val_up: value passed unmodified to notifier function
2250
* @val_down: value passed unmodified to the notifier function when
2251
* recovering from an error on @val_up
2252
* @info: notifier information data
2253
*
2254
* Call all per-netns network notifier blocks, but not notifier blocks on
2255
* the global notifier chain. Parameters and return value are as for
2256
* raw_notifier_call_chain_robust().
2257
*/
2258
2259
static int
2260
call_netdevice_notifiers_info_robust(unsigned long val_up,
2261
unsigned long val_down,
2262
struct netdev_notifier_info *info)
2263
{
2264
struct net *net = dev_net(info->dev);
2265
2266
ASSERT_RTNL();
2267
2268
return raw_notifier_call_chain_robust(&net->netdev_chain,
2269
val_up, val_down, info);
2270
}
2271
2272
static int call_netdevice_notifiers_extack(unsigned long val,
2273
struct net_device *dev,
2274
struct netlink_ext_ack *extack)
2275
{
2276
struct netdev_notifier_info info = {
2277
.dev = dev,
2278
.extack = extack,
2279
};
2280
2281
return call_netdevice_notifiers_info(val, &info);
2282
}
2283
2284
/**
2285
* call_netdevice_notifiers - call all network notifier blocks
2286
* @val: value passed unmodified to notifier function
2287
* @dev: net_device pointer passed unmodified to notifier function
2288
*
2289
* Call all network notifier blocks. Parameters and return value
2290
* are as for raw_notifier_call_chain().
2291
*/
2292
2293
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2294
{
2295
return call_netdevice_notifiers_extack(val, dev, NULL);
2296
}
2297
EXPORT_SYMBOL(call_netdevice_notifiers);
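/* Typical call sites hold RTNL (see the ASSERT_RTNL() in
 * call_netdevice_notifiers_info()) and announce a state change on one device:
 *
 *     call_netdevice_notifiers(NETDEV_CHANGE, dev);
 *
 * This runs the per-netns chain of dev's namespace first and then the global
 * chain, stopping early only if a callback returns with NOTIFY_STOP_MASK set.
 */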
2298
2299
/**
2300
* call_netdevice_notifiers_mtu - call all network notifier blocks
2301
* @val: value passed unmodified to notifier function
2302
* @dev: net_device pointer passed unmodified to notifier function
2303
* @arg: additional u32 argument passed to the notifier function
2304
*
2305
* Call all network notifier blocks. Parameters and return value
2306
* are as for raw_notifier_call_chain().
2307
*/
2308
static int call_netdevice_notifiers_mtu(unsigned long val,
2309
struct net_device *dev, u32 arg)
2310
{
2311
struct netdev_notifier_info_ext info = {
2312
.info.dev = dev,
2313
.ext.mtu = arg,
2314
};
2315
2316
BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2317
2318
return call_netdevice_notifiers_info(val, &info.info);
2319
}
2320
2321
#ifdef CONFIG_NET_INGRESS
2322
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2323
2324
void net_inc_ingress_queue(void)
2325
{
2326
static_branch_inc(&ingress_needed_key);
2327
}
2328
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2329
2330
void net_dec_ingress_queue(void)
2331
{
2332
static_branch_dec(&ingress_needed_key);
2333
}
2334
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2335
#endif
2336
2337
#ifdef CONFIG_NET_EGRESS
2338
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2339
2340
void net_inc_egress_queue(void)
2341
{
2342
static_branch_inc(&egress_needed_key);
2343
}
2344
EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2345
2346
void net_dec_egress_queue(void)
2347
{
2348
static_branch_dec(&egress_needed_key);
2349
}
2350
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2351
#endif
2352
2353
#ifdef CONFIG_NET_CLS_ACT
2354
DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key);
2355
EXPORT_SYMBOL(tcf_sw_enabled_key);
2356
#endif
2357
2358
DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2359
EXPORT_SYMBOL(netstamp_needed_key);
2360
#ifdef CONFIG_JUMP_LABEL
2361
static atomic_t netstamp_needed_deferred;
2362
static atomic_t netstamp_wanted;
2363
static void netstamp_clear(struct work_struct *work)
2364
{
2365
int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2366
int wanted;
2367
2368
wanted = atomic_add_return(deferred, &netstamp_wanted);
2369
if (wanted > 0)
2370
static_branch_enable(&netstamp_needed_key);
2371
else
2372
static_branch_disable(&netstamp_needed_key);
2373
}
2374
static DECLARE_WORK(netstamp_work, netstamp_clear);
2375
#endif
2376
2377
void net_enable_timestamp(void)
2378
{
2379
#ifdef CONFIG_JUMP_LABEL
2380
int wanted = atomic_read(&netstamp_wanted);
2381
2382
while (wanted > 0) {
2383
if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
2384
return;
2385
}
2386
atomic_inc(&netstamp_needed_deferred);
2387
schedule_work(&netstamp_work);
2388
#else
2389
static_branch_inc(&netstamp_needed_key);
2390
#endif
2391
}
2392
EXPORT_SYMBOL(net_enable_timestamp);
2393
2394
void net_disable_timestamp(void)
2395
{
2396
#ifdef CONFIG_JUMP_LABEL
2397
int wanted = atomic_read(&netstamp_wanted);
2398
2399
while (wanted > 1) {
2400
if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
2401
return;
2402
}
2403
atomic_dec(&netstamp_needed_deferred);
2404
schedule_work(&netstamp_work);
2405
#else
2406
static_branch_dec(&netstamp_needed_key);
2407
#endif
2408
}
2409
EXPORT_SYMBOL(net_disable_timestamp);
2410
2411
static inline void net_timestamp_set(struct sk_buff *skb)
2412
{
2413
skb->tstamp = 0;
2414
skb->tstamp_type = SKB_CLOCK_REALTIME;
2415
if (static_branch_unlikely(&netstamp_needed_key))
2416
skb->tstamp = ktime_get_real();
2417
}
2418
2419
#define net_timestamp_check(COND, SKB) \
2420
if (static_branch_unlikely(&netstamp_needed_key)) { \
2421
if ((COND) && !(SKB)->tstamp) \
2422
(SKB)->tstamp = ktime_get_real(); \
2423
} \
2424
2425
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2426
{
2427
return __is_skb_forwardable(dev, skb, true);
2428
}
2429
EXPORT_SYMBOL_GPL(is_skb_forwardable);
2430
2431
static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
2432
bool check_mtu)
2433
{
2434
int ret = ____dev_forward_skb(dev, skb, check_mtu);
2435
2436
if (likely(!ret)) {
2437
skb->protocol = eth_type_trans(skb, dev);
2438
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2439
}
2440
2441
return ret;
2442
}
2443
2444
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2445
{
2446
return __dev_forward_skb2(dev, skb, true);
2447
}
2448
EXPORT_SYMBOL_GPL(__dev_forward_skb);
2449
2450
/**
2451
* dev_forward_skb - loopback an skb to another netif
2452
*
2453
* @dev: destination network device
2454
* @skb: buffer to forward
2455
*
2456
* return values:
2457
* NET_RX_SUCCESS (no congestion)
2458
* NET_RX_DROP (packet was dropped, but freed)
2459
*
2460
* dev_forward_skb can be used for injecting an skb from the
2461
* start_xmit function of one device into the receive queue
2462
* of another device.
2463
*
2464
* The receiving device may be in another namespace, so
2465
* we have to clear all information in the skb that could
2466
* impact namespace isolation.
2467
*/
2468
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2469
{
2470
return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2471
}
2472
EXPORT_SYMBOL_GPL(dev_forward_skb);
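/* A minimal sketch of the pattern described above, as used by virtual device
 * pairs (peer is an assumed destination device resolved by the caller):
 *
 *     if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
 *         // skb was scrubbed and handed to peer's receive path
 *     } else {
 *         // NET_RX_DROP: skb has already been freed, only account the drop
 *     }
 */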
2473
2474
int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
2475
{
2476
return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
2477
}
2478
2479
static int deliver_skb(struct sk_buff *skb,
2480
struct packet_type *pt_prev,
2481
struct net_device *orig_dev)
2482
{
2483
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2484
return -ENOMEM;
2485
refcount_inc(&skb->users);
2486
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2487
}
2488
2489
static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2490
struct packet_type **pt,
2491
struct net_device *orig_dev,
2492
__be16 type,
2493
struct list_head *ptype_list)
2494
{
2495
struct packet_type *ptype, *pt_prev = *pt;
2496
2497
list_for_each_entry_rcu(ptype, ptype_list, list) {
2498
if (ptype->type != type)
2499
continue;
2500
if (unlikely(pt_prev))
2501
deliver_skb(skb, pt_prev, orig_dev);
2502
pt_prev = ptype;
2503
}
2504
*pt = pt_prev;
2505
}
2506
2507
static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2508
{
2509
if (!ptype->af_packet_priv || !skb->sk)
2510
return false;
2511
2512
if (ptype->id_match)
2513
return ptype->id_match(ptype, skb->sk);
2514
else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2515
return true;
2516
2517
return false;
2518
}
2519
2520
/**
2521
* dev_nit_active_rcu - return true if any network interface taps are in use
2522
*
2523
* The caller must hold the RCU lock
2524
*
2525
* @dev: network device to check for the presence of taps
2526
*/
2527
bool dev_nit_active_rcu(const struct net_device *dev)
2528
{
2529
/* Callers may hold either RCU or RCU BH lock */
2530
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
2531
2532
return !list_empty(&dev_net(dev)->ptype_all) ||
2533
!list_empty(&dev->ptype_all);
2534
}
2535
EXPORT_SYMBOL_GPL(dev_nit_active_rcu);
2536
2537
/*
2538
* Support routine. Sends outgoing frames to any network
2539
* taps currently in use.
2540
*/
2541
2542
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2543
{
2544
struct packet_type *ptype, *pt_prev = NULL;
2545
struct list_head *ptype_list;
2546
struct sk_buff *skb2 = NULL;
2547
2548
rcu_read_lock();
2549
ptype_list = &dev_net_rcu(dev)->ptype_all;
2550
again:
2551
list_for_each_entry_rcu(ptype, ptype_list, list) {
2552
if (READ_ONCE(ptype->ignore_outgoing))
2553
continue;
2554
2555
/* Never send packets back to the socket
2556
* they originated from - MvS ([email protected])
2557
*/
2558
if (skb_loop_sk(ptype, skb))
2559
continue;
2560
2561
if (unlikely(pt_prev)) {
2562
deliver_skb(skb2, pt_prev, skb->dev);
2563
pt_prev = ptype;
2564
continue;
2565
}
2566
2567
/* need to clone skb, done only once */
2568
skb2 = skb_clone(skb, GFP_ATOMIC);
2569
if (!skb2)
2570
goto out_unlock;
2571
2572
net_timestamp_set(skb2);
2573
2574
/* skb->nh should be correctly
2575
* set by sender, so that the second statement is
2576
* just protection against buggy protocols.
2577
*/
2578
skb_reset_mac_header(skb2);
2579
2580
if (skb_network_header(skb2) < skb2->data ||
2581
skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2582
net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2583
ntohs(skb2->protocol),
2584
dev->name);
2585
skb_reset_network_header(skb2);
2586
}
2587
2588
skb2->transport_header = skb2->network_header;
2589
skb2->pkt_type = PACKET_OUTGOING;
2590
pt_prev = ptype;
2591
}
2592
2593
if (ptype_list != &dev->ptype_all) {
2594
ptype_list = &dev->ptype_all;
2595
goto again;
2596
}
2597
out_unlock:
2598
if (pt_prev) {
2599
if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2600
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2601
else
2602
kfree_skb(skb2);
2603
}
2604
rcu_read_unlock();
2605
}
2606
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2607
2608
/**
2609
* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2610
* @dev: Network device
2611
* @txq: number of queues available
2612
*
2613
* If real_num_tx_queues is changed the tc mappings may no longer be
2614
* valid. To resolve this, verify the tc mapping remains valid and if
2615
* not, NULL the mapping. If no priorities map to this
2616
* offset/count pair, it will no longer be used. In the worst case, if TC0
2617
* is invalid, nothing can be done, so disable priority mappings. It is
2618
* expected that drivers will fix this mapping if they can before
2619
* calling netif_set_real_num_tx_queues.
2620
*/
2621
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2622
{
2623
int i;
2624
struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2625
2626
/* If TC0 is invalidated disable TC mapping */
2627
if (tc->offset + tc->count > txq) {
2628
netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2629
dev->num_tc = 0;
2630
return;
2631
}
2632
2633
/* Invalidated prio to tc mappings set to TC0 */
2634
for (i = 1; i < TC_BITMASK + 1; i++) {
2635
int q = netdev_get_prio_tc_map(dev, i);
2636
2637
tc = &dev->tc_to_txq[q];
2638
if (tc->offset + tc->count > txq) {
2639
netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2640
i, q);
2641
netdev_set_prio_tc_map(dev, i, 0);
2642
}
2643
}
2644
}
2645
2646
int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2647
{
2648
if (dev->num_tc) {
2649
struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2650
int i;
2651
2652
/* walk through the TCs and see if it falls into any of them */
2653
for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2654
if ((txq - tc->offset) < tc->count)
2655
return i;
2656
}
2657
2658
/* didn't find it, just return -1 to indicate no match */
2659
return -1;
2660
}
2661
2662
return 0;
2663
}
2664
EXPORT_SYMBOL(netdev_txq_to_tc);
2665
2666
#ifdef CONFIG_XPS
2667
static struct static_key xps_needed __read_mostly;
2668
static struct static_key xps_rxqs_needed __read_mostly;
2669
static DEFINE_MUTEX(xps_map_mutex);
2670
#define xmap_dereference(P) \
2671
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2672
2673
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2674
struct xps_dev_maps *old_maps, int tci, u16 index)
2675
{
2676
struct xps_map *map = NULL;
2677
int pos;
2678
2679
map = xmap_dereference(dev_maps->attr_map[tci]);
2680
if (!map)
2681
return false;
2682
2683
for (pos = map->len; pos--;) {
2684
if (map->queues[pos] != index)
2685
continue;
2686
2687
if (map->len > 1) {
2688
map->queues[pos] = map->queues[--map->len];
2689
break;
2690
}
2691
2692
if (old_maps)
2693
RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
2694
RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2695
kfree_rcu(map, rcu);
2696
return false;
2697
}
2698
2699
return true;
2700
}
2701
2702
static bool remove_xps_queue_cpu(struct net_device *dev,
2703
struct xps_dev_maps *dev_maps,
2704
int cpu, u16 offset, u16 count)
2705
{
2706
int num_tc = dev_maps->num_tc;
2707
bool active = false;
2708
int tci;
2709
2710
for (tci = cpu * num_tc; num_tc--; tci++) {
2711
int i, j;
2712
2713
for (i = count, j = offset; i--; j++) {
2714
if (!remove_xps_queue(dev_maps, NULL, tci, j))
2715
break;
2716
}
2717
2718
active |= i < 0;
2719
}
2720
2721
return active;
2722
}
2723
2724
static void reset_xps_maps(struct net_device *dev,
2725
struct xps_dev_maps *dev_maps,
2726
enum xps_map_type type)
2727
{
2728
static_key_slow_dec_cpuslocked(&xps_needed);
2729
if (type == XPS_RXQS)
2730
static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2731
2732
RCU_INIT_POINTER(dev->xps_maps[type], NULL);
2733
2734
kfree_rcu(dev_maps, rcu);
2735
}
2736
2737
static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
2738
u16 offset, u16 count)
2739
{
2740
struct xps_dev_maps *dev_maps;
2741
bool active = false;
2742
int i, j;
2743
2744
dev_maps = xmap_dereference(dev->xps_maps[type]);
2745
if (!dev_maps)
2746
return;
2747
2748
for (j = 0; j < dev_maps->nr_ids; j++)
2749
active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
2750
if (!active)
2751
reset_xps_maps(dev, dev_maps, type);
2752
2753
if (type == XPS_CPUS) {
2754
for (i = offset + (count - 1); count--; i--)
2755
netdev_queue_numa_node_write(
2756
netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
2757
}
2758
}
2759
2760
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2761
u16 count)
2762
{
2763
if (!static_key_false(&xps_needed))
2764
return;
2765
2766
cpus_read_lock();
2767
mutex_lock(&xps_map_mutex);
2768
2769
if (static_key_false(&xps_rxqs_needed))
2770
clean_xps_maps(dev, XPS_RXQS, offset, count);
2771
2772
clean_xps_maps(dev, XPS_CPUS, offset, count);
2773
2774
mutex_unlock(&xps_map_mutex);
2775
cpus_read_unlock();
2776
}
2777
2778
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2779
{
2780
netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2781
}
2782
2783
static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2784
u16 index, bool is_rxqs_map)
2785
{
2786
struct xps_map *new_map;
2787
int alloc_len = XPS_MIN_MAP_ALLOC;
2788
int i, pos;
2789
2790
for (pos = 0; map && pos < map->len; pos++) {
2791
if (map->queues[pos] != index)
2792
continue;
2793
return map;
2794
}
2795
2796
/* Need to add tx-queue to this CPU's/rx-queue's existing map */
2797
if (map) {
2798
if (pos < map->alloc_len)
2799
return map;
2800
2801
alloc_len = map->alloc_len * 2;
2802
}
2803
2804
/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2805
* map
2806
*/
2807
if (is_rxqs_map)
2808
new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2809
else
2810
new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2811
cpu_to_node(attr_index));
2812
if (!new_map)
2813
return NULL;
2814
2815
for (i = 0; i < pos; i++)
2816
new_map->queues[i] = map->queues[i];
2817
new_map->alloc_len = alloc_len;
2818
new_map->len = pos;
2819
2820
return new_map;
2821
}
2822
2823
/* Copy xps maps at a given index */
2824
static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
2825
struct xps_dev_maps *new_dev_maps, int index,
2826
int tc, bool skip_tc)
2827
{
2828
int i, tci = index * dev_maps->num_tc;
2829
struct xps_map *map;
2830
2831
/* copy maps belonging to foreign traffic classes */
2832
for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2833
if (i == tc && skip_tc)
2834
continue;
2835
2836
/* fill in the new device map from the old device map */
2837
map = xmap_dereference(dev_maps->attr_map[tci]);
2838
RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2839
}
2840
}
2841
2842
/* Must be called under cpus_read_lock */
2843
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2844
u16 index, enum xps_map_type type)
2845
{
2846
struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
2847
const unsigned long *online_mask = NULL;
2848
bool active = false, copy = false;
2849
int i, j, tci, numa_node_id = -2;
2850
int maps_sz, num_tc = 1, tc = 0;
2851
struct xps_map *map, *new_map;
2852
unsigned int nr_ids;
2853
2854
WARN_ON_ONCE(index >= dev->num_tx_queues);
2855
2856
if (dev->num_tc) {
2857
/* Do not allow XPS on subordinate device directly */
2858
num_tc = dev->num_tc;
2859
if (num_tc < 0)
2860
return -EINVAL;
2861
2862
/* If queue belongs to subordinate dev use its map */
2863
dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2864
2865
tc = netdev_txq_to_tc(dev, index);
2866
if (tc < 0)
2867
return -EINVAL;
2868
}
2869
2870
mutex_lock(&xps_map_mutex);
2871
2872
dev_maps = xmap_dereference(dev->xps_maps[type]);
2873
if (type == XPS_RXQS) {
2874
maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2875
nr_ids = dev->num_rx_queues;
2876
} else {
2877
maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2878
if (num_possible_cpus() > 1)
2879
online_mask = cpumask_bits(cpu_online_mask);
2880
nr_ids = nr_cpu_ids;
2881
}
2882
2883
if (maps_sz < L1_CACHE_BYTES)
2884
maps_sz = L1_CACHE_BYTES;
2885
2886
/* The old dev_maps could be larger or smaller than the one we're
2887
* setting up now, as dev->num_tc or nr_ids could have been updated in
2888
* between. We could try to be smart, but let's be safe instead and only
2889
* copy foreign traffic classes if the two map sizes match.
2890
*/
2891
if (dev_maps &&
2892
dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
2893
copy = true;
2894
2895
/* allocate memory for queue storage */
2896
for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2897
j < nr_ids;) {
2898
if (!new_dev_maps) {
2899
new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2900
if (!new_dev_maps) {
2901
mutex_unlock(&xps_map_mutex);
2902
return -ENOMEM;
2903
}
2904
2905
new_dev_maps->nr_ids = nr_ids;
2906
new_dev_maps->num_tc = num_tc;
2907
}
2908
2909
tci = j * num_tc + tc;
2910
map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
2911
2912
map = expand_xps_map(map, j, index, type == XPS_RXQS);
2913
if (!map)
2914
goto error;
2915
2916
RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2917
}
2918
2919
if (!new_dev_maps)
2920
goto out_no_new_maps;
2921
2922
if (!dev_maps) {
2923
/* Increment static keys at most once per type */
2924
static_key_slow_inc_cpuslocked(&xps_needed);
2925
if (type == XPS_RXQS)
2926
static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2927
}
2928
2929
for (j = 0; j < nr_ids; j++) {
2930
bool skip_tc = false;
2931
2932
tci = j * num_tc + tc;
2933
if (netif_attr_test_mask(j, mask, nr_ids) &&
2934
netif_attr_test_online(j, online_mask, nr_ids)) {
2935
/* add tx-queue to CPU/rx-queue maps */
2936
int pos = 0;
2937
2938
skip_tc = true;
2939
2940
map = xmap_dereference(new_dev_maps->attr_map[tci]);
2941
while ((pos < map->len) && (map->queues[pos] != index))
2942
pos++;
2943
2944
if (pos == map->len)
2945
map->queues[map->len++] = index;
2946
#ifdef CONFIG_NUMA
2947
if (type == XPS_CPUS) {
2948
if (numa_node_id == -2)
2949
numa_node_id = cpu_to_node(j);
2950
else if (numa_node_id != cpu_to_node(j))
2951
numa_node_id = -1;
2952
}
2953
#endif
2954
}
2955
2956
if (copy)
2957
xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
2958
skip_tc);
2959
}
2960
2961
rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
2962
2963
/* Cleanup old maps */
2964
if (!dev_maps)
2965
goto out_no_old_maps;
2966
2967
for (j = 0; j < dev_maps->nr_ids; j++) {
2968
for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
2969
map = xmap_dereference(dev_maps->attr_map[tci]);
2970
if (!map)
2971
continue;
2972
2973
if (copy) {
2974
new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2975
if (map == new_map)
2976
continue;
2977
}
2978
2979
RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2980
kfree_rcu(map, rcu);
2981
}
2982
}
2983
2984
old_dev_maps = dev_maps;
2985
2986
out_no_old_maps:
2987
dev_maps = new_dev_maps;
2988
active = true;
2989
2990
out_no_new_maps:
2991
if (type == XPS_CPUS)
2992
/* update Tx queue numa node */
2993
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2994
(numa_node_id >= 0) ?
2995
numa_node_id : NUMA_NO_NODE);
2996
2997
if (!dev_maps)
2998
goto out_no_maps;
2999
3000
/* removes tx-queue from unused CPUs/rx-queues */
3001
for (j = 0; j < dev_maps->nr_ids; j++) {
3002
tci = j * dev_maps->num_tc;
3003
3004
for (i = 0; i < dev_maps->num_tc; i++, tci++) {
3005
if (i == tc &&
3006
netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
3007
netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
3008
continue;
3009
3010
active |= remove_xps_queue(dev_maps,
3011
copy ? old_dev_maps : NULL,
3012
tci, index);
3013
}
3014
}
3015
3016
if (old_dev_maps)
3017
kfree_rcu(old_dev_maps, rcu);
3018
3019
/* free map if not active */
3020
if (!active)
3021
reset_xps_maps(dev, dev_maps, type);
3022
3023
out_no_maps:
3024
mutex_unlock(&xps_map_mutex);
3025
3026
return 0;
3027
error:
3028
/* remove any maps that we added */
3029
for (j = 0; j < nr_ids; j++) {
3030
for (i = num_tc, tci = j * num_tc; i--; tci++) {
3031
new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
3032
map = copy ?
3033
xmap_dereference(dev_maps->attr_map[tci]) :
3034
NULL;
3035
if (new_map && new_map != map)
3036
kfree(new_map);
3037
}
3038
}
3039
3040
mutex_unlock(&xps_map_mutex);
3041
3042
kfree(new_dev_maps);
3043
return -ENOMEM;
3044
}
3045
EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
3046
3047
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
3048
u16 index)
3049
{
3050
int ret;
3051
3052
cpus_read_lock();
3053
ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
3054
cpus_read_unlock();
3055
3056
return ret;
3057
}
3058
EXPORT_SYMBOL(netif_set_xps_queue);
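/* A minimal sketch (CPU and queue numbers are arbitrary): steer transmissions
 * issued from CPUs 2-3 towards TX queue 0 of the device.
 *
 *     cpumask_var_t mask;
 *
 *     if (zalloc_cpumask_var(&mask, GFP_KERNEL)) {
 *         cpumask_set_cpu(2, mask);
 *         cpumask_set_cpu(3, mask);
 *         err = netif_set_xps_queue(dev, mask, 0);
 *         free_cpumask_var(mask);
 *     }
 */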
3059
3060
#endif
3061
static void netdev_unbind_all_sb_channels(struct net_device *dev)
3062
{
3063
struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
3064
3065
/* Unbind any subordinate channels */
3066
while (txq-- != &dev->_tx[0]) {
3067
if (txq->sb_dev)
3068
netdev_unbind_sb_channel(dev, txq->sb_dev);
3069
}
3070
}
3071
3072
void netdev_reset_tc(struct net_device *dev)
3073
{
3074
#ifdef CONFIG_XPS
3075
netif_reset_xps_queues_gt(dev, 0);
3076
#endif
3077
netdev_unbind_all_sb_channels(dev);
3078
3079
/* Reset TC configuration of device */
3080
dev->num_tc = 0;
3081
memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
3082
memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
3083
}
3084
EXPORT_SYMBOL(netdev_reset_tc);
3085
3086
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
3087
{
3088
if (tc >= dev->num_tc)
3089
return -EINVAL;
3090
3091
#ifdef CONFIG_XPS
3092
netif_reset_xps_queues(dev, offset, count);
3093
#endif
3094
dev->tc_to_txq[tc].count = count;
3095
dev->tc_to_txq[tc].offset = offset;
3096
return 0;
3097
}
3098
EXPORT_SYMBOL(netdev_set_tc_queue);
3099
3100
int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
3101
{
3102
if (num_tc > TC_MAX_QUEUE)
3103
return -EINVAL;
3104
3105
#ifdef CONFIG_XPS
3106
netif_reset_xps_queues_gt(dev, 0);
3107
#endif
3108
netdev_unbind_all_sb_channels(dev);
3109
3110
dev->num_tc = num_tc;
3111
return 0;
3112
}
3113
EXPORT_SYMBOL(netdev_set_num_tc);
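/* A minimal sketch of an mqprio-style setup (queue counts are arbitrary):
 * two traffic classes over eight TX queues, with skb->priority 0 mapped to
 * TC0.
 *
 *     netdev_reset_tc(dev);
 *     netdev_set_num_tc(dev, 2);
 *     netdev_set_tc_queue(dev, 0, 4, 0);    // TC0: queues 0-3
 *     netdev_set_tc_queue(dev, 1, 4, 4);    // TC1: queues 4-7
 *     netdev_set_prio_tc_map(dev, 0, 0);    // priority 0 -> TC0
 */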
3114
3115
void netdev_unbind_sb_channel(struct net_device *dev,
3116
struct net_device *sb_dev)
3117
{
3118
struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
3119
3120
#ifdef CONFIG_XPS
3121
netif_reset_xps_queues_gt(sb_dev, 0);
3122
#endif
3123
memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
3124
memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
3125
3126
while (txq-- != &dev->_tx[0]) {
3127
if (txq->sb_dev == sb_dev)
3128
txq->sb_dev = NULL;
3129
}
3130
}
3131
EXPORT_SYMBOL(netdev_unbind_sb_channel);
3132
3133
int netdev_bind_sb_channel_queue(struct net_device *dev,
3134
struct net_device *sb_dev,
3135
u8 tc, u16 count, u16 offset)
3136
{
3137
/* Make certain the sb_dev and dev are already configured */
3138
if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
3139
return -EINVAL;
3140
3141
/* We cannot hand out queues we don't have */
3142
if ((offset + count) > dev->real_num_tx_queues)
3143
return -EINVAL;
3144
3145
/* Record the mapping */
3146
sb_dev->tc_to_txq[tc].count = count;
3147
sb_dev->tc_to_txq[tc].offset = offset;
3148
3149
/* Provide a way for Tx queue to find the tc_to_txq map or
3150
* XPS map for itself.
3151
*/
3152
while (count--)
3153
netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
3154
3155
return 0;
3156
}
3157
EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
3158
3159
int netdev_set_sb_channel(struct net_device *dev, u16 channel)
3160
{
3161
/* Do not use a multiqueue device to represent a subordinate channel */
3162
if (netif_is_multiqueue(dev))
3163
return -ENODEV;
3164
3165
/* We allow channels 1 - 32767 to be used for subordinate channels.
3166
* Channel 0 is meant to be "native" mode and used only to represent
3167
* the main root device. We allow writing 0 to reset the device back
3168
* to normal mode after being used as a subordinate channel.
3169
*/
3170
if (channel > S16_MAX)
3171
return -EINVAL;
3172
3173
dev->num_tc = -channel;
3174
3175
return 0;
3176
}
3177
EXPORT_SYMBOL(netdev_set_sb_channel);
3178
3179
/*
3180
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
3181
* greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
3182
*/
3183
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
3184
{
3185
bool disabling;
3186
int rc;
3187
3188
disabling = txq < dev->real_num_tx_queues;
3189
3190
if (txq < 1 || txq > dev->num_tx_queues)
3191
return -EINVAL;
3192
3193
if (dev->reg_state == NETREG_REGISTERED ||
3194
dev->reg_state == NETREG_UNREGISTERING) {
3195
netdev_ops_assert_locked(dev);
3196
3197
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
3198
txq);
3199
if (rc)
3200
return rc;
3201
3202
if (dev->num_tc)
3203
netif_setup_tc(dev, txq);
3204
3205
net_shaper_set_real_num_tx_queues(dev, txq);
3206
3207
dev_qdisc_change_real_num_tx(dev, txq);
3208
3209
dev->real_num_tx_queues = txq;
3210
3211
if (disabling) {
3212
synchronize_net();
3213
qdisc_reset_all_tx_gt(dev, txq);
3214
#ifdef CONFIG_XPS
3215
netif_reset_xps_queues_gt(dev, txq);
3216
#endif
3217
}
3218
} else {
3219
dev->real_num_tx_queues = txq;
3220
}
3221
3222
return 0;
3223
}
3224
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
3225
3226
/**
3227
* netif_set_real_num_rx_queues - set actual number of RX queues used
3228
* @dev: Network device
3229
* @rxq: Actual number of RX queues
3230
*
3231
* This must be called either with the rtnl_lock held or before
3232
* registration of the net device. Returns 0 on success, or a
3233
* negative error code. If called before registration, it always
3234
* succeeds.
3235
*/
3236
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
3237
{
3238
int rc;
3239
3240
if (rxq < 1 || rxq > dev->num_rx_queues)
3241
return -EINVAL;
3242
3243
if (dev->reg_state == NETREG_REGISTERED) {
3244
netdev_ops_assert_locked(dev);
3245
3246
rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
3247
rxq);
3248
if (rc)
3249
return rc;
3250
}
3251
3252
dev->real_num_rx_queues = rxq;
3253
return 0;
3254
}
3255
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
3256
3257
/**
3258
* netif_set_real_num_queues - set actual number of RX and TX queues used
3259
* @dev: Network device
3260
* @txq: Actual number of TX queues
3261
* @rxq: Actual number of RX queues
3262
*
3263
* Set the real number of both TX and RX queues.
3264
* Does nothing if the number of queues is already correct.
3265
*/
3266
int netif_set_real_num_queues(struct net_device *dev,
3267
unsigned int txq, unsigned int rxq)
3268
{
3269
unsigned int old_rxq = dev->real_num_rx_queues;
3270
int err;
3271
3272
if (txq < 1 || txq > dev->num_tx_queues ||
3273
rxq < 1 || rxq > dev->num_rx_queues)
3274
return -EINVAL;
3275
3276
/* Start from increases, so the error path only does decreases -
3277
* decreases can't fail.
3278
*/
3279
if (rxq > dev->real_num_rx_queues) {
3280
err = netif_set_real_num_rx_queues(dev, rxq);
3281
if (err)
3282
return err;
3283
}
3284
if (txq > dev->real_num_tx_queues) {
3285
err = netif_set_real_num_tx_queues(dev, txq);
3286
if (err)
3287
goto undo_rx;
3288
}
3289
if (rxq < dev->real_num_rx_queues)
3290
WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
3291
if (txq < dev->real_num_tx_queues)
3292
WARN_ON(netif_set_real_num_tx_queues(dev, txq));
3293
3294
return 0;
3295
undo_rx:
3296
WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
3297
return err;
3298
}
3299
EXPORT_SYMBOL(netif_set_real_num_queues);
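/* A minimal sketch (ring counts are assumed driver state): once the hardware
 * reports how many rings it actually enabled, a driver adjusts both visible
 * queue counts in one call, relying on the increase-first ordering above so
 * that a failure leaves the previous counts in place.
 *
 *     err = netif_set_real_num_queues(netdev, enabled_txqs, enabled_rxqs);
 *     if (err)
 *         goto err_close;
 */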
3300
3301
/**
3302
* netif_set_tso_max_size() - set the max size of TSO frames supported
3303
* @dev: netdev to update
3304
* @size: max skb->len of a TSO frame
3305
*
3306
* Set the limit on the size of TSO super-frames the device can handle.
3307
* Unless explicitly set the stack will assume the value of
3308
* %GSO_LEGACY_MAX_SIZE.
3309
*/
3310
void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
3311
{
3312
dev->tso_max_size = min(GSO_MAX_SIZE, size);
3313
if (size < READ_ONCE(dev->gso_max_size))
3314
netif_set_gso_max_size(dev, size);
3315
if (size < READ_ONCE(dev->gso_ipv4_max_size))
3316
netif_set_gso_ipv4_max_size(dev, size);
3317
}
3318
EXPORT_SYMBOL(netif_set_tso_max_size);
3319
3320
/**
3321
* netif_set_tso_max_segs() - set the max number of segs supported for TSO
3322
* @dev: netdev to update
3323
* @segs: max number of TCP segments
3324
*
3325
* Set the limit on the number of TCP segments the device can generate from
3326
* a single TSO super-frame.
3327
* Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
3328
*/
3329
void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
3330
{
3331
dev->tso_max_segs = segs;
3332
if (segs < READ_ONCE(dev->gso_max_segs))
3333
netif_set_gso_max_segs(dev, segs);
3334
}
3335
EXPORT_SYMBOL(netif_set_tso_max_segs);
3336
3337
/**
3338
* netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
3339
* @to: netdev to update
3340
* @from: netdev from which to copy the limits
3341
*/
3342
void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
3343
{
3344
netif_set_tso_max_size(to, from->tso_max_size);
3345
netif_set_tso_max_segs(to, from->tso_max_segs);
3346
}
3347
EXPORT_SYMBOL(netif_inherit_tso_max);
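/* A minimal sketch (the limits are hypothetical hardware values): a driver
 * caps TSO super-frames at probe time, while a stacked device simply inherits
 * the limits of the device below it.
 *
 *     netif_set_tso_max_size(netdev, 16 * 1024);    // at most 16KB per TSO frame
 *     netif_set_tso_max_segs(netdev, 32);           // at most 32 segments
 *
 *     netif_inherit_tso_max(upper_dev, lower_dev);
 */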
3348
3349
/**
3350
* netif_get_num_default_rss_queues - default number of RSS queues
3351
*
3352
* Default value is the number of physical cores if there are only 1 or 2, or half
3353
* that number (rounded up) if there are more.
3354
*/
3355
int netif_get_num_default_rss_queues(void)
3356
{
3357
cpumask_var_t cpus;
3358
int cpu, count = 0;
3359
3360
if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
3361
return 1;
3362
3363
cpumask_copy(cpus, cpu_online_mask);
3364
for_each_cpu(cpu, cpus) {
3365
++count;
3366
cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
3367
}
3368
free_cpumask_var(cpus);
3369
3370
return count > 2 ? DIV_ROUND_UP(count, 2) : count;
3371
}
3372
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
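/* Worked example of the computation above: with 16 online CPUs that are
 * 8 physical cores plus SMT, each core is counted once (its siblings are
 * masked out), so count = 8 and the default is DIV_ROUND_UP(8, 2) = 4 RSS
 * queues; a 2-core machine keeps 2.
 */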
3373
3374
static void __netif_reschedule(struct Qdisc *q)
3375
{
3376
struct softnet_data *sd;
3377
unsigned long flags;
3378
3379
local_irq_save(flags);
3380
sd = this_cpu_ptr(&softnet_data);
3381
q->next_sched = NULL;
3382
*sd->output_queue_tailp = q;
3383
sd->output_queue_tailp = &q->next_sched;
3384
raise_softirq_irqoff(NET_TX_SOFTIRQ);
3385
local_irq_restore(flags);
3386
}
3387
3388
void __netif_schedule(struct Qdisc *q)
3389
{
3390
/* If q->defer_list is not empty, at least one thread is
3391
* in __dev_xmit_skb() before llist_del_all(&q->defer_list).
3392
* This thread will attempt to run the queue.
3393
*/
3394
if (!llist_empty(&q->defer_list))
3395
return;
3396
3397
if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3398
__netif_reschedule(q);
3399
}
3400
EXPORT_SYMBOL(__netif_schedule);
3401
3402
struct dev_kfree_skb_cb {
3403
enum skb_drop_reason reason;
3404
};
3405
3406
static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3407
{
3408
return (struct dev_kfree_skb_cb *)skb->cb;
3409
}
3410
3411
void netif_schedule_queue(struct netdev_queue *txq)
3412
{
3413
rcu_read_lock();
3414
if (!netif_xmit_stopped(txq)) {
3415
struct Qdisc *q = rcu_dereference(txq->qdisc);
3416
3417
__netif_schedule(q);
3418
}
3419
rcu_read_unlock();
3420
}
3421
EXPORT_SYMBOL(netif_schedule_queue);
3422
3423
void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3424
{
3425
if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3426
struct Qdisc *q;
3427
3428
rcu_read_lock();
3429
q = rcu_dereference(dev_queue->qdisc);
3430
__netif_schedule(q);
3431
rcu_read_unlock();
3432
}
3433
}
3434
EXPORT_SYMBOL(netif_tx_wake_queue);
3435
3436
void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3437
{
3438
unsigned long flags;
3439
3440
if (unlikely(!skb))
3441
return;
3442
3443
if (likely(refcount_read(&skb->users) == 1)) {
3444
smp_rmb();
3445
refcount_set(&skb->users, 0);
3446
} else if (likely(!refcount_dec_and_test(&skb->users))) {
3447
return;
3448
}
3449
get_kfree_skb_cb(skb)->reason = reason;
3450
local_irq_save(flags);
3451
skb->next = __this_cpu_read(softnet_data.completion_queue);
3452
__this_cpu_write(softnet_data.completion_queue, skb);
3453
raise_softirq_irqoff(NET_TX_SOFTIRQ);
3454
local_irq_restore(flags);
3455
}
3456
EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
3457
3458
void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3459
{
3460
if (in_hardirq() || irqs_disabled())
3461
dev_kfree_skb_irq_reason(skb, reason);
3462
else
3463
kfree_skb_reason(skb, reason);
3464
}
3465
EXPORT_SYMBOL(dev_kfree_skb_any_reason);
3466
3467
3468
/**
3469
* netif_device_detach - mark device as removed
3470
* @dev: network device
3471
*
3472
* Mark device as removed from the system and therefore no longer available.
3473
*/
3474
void netif_device_detach(struct net_device *dev)
3475
{
3476
if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3477
netif_running(dev)) {
3478
netif_tx_stop_all_queues(dev);
3479
}
3480
}
3481
EXPORT_SYMBOL(netif_device_detach);
3482
3483
/**
3484
* netif_device_attach - mark device as attached
3485
* @dev: network device
3486
*
3487
* Mark device as attached to the system and restart if needed.
3488
*/
3489
void netif_device_attach(struct net_device *dev)
3490
{
3491
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3492
netif_running(dev)) {
3493
netif_tx_wake_all_queues(dev);
3494
netdev_watchdog_up(dev);
3495
}
3496
}
3497
EXPORT_SYMBOL(netif_device_attach);
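/* A minimal sketch of the usual suspend/resume pairing (ex_suspend and
 * ex_resume are illustrative driver callbacks):
 *
 *     static int ex_suspend(struct device *d)
 *     {
 *         struct net_device *netdev = dev_get_drvdata(d);
 *
 *         netif_device_detach(netdev);    // stops TX queues if running
 *         // ... quiesce and power down the hardware ...
 *         return 0;
 *     }
 *
 *     static int ex_resume(struct device *d)
 *     {
 *         struct net_device *netdev = dev_get_drvdata(d);
 *
 *         // ... power up and reinitialise the hardware ...
 *         netif_device_attach(netdev);    // restarts queues and watchdog
 *         return 0;
 *     }
 */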
3498
3499
/*
3500
* Returns a Tx hash based on the given packet descriptor and the number of Tx
3501
* queues to be used as a distribution range.
3502
*/
3503
static u16 skb_tx_hash(const struct net_device *dev,
3504
const struct net_device *sb_dev,
3505
struct sk_buff *skb)
3506
{
3507
u32 hash;
3508
u16 qoffset = 0;
3509
u16 qcount = dev->real_num_tx_queues;
3510
3511
if (dev->num_tc) {
3512
u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3513
3514
qoffset = sb_dev->tc_to_txq[tc].offset;
3515
qcount = sb_dev->tc_to_txq[tc].count;
3516
if (unlikely(!qcount)) {
3517
net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3518
sb_dev->name, qoffset, tc);
3519
qoffset = 0;
3520
qcount = dev->real_num_tx_queues;
3521
}
3522
}
3523
3524
if (skb_rx_queue_recorded(skb)) {
3525
DEBUG_NET_WARN_ON_ONCE(qcount == 0);
3526
hash = skb_get_rx_queue(skb);
3527
if (hash >= qoffset)
3528
hash -= qoffset;
3529
while (unlikely(hash >= qcount))
3530
hash -= qcount;
3531
return hash + qoffset;
3532
}
3533
3534
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3535
}
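/* Worked example of the folding above (values are illustrative): with a
 * traffic class owning qcount = 8 queues at qoffset = 8 and a recorded RX
 * queue of 19, the hash becomes 19 - 8 = 11, is reduced below qcount to
 * 11 - 8 = 3, and the selected TX queue is 3 + 8 = 11. Without a recorded RX
 * queue, reciprocal_scale() maps the flow hash uniformly onto [0, qcount)
 * before qoffset is added back.
 */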
3536
3537
void skb_warn_bad_offload(const struct sk_buff *skb)
3538
{
3539
static const netdev_features_t null_features;
3540
struct net_device *dev = skb->dev;
3541
const char *name = "";
3542
3543
if (!net_ratelimit())
3544
return;
3545
3546
if (dev) {
3547
if (dev->dev.parent)
3548
name = dev_driver_string(dev->dev.parent);
3549
else
3550
name = netdev_name(dev);
3551
}
3552
skb_dump(KERN_WARNING, skb, false);
3553
WARN(1, "%s: caps=(%pNF, %pNF)\n",
3554
name, dev ? &dev->features : &null_features,
3555
skb->sk ? &skb->sk->sk_route_caps : &null_features);
3556
}
3557
3558
/*
3559
* Invalidate hardware checksum when packet is to be mangled, and
3560
* complete checksum manually on outgoing path.
3561
*/
3562
int skb_checksum_help(struct sk_buff *skb)
3563
{
3564
__wsum csum;
3565
int ret = 0, offset;
3566
3567
if (skb->ip_summed == CHECKSUM_COMPLETE)
3568
goto out_set_summed;
3569
3570
if (unlikely(skb_is_gso(skb))) {
3571
skb_warn_bad_offload(skb);
3572
return -EINVAL;
3573
}
3574
3575
if (!skb_frags_readable(skb)) {
3576
return -EFAULT;
3577
}
3578
3579
/* Before computing a checksum, we should make sure no frag could
3580
* be modified by an external entity : checksum could be wrong.
3581
*/
3582
if (skb_has_shared_frag(skb)) {
3583
ret = __skb_linearize(skb);
3584
if (ret)
3585
goto out;
3586
}
3587
3588
offset = skb_checksum_start_offset(skb);
3589
ret = -EINVAL;
3590
if (unlikely(offset >= skb_headlen(skb))) {
3591
DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3592
WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
3593
offset, skb_headlen(skb));
3594
goto out;
3595
}
3596
csum = skb_checksum(skb, offset, skb->len - offset, 0);
3597
3598
offset += skb->csum_offset;
3599
if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
3600
DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3601
WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
3602
offset + sizeof(__sum16), skb_headlen(skb));
3603
goto out;
3604
}
3605
ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3606
if (ret)
3607
goto out;
3608
3609
*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3610
out_set_summed:
3611
skb->ip_summed = CHECKSUM_NONE;
3612
out:
3613
return ret;
3614
}
3615
EXPORT_SYMBOL(skb_checksum_help);
3616
3617
#ifdef CONFIG_NET_CRC32C
3618
int skb_crc32c_csum_help(struct sk_buff *skb)
3619
{
3620
u32 crc;
3621
int ret = 0, offset, start;
3622
3623
if (skb->ip_summed != CHECKSUM_PARTIAL)
3624
goto out;
3625
3626
if (unlikely(skb_is_gso(skb)))
3627
goto out;
3628
3629
/* Before computing a checksum, we should make sure no frag could
3630
* be modified by an external entity : checksum could be wrong.
3631
*/
3632
if (unlikely(skb_has_shared_frag(skb))) {
3633
ret = __skb_linearize(skb);
3634
if (ret)
3635
goto out;
3636
}
3637
start = skb_checksum_start_offset(skb);
3638
offset = start + offsetof(struct sctphdr, checksum);
3639
if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3640
ret = -EINVAL;
3641
goto out;
3642
}
3643
3644
ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3645
if (ret)
3646
goto out;
3647
3648
crc = ~skb_crc32c(skb, start, skb->len - start, ~0);
3649
*(__le32 *)(skb->data + offset) = cpu_to_le32(crc);
3650
skb_reset_csum_not_inet(skb);
3651
out:
3652
return ret;
3653
}
3654
EXPORT_SYMBOL(skb_crc32c_csum_help);
3655
#endif /* CONFIG_NET_CRC32C */
3656
3657
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3658
{
3659
__be16 type = skb->protocol;
3660
3661
/* Tunnel gso handlers can set protocol to ethernet. */
3662
if (type == htons(ETH_P_TEB)) {
3663
struct ethhdr *eth;
3664
3665
if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3666
return 0;
3667
3668
eth = (struct ethhdr *)skb->data;
3669
type = eth->h_proto;
3670
}
3671
3672
return vlan_get_protocol_and_depth(skb, type, depth);
3673
}
3674
3675
3676
/* Take action when hardware reception checksum errors are detected. */
3677
#ifdef CONFIG_BUG
3678
static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3679
{
3680
netdev_err(dev, "hw csum failure\n");
3681
skb_dump(KERN_ERR, skb, true);
3682
dump_stack();
3683
}
3684
3685
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3686
{
3687
DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
3688
}
3689
EXPORT_SYMBOL(netdev_rx_csum_fault);
3690
#endif
3691
3692
/* XXX: check that highmem exists at all on the given machine. */
3693
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3694
{
3695
#ifdef CONFIG_HIGHMEM
3696
int i;
3697
3698
if (!(dev->features & NETIF_F_HIGHDMA)) {
3699
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3700
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3701
struct page *page = skb_frag_page(frag);
3702
3703
if (page && PageHighMem(page))
3704
return 1;
3705
}
3706
}
3707
#endif
3708
return 0;
3709
}
3710
3711
/* If MPLS offload request, verify we are testing hardware MPLS features
3712
* instead of standard features for the netdev.
3713
*/
3714
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3715
static netdev_features_t net_mpls_features(struct sk_buff *skb,
3716
netdev_features_t features,
3717
__be16 type)
3718
{
3719
if (eth_p_mpls(type))
3720
features &= skb->dev->mpls_features;
3721
3722
return features;
3723
}
3724
#else
3725
static netdev_features_t net_mpls_features(struct sk_buff *skb,
3726
netdev_features_t features,
3727
__be16 type)
3728
{
3729
return features;
3730
}
3731
#endif
3732
3733
static netdev_features_t harmonize_features(struct sk_buff *skb,
3734
netdev_features_t features)
3735
{
3736
__be16 type;
3737
3738
type = skb_network_protocol(skb, NULL);
3739
features = net_mpls_features(skb, features, type);
3740
3741
if (skb->ip_summed != CHECKSUM_NONE &&
3742
!can_checksum_protocol(features, type)) {
3743
features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3744
}
3745
if (illegal_highdma(skb->dev, skb))
3746
features &= ~NETIF_F_SG;
3747
3748
return features;
3749
}
3750
3751
netdev_features_t passthru_features_check(struct sk_buff *skb,
3752
struct net_device *dev,
3753
netdev_features_t features)
3754
{
3755
return features;
3756
}
3757
EXPORT_SYMBOL(passthru_features_check);
3758
3759
static netdev_features_t dflt_features_check(struct sk_buff *skb,
3760
struct net_device *dev,
3761
netdev_features_t features)
3762
{
3763
return vlan_features_check(skb, features);
3764
}
3765
3766
static netdev_features_t gso_features_check(const struct sk_buff *skb,
3767
struct net_device *dev,
3768
netdev_features_t features)
3769
{
3770
u16 gso_segs = skb_shinfo(skb)->gso_segs;
3771
3772
if (gso_segs > READ_ONCE(dev->gso_max_segs))
3773
return features & ~NETIF_F_GSO_MASK;
3774
3775
if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb)))
3776
return features & ~NETIF_F_GSO_MASK;
3777
3778
if (!skb_shinfo(skb)->gso_type) {
3779
skb_warn_bad_offload(skb);
3780
return features & ~NETIF_F_GSO_MASK;
3781
}
3782
3783
/* Support for GSO partial features requires software
3784
* intervention before we can actually process the packets
3785
* so we need to strip support for any partial features now
3786
* and we can pull them back in after we have partially
3787
* segmented the frame.
3788
*/
3789
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3790
features &= ~dev->gso_partial_features;
3791
3792
/* Make sure to clear the IPv4 ID mangling feature if the IPv4 header
3793
* has the potential to be fragmented so that TSO does not generate
3794
* segments with the same ID. For encapsulated packets, the ID mangling
3795
* feature is guaranteed not to use the same ID for the outer IPv4
3796
* headers of the generated segments if the headers have the potential
3797
* to be fragmented, so there is no need to clear the IPv4 ID mangling
3798
* feature (see the section about NETIF_F_TSO_MANGLEID in
3799
* segmentation-offloads.rst).
3800
*/
3801
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3802
struct iphdr *iph = skb->encapsulation ?
3803
inner_ip_hdr(skb) : ip_hdr(skb);
3804
3805
if (!(iph->frag_off & htons(IP_DF)))
3806
features &= ~NETIF_F_TSO_MANGLEID;
3807
}
3808
3809
/* NETIF_F_IPV6_CSUM does not support IPv6 extension headers,
3810
* so neither does TSO that depends on it.
3811
*/
3812
if (features & NETIF_F_IPV6_CSUM &&
3813
(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 ||
3814
(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
3815
vlan_get_protocol(skb) == htons(ETH_P_IPV6))) &&
3816
skb_transport_header_was_set(skb) &&
3817
skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
3818
!ipv6_has_hopopt_jumbo(skb))
3819
features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4);
3820
3821
return features;
3822
}
3823
3824
netdev_features_t netif_skb_features(struct sk_buff *skb)
3825
{
3826
struct net_device *dev = skb->dev;
3827
netdev_features_t features = dev->features;
3828
3829
if (skb_is_gso(skb))
3830
features = gso_features_check(skb, dev, features);
3831
3832
/* If encapsulation offload request, verify we are testing
3833
* hardware encapsulation features instead of standard
3834
* features for the netdev
3835
*/
3836
if (skb->encapsulation)
3837
features &= dev->hw_enc_features;
3838
3839
if (skb_vlan_tagged(skb))
3840
features = netdev_intersect_features(features,
3841
dev->vlan_features |
3842
NETIF_F_HW_VLAN_CTAG_TX |
3843
NETIF_F_HW_VLAN_STAG_TX);
3844
3845
if (dev->netdev_ops->ndo_features_check)
3846
features &= dev->netdev_ops->ndo_features_check(skb, dev,
3847
features);
3848
else
3849
features &= dflt_features_check(skb, dev, features);
3850
3851
return harmonize_features(skb, features);
3852
}
3853
EXPORT_SYMBOL(netif_skb_features);
3854
3855
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3856
struct netdev_queue *txq, bool more)
3857
{
3858
unsigned int len;
3859
int rc;
3860
3861
if (dev_nit_active_rcu(dev))
3862
dev_queue_xmit_nit(skb, dev);
3863
3864
len = skb->len;
3865
trace_net_dev_start_xmit(skb, dev);
3866
rc = netdev_start_xmit(skb, dev, txq, more);
3867
trace_net_dev_xmit(skb, rc, dev, len);
3868
3869
return rc;
3870
}
3871
3872
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3873
struct netdev_queue *txq, int *ret)
3874
{
3875
struct sk_buff *skb = first;
3876
int rc = NETDEV_TX_OK;
3877
3878
while (skb) {
3879
struct sk_buff *next = skb->next;
3880
3881
skb_mark_not_on_list(skb);
3882
rc = xmit_one(skb, dev, txq, next != NULL);
3883
if (unlikely(!dev_xmit_complete(rc))) {
3884
skb->next = next;
3885
goto out;
3886
}
3887
3888
skb = next;
3889
if (netif_tx_queue_stopped(txq) && skb) {
3890
rc = NETDEV_TX_BUSY;
3891
break;
3892
}
3893
}
3894
3895
out:
3896
*ret = rc;
3897
return skb;
3898
}
3899
3900
static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3901
netdev_features_t features)
3902
{
3903
if (skb_vlan_tag_present(skb) &&
3904
!vlan_hw_offload_capable(features, skb->vlan_proto))
3905
skb = __vlan_hwaccel_push_inside(skb);
3906
return skb;
3907
}
3908
3909
int skb_csum_hwoffload_help(struct sk_buff *skb,
3910
const netdev_features_t features)
3911
{
3912
if (unlikely(skb_csum_is_sctp(skb)))
3913
return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3914
skb_crc32c_csum_help(skb);
3915
3916
if (features & NETIF_F_HW_CSUM)
3917
return 0;
3918
3919
if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
3920
if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
3921
skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
3922
!ipv6_has_hopopt_jumbo(skb))
3923
goto sw_checksum;
3924
3925
switch (skb->csum_offset) {
3926
case offsetof(struct tcphdr, check):
3927
case offsetof(struct udphdr, check):
3928
return 0;
3929
}
3930
}
3931
3932
sw_checksum:
3933
return skb_checksum_help(skb);
3934
}
3935
EXPORT_SYMBOL(skb_csum_hwoffload_help);
3936
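/* Illustrative use (sketch; the surrounding driver code is hypothetical):
 * a device that advertises NETIF_F_IP_CSUM but hits a packet it cannot
 * offload can resolve CHECKSUM_PARTIAL in software from its xmit path:
 *
 *      if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *          skb_csum_hwoffload_help(skb, dev->features))
 *              goto drop;
 *
 * A zero return means the checksum is either offloadable as-is or has
 * already been completed in software.
 */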
3937
/* Checks if this SKB belongs to an HW offloaded socket
3938
* and whether any SW fallbacks are required based on dev.
3939
* Check decrypted mark in case skb_orphan() cleared socket.
3940
*/
3941
static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
3942
struct net_device *dev)
3943
{
3944
#ifdef CONFIG_SOCK_VALIDATE_XMIT
3945
struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev,
3946
struct sk_buff *skb);
3947
struct sock *sk = skb->sk;
3948
3949
sk_validate = NULL;
3950
if (sk) {
3951
if (sk_fullsock(sk))
3952
sk_validate = sk->sk_validate_xmit_skb;
3953
else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT)
3954
sk_validate = inet_twsk(sk)->tw_validate_xmit_skb;
3955
}
3956
3957
if (sk_validate) {
3958
skb = sk_validate(sk, dev, skb);
3959
} else if (unlikely(skb_is_decrypted(skb))) {
3960
pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
3961
kfree_skb(skb);
3962
skb = NULL;
3963
}
3964
#endif
3965
3966
return skb;
3967
}
3968
3969
static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
3970
struct net_device *dev)
3971
{
3972
struct skb_shared_info *shinfo;
3973
struct net_iov *niov;
3974
3975
if (likely(skb_frags_readable(skb)))
3976
goto out;
3977
3978
if (!dev->netmem_tx)
3979
goto out_free;
3980
3981
shinfo = skb_shinfo(skb);
3982
3983
if (shinfo->nr_frags > 0) {
3984
niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
3985
if (net_is_devmem_iov(niov) &&
3986
net_devmem_iov_binding(niov)->dev != dev)
3987
goto out_free;
3988
}
3989
3990
out:
3991
return skb;
3992
3993
out_free:
3994
kfree_skb(skb);
3995
return NULL;
3996
}
3997
3998
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3999
{
4000
netdev_features_t features;
4001
4002
skb = validate_xmit_unreadable_skb(skb, dev);
4003
if (unlikely(!skb))
4004
goto out_null;
4005
4006
features = netif_skb_features(skb);
4007
skb = validate_xmit_vlan(skb, features);
4008
if (unlikely(!skb))
4009
goto out_null;
4010
4011
skb = sk_validate_xmit_skb(skb, dev);
4012
if (unlikely(!skb))
4013
goto out_null;
4014
4015
if (netif_needs_gso(skb, features)) {
4016
struct sk_buff *segs;
4017
4018
segs = skb_gso_segment(skb, features);
4019
if (IS_ERR(segs)) {
4020
goto out_kfree_skb;
4021
} else if (segs) {
4022
consume_skb(skb);
4023
skb = segs;
4024
}
4025
} else {
4026
if (skb_needs_linearize(skb, features) &&
4027
__skb_linearize(skb))
4028
goto out_kfree_skb;
4029
4030
/* If packet is not checksummed and device does not
4031
* support checksumming for this protocol, complete
4032
* checksumming here.
4033
*/
4034
if (skb->ip_summed == CHECKSUM_PARTIAL) {
4035
if (skb->encapsulation)
4036
skb_set_inner_transport_header(skb,
4037
skb_checksum_start_offset(skb));
4038
else
4039
skb_set_transport_header(skb,
4040
skb_checksum_start_offset(skb));
4041
if (skb_csum_hwoffload_help(skb, features))
4042
goto out_kfree_skb;
4043
}
4044
}
4045
4046
skb = validate_xmit_xfrm(skb, features, again);
4047
4048
return skb;
4049
4050
out_kfree_skb:
4051
kfree_skb(skb);
4052
out_null:
4053
dev_core_stats_tx_dropped_inc(dev);
4054
return NULL;
4055
}
4056
4057
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
4058
{
4059
struct sk_buff *next, *head = NULL, *tail;
4060
4061
for (; skb != NULL; skb = next) {
4062
next = skb->next;
4063
skb_mark_not_on_list(skb);
4064
4065
/* in case skb won't be segmented, point to itself */
4066
skb->prev = skb;
4067
4068
skb = validate_xmit_skb(skb, dev, again);
4069
if (!skb)
4070
continue;
4071
4072
if (!head)
4073
head = skb;
4074
else
4075
tail->next = skb;
4076
/* If skb was segmented, skb->prev points to
4077
* the last segment. If not, it still contains skb.
4078
*/
4079
tail = skb->prev;
4080
}
4081
return head;
4082
}
4083
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
4084
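/* Illustrative caller (sketch): a transmit path holding a list of skbs
 * validates the whole list before handing it to the driver, roughly what
 * sch_direct_xmit() does:
 *
 *      skb = validate_xmit_skb_list(skb, dev, &again);
 *      if (skb)
 *              skb = dev_hard_start_xmit(skb, dev, txq, &ret);
 */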
4085
static void qdisc_pkt_len_segs_init(struct sk_buff *skb)
4086
{
4087
struct skb_shared_info *shinfo = skb_shinfo(skb);
4088
u16 gso_segs;
4089
4090
qdisc_skb_cb(skb)->pkt_len = skb->len;
4091
if (!shinfo->gso_size) {
4092
qdisc_skb_cb(skb)->pkt_segs = 1;
4093
return;
4094
}
4095
4096
qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs;
4097
4098
/* To get a more precise estimate of bytes sent on the wire,
4099
* we add to pkt_len the header size of all segments.
4100
*/
4101
if (skb_transport_header_was_set(skb)) {
4102
unsigned int hdr_len;
4103
4104
/* mac layer + network layer */
4105
if (!skb->encapsulation)
4106
hdr_len = skb_transport_offset(skb);
4107
else
4108
hdr_len = skb_inner_transport_offset(skb);
4109
4110
/* + transport layer */
4111
if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
4112
const struct tcphdr *th;
4113
struct tcphdr _tcphdr;
4114
4115
th = skb_header_pointer(skb, hdr_len,
4116
sizeof(_tcphdr), &_tcphdr);
4117
if (likely(th))
4118
hdr_len += __tcp_hdrlen(th);
4119
} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
4120
struct udphdr _udphdr;
4121
4122
if (skb_header_pointer(skb, hdr_len,
4123
sizeof(_udphdr), &_udphdr))
4124
hdr_len += sizeof(struct udphdr);
4125
}
4126
4127
if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
4128
int payload = skb->len - hdr_len;
4129
4130
/* Malicious packet. */
4131
if (payload <= 0)
4132
return;
4133
gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
4134
shinfo->gso_segs = gso_segs;
4135
qdisc_skb_cb(skb)->pkt_segs = gso_segs;
4136
}
4137
qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
4138
}
4139
}
4140
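/* Worked example: a TSO skb carrying 10 segments of 1448 bytes behind
 * 66 bytes of Ethernet + IPv4 + TCP headers has skb->len = 14546 and
 * gso_segs = 10, so pkt_len becomes 14546 + 9 * 66 = 15140, exactly the
 * 10 * (1448 + 66) bytes that will appear on the wire.
 */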
4141
static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
4142
struct sk_buff **to_free,
4143
struct netdev_queue *txq)
4144
{
4145
int rc;
4146
4147
rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
4148
if (rc == NET_XMIT_SUCCESS)
4149
trace_qdisc_enqueue(q, txq, skb);
4150
return rc;
4151
}
4152
4153
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
4154
struct net_device *dev,
4155
struct netdev_queue *txq)
4156
{
4157
struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
4158
spinlock_t *root_lock = qdisc_lock(q);
4159
struct llist_node *ll_list, *first_n;
4160
unsigned long defer_count = 0;
4161
int rc;
4162
4163
qdisc_calculate_pkt_len(skb, q);
4164
4165
tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
4166
4167
if (q->flags & TCQ_F_NOLOCK) {
4168
if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
4169
qdisc_run_begin(q)) {
4170
/* Retest nolock_qdisc_is_empty() within the protection
4171
* of q->seqlock to protect from racing with requeuing.
4172
*/
4173
if (unlikely(!nolock_qdisc_is_empty(q))) {
4174
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
4175
__qdisc_run(q);
4176
to_free2 = qdisc_run_end(q);
4177
4178
goto free_skbs;
4179
}
4180
4181
qdisc_bstats_cpu_update(q, skb);
4182
if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
4183
!nolock_qdisc_is_empty(q))
4184
__qdisc_run(q);
4185
4186
to_free2 = qdisc_run_end(q);
4187
rc = NET_XMIT_SUCCESS;
4188
goto free_skbs;
4189
}
4190
4191
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
4192
to_free2 = qdisc_run(q);
4193
goto free_skbs;
4194
}
4195
4196
/* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
4197
* In the try_cmpxchg() loop, we want to increment q->defer_count
4198
* at most once to limit the number of skbs in defer_list.
4199
* We perform the defer_count increment only if the list is not empty,
4200
* because some arches have slow atomic_long_inc_return().
4201
*/
4202
first_n = READ_ONCE(q->defer_list.first);
4203
do {
4204
if (first_n && !defer_count) {
4205
defer_count = atomic_long_inc_return(&q->defer_count);
4206
if (unlikely(defer_count > READ_ONCE(net_hotdata.qdisc_max_burst))) {
4207
kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_BURST_DROP);
4208
return NET_XMIT_DROP;
4209
}
4210
}
4211
skb->ll_node.next = first_n;
4212
} while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node));
4213
4214
/* If defer_list was not empty, we know the cpu which queued
4215
* the first skb will process the whole list for us.
4216
*/
4217
if (first_n)
4218
return NET_XMIT_SUCCESS;
4219
4220
spin_lock(root_lock);
4221
4222
ll_list = llist_del_all(&q->defer_list);
4223
/* There is a small race because we clear defer_count not atomically
4224
* with the prior llist_del_all(). This means defer_list could grow
4225
* over qdisc_max_burst.
4226
*/
4227
atomic_long_set(&q->defer_count, 0);
4228
4229
ll_list = llist_reverse_order(ll_list);
4230
4231
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
4232
llist_for_each_entry_safe(skb, next, ll_list, ll_node)
4233
__qdisc_drop(skb, &to_free);
4234
rc = NET_XMIT_DROP;
4235
goto unlock;
4236
}
4237
if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
4238
!llist_next(ll_list) && qdisc_run_begin(q)) {
4239
/*
4240
* This is a work-conserving queue; there are no old skbs
4241
* waiting to be sent out; and the qdisc is not running -
4242
* xmit the skb directly.
4243
*/
4244
4245
DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list,
4246
struct sk_buff,
4247
ll_node));
4248
qdisc_bstats_update(q, skb);
4249
if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
4250
__qdisc_run(q);
4251
to_free2 = qdisc_run_end(q);
4252
rc = NET_XMIT_SUCCESS;
4253
} else {
4254
int count = 0;
4255
4256
llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
4257
if (next) {
4258
prefetch(next);
4259
prefetch(&next->priority);
4260
skb_mark_not_on_list(skb);
4261
}
4262
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
4263
count++;
4264
}
4265
to_free2 = qdisc_run(q);
4266
if (count != 1)
4267
rc = NET_XMIT_SUCCESS;
4268
}
4269
unlock:
4270
spin_unlock(root_lock);
4271
4272
free_skbs:
4273
tcf_kfree_skb_list(to_free);
4274
tcf_kfree_skb_list(to_free2);
4275
return rc;
4276
}
4277
4278
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
4279
static void skb_update_prio(struct sk_buff *skb)
4280
{
4281
const struct netprio_map *map;
4282
const struct sock *sk;
4283
unsigned int prioidx;
4284
4285
if (skb->priority)
4286
return;
4287
map = rcu_dereference_bh(skb->dev->priomap);
4288
if (!map)
4289
return;
4290
sk = skb_to_full_sk(skb);
4291
if (!sk)
4292
return;
4293
4294
prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
4295
4296
if (prioidx < map->priomap_len)
4297
skb->priority = map->priomap[prioidx];
4298
}
4299
#else
4300
#define skb_update_prio(skb)
4301
#endif
4302
4303
/**
4304
* dev_loopback_xmit - loop back @skb
4305
* @net: network namespace this loopback is happening in
4306
* @sk: sk needed to be a netfilter okfn
4307
* @skb: buffer to transmit
4308
*/
4309
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
4310
{
4311
skb_reset_mac_header(skb);
4312
__skb_pull(skb, skb_network_offset(skb));
4313
skb->pkt_type = PACKET_LOOPBACK;
4314
if (skb->ip_summed == CHECKSUM_NONE)
4315
skb->ip_summed = CHECKSUM_UNNECESSARY;
4316
DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
4317
skb_dst_force(skb);
4318
netif_rx(skb);
4319
return 0;
4320
}
4321
EXPORT_SYMBOL(dev_loopback_xmit);
4322
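/* Illustrative caller (sketch): the IPv4/IPv6 multicast output paths loop
 * a copy of the packet back to the local stack by passing this function
 * as the okfn of an NF_HOOK() invocation, roughly:
 *
 *      NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb,
 *              NULL, newskb->dev, dev_loopback_xmit);
 */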
4323
#ifdef CONFIG_NET_EGRESS
4324
static struct netdev_queue *
4325
netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
4326
{
4327
int qm = skb_get_queue_mapping(skb);
4328
4329
return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
4330
}
4331
4332
#ifndef CONFIG_PREEMPT_RT
4333
static bool netdev_xmit_txqueue_skipped(void)
4334
{
4335
return __this_cpu_read(softnet_data.xmit.skip_txqueue);
4336
}
4337
4338
void netdev_xmit_skip_txqueue(bool skip)
4339
{
4340
__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
4341
}
4342
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
4343
4344
#else
4345
static bool netdev_xmit_txqueue_skipped(void)
4346
{
4347
return current->net_xmit.skip_txqueue;
4348
}
4349
4350
void netdev_xmit_skip_txqueue(bool skip)
4351
{
4352
current->net_xmit.skip_txqueue = skip;
4353
}
4354
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
4355
#endif
4356
#endif /* CONFIG_NET_EGRESS */
4357
4358
#ifdef CONFIG_NET_XGRESS
4359
static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
4360
enum skb_drop_reason *drop_reason)
4361
{
4362
int ret = TC_ACT_UNSPEC;
4363
#ifdef CONFIG_NET_CLS_ACT
4364
struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
4365
struct tcf_result res;
4366
4367
if (!miniq)
4368
return ret;
4369
4370
/* Global bypass */
4371
if (!static_branch_likely(&tcf_sw_enabled_key))
4372
return ret;
4373
4374
/* Block-wise bypass */
4375
if (tcf_block_bypass_sw(miniq->block))
4376
return ret;
4377
4378
tc_skb_cb(skb)->mru = 0;
4379
qdisc_skb_cb(skb)->post_ct = false;
4380
tcf_set_drop_reason(skb, *drop_reason);
4381
4382
mini_qdisc_bstats_cpu_update(miniq, skb);
4383
ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
4384
/* Only tcf related quirks below. */
4385
switch (ret) {
4386
case TC_ACT_SHOT:
4387
*drop_reason = tcf_get_drop_reason(skb);
4388
mini_qdisc_qstats_cpu_drop(miniq);
4389
break;
4390
case TC_ACT_OK:
4391
case TC_ACT_RECLASSIFY:
4392
skb->tc_index = TC_H_MIN(res.classid);
4393
break;
4394
}
4395
#endif /* CONFIG_NET_CLS_ACT */
4396
return ret;
4397
}
4398
4399
static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
4400
4401
void tcx_inc(void)
4402
{
4403
static_branch_inc(&tcx_needed_key);
4404
}
4405
4406
void tcx_dec(void)
4407
{
4408
static_branch_dec(&tcx_needed_key);
4409
}
4410
4411
static __always_inline enum tcx_action_base
4412
tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
4413
const bool needs_mac)
4414
{
4415
const struct bpf_mprog_fp *fp;
4416
const struct bpf_prog *prog;
4417
int ret = TCX_NEXT;
4418
4419
if (needs_mac)
4420
__skb_push(skb, skb->mac_len);
4421
bpf_mprog_foreach_prog(entry, fp, prog) {
4422
bpf_compute_data_pointers(skb);
4423
ret = bpf_prog_run(prog, skb);
4424
if (ret != TCX_NEXT)
4425
break;
4426
}
4427
if (needs_mac)
4428
__skb_pull(skb, skb->mac_len);
4429
return tcx_action_code(skb, ret);
4430
}
4431
4432
static __always_inline struct sk_buff *
4433
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4434
struct net_device *orig_dev, bool *another)
4435
{
4436
struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
4437
enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
4438
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
4439
int sch_ret;
4440
4441
if (!entry)
4442
return skb;
4443
4444
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
4445
if (unlikely(*pt_prev)) {
4446
*ret = deliver_skb(skb, *pt_prev, orig_dev);
4447
*pt_prev = NULL;
4448
}
4449
4450
qdisc_pkt_len_segs_init(skb);
4451
tcx_set_ingress(skb, true);
4452
4453
if (static_branch_unlikely(&tcx_needed_key)) {
4454
sch_ret = tcx_run(entry, skb, true);
4455
if (sch_ret != TC_ACT_UNSPEC)
4456
goto ingress_verdict;
4457
}
4458
sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
4459
ingress_verdict:
4460
switch (sch_ret) {
4461
case TC_ACT_REDIRECT:
4462
/* skb_mac_header check was done by BPF, so we can safely
4463
* push the L2 header back before redirecting to another
4464
* netdev.
4465
*/
4466
__skb_push(skb, skb->mac_len);
4467
if (skb_do_redirect(skb) == -EAGAIN) {
4468
__skb_pull(skb, skb->mac_len);
4469
*another = true;
4470
break;
4471
}
4472
*ret = NET_RX_SUCCESS;
4473
bpf_net_ctx_clear(bpf_net_ctx);
4474
return NULL;
4475
case TC_ACT_SHOT:
4476
kfree_skb_reason(skb, drop_reason);
4477
*ret = NET_RX_DROP;
4478
bpf_net_ctx_clear(bpf_net_ctx);
4479
return NULL;
4480
/* used by tc_run */
4481
case TC_ACT_STOLEN:
4482
case TC_ACT_QUEUED:
4483
case TC_ACT_TRAP:
4484
consume_skb(skb);
4485
fallthrough;
4486
case TC_ACT_CONSUMED:
4487
*ret = NET_RX_SUCCESS;
4488
bpf_net_ctx_clear(bpf_net_ctx);
4489
return NULL;
4490
}
4491
bpf_net_ctx_clear(bpf_net_ctx);
4492
4493
return skb;
4494
}
4495
4496
static __always_inline struct sk_buff *
4497
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4498
{
4499
struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
4500
enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
4501
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
4502
int sch_ret;
4503
4504
if (!entry)
4505
return skb;
4506
4507
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
4508
4509
/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() were
4510
* already set by the caller.
4511
*/
4512
if (static_branch_unlikely(&tcx_needed_key)) {
4513
sch_ret = tcx_run(entry, skb, false);
4514
if (sch_ret != TC_ACT_UNSPEC)
4515
goto egress_verdict;
4516
}
4517
sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
4518
egress_verdict:
4519
switch (sch_ret) {
4520
case TC_ACT_REDIRECT:
4521
/* No need to push/pop skb's mac_header here on egress! */
4522
skb_do_redirect(skb);
4523
*ret = NET_XMIT_SUCCESS;
4524
bpf_net_ctx_clear(bpf_net_ctx);
4525
return NULL;
4526
case TC_ACT_SHOT:
4527
kfree_skb_reason(skb, drop_reason);
4528
*ret = NET_XMIT_DROP;
4529
bpf_net_ctx_clear(bpf_net_ctx);
4530
return NULL;
4531
/* used by tc_run */
4532
case TC_ACT_STOLEN:
4533
case TC_ACT_QUEUED:
4534
case TC_ACT_TRAP:
4535
consume_skb(skb);
4536
fallthrough;
4537
case TC_ACT_CONSUMED:
4538
*ret = NET_XMIT_SUCCESS;
4539
bpf_net_ctx_clear(bpf_net_ctx);
4540
return NULL;
4541
}
4542
bpf_net_ctx_clear(bpf_net_ctx);
4543
4544
return skb;
4545
}
4546
#else
4547
static __always_inline struct sk_buff *
4548
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4549
struct net_device *orig_dev, bool *another)
4550
{
4551
return skb;
4552
}
4553
4554
static __always_inline struct sk_buff *
4555
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4556
{
4557
return skb;
4558
}
4559
#endif /* CONFIG_NET_XGRESS */
4560
4561
#ifdef CONFIG_XPS
4562
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
4563
struct xps_dev_maps *dev_maps, unsigned int tci)
4564
{
4565
int tc = netdev_get_prio_tc_map(dev, skb->priority);
4566
struct xps_map *map;
4567
int queue_index = -1;
4568
4569
if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
4570
return queue_index;
4571
4572
tci *= dev_maps->num_tc;
4573
tci += tc;
4574
4575
map = rcu_dereference(dev_maps->attr_map[tci]);
4576
if (map) {
4577
if (map->len == 1)
4578
queue_index = map->queues[0];
4579
else
4580
queue_index = map->queues[reciprocal_scale(
4581
skb_get_hash(skb), map->len)];
4582
if (unlikely(queue_index >= dev->real_num_tx_queues))
4583
queue_index = -1;
4584
}
4585
return queue_index;
4586
}
4587
#endif
4588
4589
static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
4590
struct sk_buff *skb)
4591
{
4592
#ifdef CONFIG_XPS
4593
struct xps_dev_maps *dev_maps;
4594
struct sock *sk = skb->sk;
4595
int queue_index = -1;
4596
4597
if (!static_key_false(&xps_needed))
4598
return -1;
4599
4600
rcu_read_lock();
4601
if (!static_key_false(&xps_rxqs_needed))
4602
goto get_cpus_map;
4603
4604
dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
4605
if (dev_maps) {
4606
int tci = sk_rx_queue_get(sk);
4607
4608
if (tci >= 0)
4609
queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4610
tci);
4611
}
4612
4613
get_cpus_map:
4614
if (queue_index < 0) {
4615
dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
4616
if (dev_maps) {
4617
unsigned int tci = skb->sender_cpu - 1;
4618
4619
queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4620
tci);
4621
}
4622
}
4623
rcu_read_unlock();
4624
4625
return queue_index;
4626
#else
4627
return -1;
4628
#endif
4629
}
4630
4631
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
4632
struct net_device *sb_dev)
4633
{
4634
return 0;
4635
}
4636
EXPORT_SYMBOL(dev_pick_tx_zero);
4637
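/* Illustrative use (sketch): a virtual device that always wants queue 0
 * can point its netdev_ops at this helper instead of open-coding it:
 *
 *      .ndo_select_queue = dev_pick_tx_zero,
 */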
4638
int sk_tx_queue_get(const struct sock *sk)
4639
{
4640
int resel, val;
4641
4642
if (!sk)
4643
return -1;
4644
/* Paired with WRITE_ONCE() in sk_tx_queue_clear()
4645
* and sk_tx_queue_set().
4646
*/
4647
val = READ_ONCE(sk->sk_tx_queue_mapping);
4648
4649
if (val == NO_QUEUE_MAPPING)
4650
return -1;
4651
4652
if (!sk_fullsock(sk))
4653
return val;
4654
4655
resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection);
4656
if (resel && time_is_before_jiffies(
4657
READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel))
4658
return -1;
4659
4660
return val;
4661
}
4662
EXPORT_SYMBOL(sk_tx_queue_get);
4663
4664
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4665
struct net_device *sb_dev)
4666
{
4667
struct sock *sk = skb->sk;
4668
int queue_index = sk_tx_queue_get(sk);
4669
4670
sb_dev = sb_dev ? : dev;
4671
4672
if (queue_index < 0 || skb->ooo_okay ||
4673
queue_index >= dev->real_num_tx_queues) {
4674
int new_index = get_xps_queue(dev, sb_dev, skb);
4675
4676
if (new_index < 0)
4677
new_index = skb_tx_hash(dev, sb_dev, skb);
4678
4679
if (sk && sk_fullsock(sk) &&
4680
rcu_access_pointer(sk->sk_dst_cache))
4681
sk_tx_queue_set(sk, new_index);
4682
4683
queue_index = new_index;
4684
}
4685
4686
return queue_index;
4687
}
4688
EXPORT_SYMBOL(netdev_pick_tx);
4689
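/* Illustrative ndo_select_queue() (sketch; the driver and helper names are
 * hypothetical): drivers that only special-case a subset of traffic
 * typically fall back to this helper for everything else:
 *
 *      static u16 foo_select_queue(struct net_device *dev,
 *                                  struct sk_buff *skb,
 *                                  struct net_device *sb_dev)
 *      {
 *              if (foo_is_mgmt_frame(skb))
 *                      return FOO_MGMT_QUEUE;
 *              return netdev_pick_tx(dev, skb, sb_dev);
 *      }
 */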
4690
struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4691
struct sk_buff *skb,
4692
struct net_device *sb_dev)
4693
{
4694
int queue_index = 0;
4695
4696
#ifdef CONFIG_XPS
4697
u32 sender_cpu = skb->sender_cpu - 1;
4698
4699
if (sender_cpu >= (u32)NR_CPUS)
4700
skb->sender_cpu = raw_smp_processor_id() + 1;
4701
#endif
4702
4703
if (dev->real_num_tx_queues != 1) {
4704
const struct net_device_ops *ops = dev->netdev_ops;
4705
4706
if (ops->ndo_select_queue)
4707
queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4708
else
4709
queue_index = netdev_pick_tx(dev, skb, sb_dev);
4710
4711
queue_index = netdev_cap_txqueue(dev, queue_index);
4712
}
4713
4714
skb_set_queue_mapping(skb, queue_index);
4715
return netdev_get_tx_queue(dev, queue_index);
4716
}
4717
4718
/**
4719
* __dev_queue_xmit() - transmit a buffer
4720
* @skb: buffer to transmit
4721
* @sb_dev: subordinate device used for L2 forwarding offload
4722
*
4723
* Queue a buffer for transmission to a network device. The caller must
4724
* have set the device and priority and built the buffer before calling
4725
* this function. The function can be called from an interrupt.
4726
*
4727
* When calling this method, interrupts MUST be enabled. This is because
4728
* the BH enable code must have IRQs enabled so that it will not deadlock.
4729
*
4730
* Regardless of the return value, the skb is consumed, so it is currently
4731
* difficult to retry a send to this method. (You can bump the ref count
4732
* before sending to hold a reference for retry if you are careful.)
4733
*
4734
* Return:
4735
* * 0 - buffer successfully transmitted
4736
* * positive qdisc return code - NET_XMIT_DROP etc.
4737
* * negative errno - other errors
4738
*/
4739
int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4740
{
4741
struct net_device *dev = skb->dev;
4742
struct netdev_queue *txq = NULL;
4743
struct Qdisc *q;
4744
int rc = -ENOMEM;
4745
bool again = false;
4746
4747
skb_reset_mac_header(skb);
4748
skb_assert_len(skb);
4749
4750
if (unlikely(skb_shinfo(skb)->tx_flags &
4751
(SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
4752
__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
4753
4754
/* Disable soft irqs for various locks below. Also
4755
* stops preemption for RCU.
4756
*/
4757
rcu_read_lock_bh();
4758
4759
skb_update_prio(skb);
4760
4761
qdisc_pkt_len_segs_init(skb);
4762
tcx_set_ingress(skb, false);
4763
#ifdef CONFIG_NET_EGRESS
4764
if (static_branch_unlikely(&egress_needed_key)) {
4765
if (nf_hook_egress_active()) {
4766
skb = nf_hook_egress(skb, &rc, dev);
4767
if (!skb)
4768
goto out;
4769
}
4770
4771
netdev_xmit_skip_txqueue(false);
4772
4773
nf_skip_egress(skb, true);
4774
skb = sch_handle_egress(skb, &rc, dev);
4775
if (!skb)
4776
goto out;
4777
nf_skip_egress(skb, false);
4778
4779
if (netdev_xmit_txqueue_skipped())
4780
txq = netdev_tx_queue_mapping(dev, skb);
4781
}
4782
#endif
4783
/* If device/qdisc don't need skb->dst, release it right now while
4784
* it's hot in this CPU's cache.
4785
*/
4786
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4787
skb_dst_drop(skb);
4788
else
4789
skb_dst_force(skb);
4790
4791
if (!txq)
4792
txq = netdev_core_pick_tx(dev, skb, sb_dev);
4793
4794
q = rcu_dereference_bh(txq->qdisc);
4795
4796
trace_net_dev_queue(skb);
4797
if (q->enqueue) {
4798
rc = __dev_xmit_skb(skb, q, dev, txq);
4799
goto out;
4800
}
4801
4802
/* The device has no queue. Common case for software devices:
4803
* loopback, all the sorts of tunnels...
4804
4805
* Really, it is unlikely that netif_tx_lock protection is necessary
4806
* here. (e.g. loopback and IP tunnels are clean, ignoring statistics
4807
* counters.)
4808
* However, it is possible that they rely on the protection
4809
* made by us here.
4810
4811
* Check this and take the lock anyway. It is not prone to deadlocks.
4812
* Or use the noqueue qdisc, it is even simpler 8)
4813
*/
4814
if (dev->flags & IFF_UP) {
4815
int cpu = smp_processor_id(); /* ok because BHs are off */
4816
4817
/* Other cpus might concurrently change txq->xmit_lock_owner
4818
* to -1 or to their cpu id, but not to our id.
4819
*/
4820
if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
4821
if (dev_xmit_recursion())
4822
goto recursion_alert;
4823
4824
skb = validate_xmit_skb(skb, dev, &again);
4825
if (!skb)
4826
goto out;
4827
4828
HARD_TX_LOCK(dev, txq, cpu);
4829
4830
if (!netif_xmit_stopped(txq)) {
4831
dev_xmit_recursion_inc();
4832
skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4833
dev_xmit_recursion_dec();
4834
if (dev_xmit_complete(rc)) {
4835
HARD_TX_UNLOCK(dev, txq);
4836
goto out;
4837
}
4838
}
4839
HARD_TX_UNLOCK(dev, txq);
4840
net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4841
dev->name);
4842
} else {
4843
/* Recursion is detected! It is possible,
4844
* unfortunately
4845
*/
4846
recursion_alert:
4847
net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4848
dev->name);
4849
}
4850
}
4851
4852
rc = -ENETDOWN;
4853
rcu_read_unlock_bh();
4854
4855
dev_core_stats_tx_dropped_inc(dev);
4856
kfree_skb_list(skb);
4857
return rc;
4858
out:
4859
rcu_read_unlock_bh();
4860
return rc;
4861
}
4862
EXPORT_SYMBOL(__dev_queue_xmit);
4863
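/* Illustrative transmit from a protocol or tunnel driver (sketch): set up
 * skb->dev and the priority, then hand the buffer to the qdisc layer via
 * the dev_queue_xmit() wrapper, which calls __dev_queue_xmit(skb, NULL):
 *
 *      skb->dev = dev;
 *      skb->priority = TC_PRIO_CONTROL;
 *      dev_queue_xmit(skb);
 */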
4864
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4865
{
4866
struct net_device *dev = skb->dev;
4867
struct sk_buff *orig_skb = skb;
4868
struct netdev_queue *txq;
4869
int ret = NETDEV_TX_BUSY;
4870
bool again = false;
4871
4872
if (unlikely(!netif_running(dev) ||
4873
!netif_carrier_ok(dev)))
4874
goto drop;
4875
4876
skb = validate_xmit_skb_list(skb, dev, &again);
4877
if (skb != orig_skb)
4878
goto drop;
4879
4880
skb_set_queue_mapping(skb, queue_id);
4881
txq = skb_get_tx_queue(dev, skb);
4882
4883
local_bh_disable();
4884
4885
dev_xmit_recursion_inc();
4886
HARD_TX_LOCK(dev, txq, smp_processor_id());
4887
if (!netif_xmit_frozen_or_drv_stopped(txq))
4888
ret = netdev_start_xmit(skb, dev, txq, false);
4889
HARD_TX_UNLOCK(dev, txq);
4890
dev_xmit_recursion_dec();
4891
4892
local_bh_enable();
4893
return ret;
4894
drop:
4895
dev_core_stats_tx_dropped_inc(dev);
4896
kfree_skb_list(skb);
4897
return NET_XMIT_DROP;
4898
}
4899
EXPORT_SYMBOL(__dev_direct_xmit);
4900
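/* Illustrative caller (sketch): AF_XDP's generic transmit path sends
 * already-built skbs straight to a chosen queue, roughly:
 *
 *      err = __dev_direct_xmit(skb, xs->queue_id);
 *
 * and retries later when the driver reports NETDEV_TX_BUSY.
 */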
4901
/*************************************************************************
4902
* Receiver routines
4903
*************************************************************************/
4904
static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
4905
4906
int weight_p __read_mostly = 64; /* old backlog weight */
4907
int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
4908
int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
4909
4910
/* Called with irq disabled */
4911
static inline void ____napi_schedule(struct softnet_data *sd,
4912
struct napi_struct *napi)
4913
{
4914
struct task_struct *thread;
4915
4916
lockdep_assert_irqs_disabled();
4917
4918
if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
4919
/* Paired with smp_mb__before_atomic() in
4920
* napi_enable()/netif_set_threaded().
4921
* Use READ_ONCE() to guarantee a complete
4922
* read on napi->thread. Only call
4923
* wake_up_process() when it's not NULL.
4924
*/
4925
thread = READ_ONCE(napi->thread);
4926
if (thread) {
4927
if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
4928
goto use_local_napi;
4929
4930
set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
4931
wake_up_process(thread);
4932
return;
4933
}
4934
}
4935
4936
use_local_napi:
4937
DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list));
4938
list_add_tail(&napi->poll_list, &sd->poll_list);
4939
WRITE_ONCE(napi->list_owner, smp_processor_id());
4940
/* If not called from net_rx_action()
4941
* we have to raise NET_RX_SOFTIRQ.
4942
*/
4943
if (!sd->in_net_rx_action)
4944
raise_softirq_irqoff(NET_RX_SOFTIRQ);
4945
}
4946
4947
#ifdef CONFIG_RPS
4948
4949
struct static_key_false rps_needed __read_mostly;
4950
EXPORT_SYMBOL(rps_needed);
4951
struct static_key_false rfs_needed __read_mostly;
4952
EXPORT_SYMBOL(rfs_needed);
4953
4954
static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
4955
{
4956
return hash_32(hash, flow_table->log);
4957
}
4958
4959
#ifdef CONFIG_RFS_ACCEL
4960
/**
4961
* rps_flow_is_active - check whether the flow is recently active.
4962
* @rflow: Specific flow to check activity.
4963
* @flow_table: per-queue flowtable that @rflow belongs to.
4964
* @cpu: CPU saved in @rflow.
4965
*
4966
* If the CPU has processed many packets since the flow's last activity
4967
* (beyond 10 times the table size), the flow is considered stale.
4968
*
4969
* Return: true if flow was recently active.
4970
*/
4971
static bool rps_flow_is_active(struct rps_dev_flow *rflow,
4972
struct rps_dev_flow_table *flow_table,
4973
unsigned int cpu)
4974
{
4975
unsigned int flow_last_active;
4976
unsigned int sd_input_head;
4977
4978
if (cpu >= nr_cpu_ids)
4979
return false;
4980
4981
sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head);
4982
flow_last_active = READ_ONCE(rflow->last_qtail);
4983
4984
return (int)(sd_input_head - flow_last_active) <
4985
(int)(10 << flow_table->log);
4986
}
4987
#endif
4988
4989
static struct rps_dev_flow *
4990
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4991
struct rps_dev_flow *rflow, u16 next_cpu, u32 hash,
4992
u32 flow_id)
4993
{
4994
if (next_cpu < nr_cpu_ids) {
4995
u32 head;
4996
#ifdef CONFIG_RFS_ACCEL
4997
struct netdev_rx_queue *rxqueue;
4998
struct rps_dev_flow_table *flow_table;
4999
struct rps_dev_flow *old_rflow;
5000
struct rps_dev_flow *tmp_rflow;
5001
unsigned int tmp_cpu;
5002
u16 rxq_index;
5003
int rc;
5004
5005
/* Should we steer this flow to a different hardware queue? */
5006
if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
5007
!(dev->features & NETIF_F_NTUPLE))
5008
goto out;
5009
rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
5010
if (rxq_index == skb_get_rx_queue(skb))
5011
goto out;
5012
5013
rxqueue = dev->_rx + rxq_index;
5014
flow_table = rcu_dereference(rxqueue->rps_flow_table);
5015
if (!flow_table)
5016
goto out;
5017
5018
tmp_rflow = &flow_table->flows[flow_id];
5019
tmp_cpu = READ_ONCE(tmp_rflow->cpu);
5020
5021
if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) {
5022
if (rps_flow_is_active(tmp_rflow, flow_table,
5023
tmp_cpu)) {
5024
if (hash != READ_ONCE(tmp_rflow->hash) ||
5025
next_cpu == tmp_cpu)
5026
goto out;
5027
}
5028
}
5029
5030
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
5031
rxq_index, flow_id);
5032
if (rc < 0)
5033
goto out;
5034
5035
old_rflow = rflow;
5036
rflow = tmp_rflow;
5037
WRITE_ONCE(rflow->filter, rc);
5038
WRITE_ONCE(rflow->hash, hash);
5039
5040
if (old_rflow->filter == rc)
5041
WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
5042
out:
5043
#endif
5044
head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
5045
rps_input_queue_tail_save(&rflow->last_qtail, head);
5046
}
5047
5048
WRITE_ONCE(rflow->cpu, next_cpu);
5049
return rflow;
5050
}
5051
5052
/*
5053
* get_rps_cpu is called from netif_receive_skb and returns the target
5054
* CPU from the RPS map of the receiving queue for a given skb.
5055
* rcu_read_lock must be held on entry.
5056
*/
5057
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
5058
struct rps_dev_flow **rflowp)
5059
{
5060
const struct rps_sock_flow_table *sock_flow_table;
5061
struct netdev_rx_queue *rxqueue = dev->_rx;
5062
struct rps_dev_flow_table *flow_table;
5063
struct rps_map *map;
5064
int cpu = -1;
5065
u32 flow_id;
5066
u32 tcpu;
5067
u32 hash;
5068
5069
if (skb_rx_queue_recorded(skb)) {
5070
u16 index = skb_get_rx_queue(skb);
5071
5072
if (unlikely(index >= dev->real_num_rx_queues)) {
5073
WARN_ONCE(dev->real_num_rx_queues > 1,
5074
"%s received packet on queue %u, but number "
5075
"of RX queues is %u\n",
5076
dev->name, index, dev->real_num_rx_queues);
5077
goto done;
5078
}
5079
rxqueue += index;
5080
}
5081
5082
/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
5083
5084
flow_table = rcu_dereference(rxqueue->rps_flow_table);
5085
map = rcu_dereference(rxqueue->rps_map);
5086
if (!flow_table && !map)
5087
goto done;
5088
5089
skb_reset_network_header(skb);
5090
hash = skb_get_hash(skb);
5091
if (!hash)
5092
goto done;
5093
5094
sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
5095
if (flow_table && sock_flow_table) {
5096
struct rps_dev_flow *rflow;
5097
u32 next_cpu;
5098
u32 ident;
5099
5100
/* First check the global flow table for a match.
5101
* This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
5102
*/
5103
ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
5104
if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
5105
goto try_rps;
5106
5107
next_cpu = ident & net_hotdata.rps_cpu_mask;
5108
5109
/* OK, now we know there is a match,
5110
* we can look at the local (per receive queue) flow table
5111
*/
5112
flow_id = rfs_slot(hash, flow_table);
5113
rflow = &flow_table->flows[flow_id];
5114
tcpu = rflow->cpu;
5115
5116
/*
5117
* If the desired CPU (where last recvmsg was done) is
5118
* different from current CPU (one in the rx-queue flow
5119
* table entry), switch if one of the following holds:
5120
* - Current CPU is unset (>= nr_cpu_ids).
5121
* - Current CPU is offline.
5122
* - The current CPU's queue tail has advanced beyond the
5123
* last packet that was enqueued using this table entry.
5124
* This guarantees that all previous packets for the flow
5125
* have been dequeued, thus preserving in order delivery.
5126
*/
5127
if (unlikely(tcpu != next_cpu) &&
5128
(tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
5129
((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
5130
rflow->last_qtail)) >= 0)) {
5131
tcpu = next_cpu;
5132
rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash,
5133
flow_id);
5134
}
5135
5136
if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
5137
*rflowp = rflow;
5138
cpu = tcpu;
5139
goto done;
5140
}
5141
}
5142
5143
try_rps:
5144
5145
if (map) {
5146
tcpu = map->cpus[reciprocal_scale(hash, map->len)];
5147
if (cpu_online(tcpu)) {
5148
cpu = tcpu;
5149
goto done;
5150
}
5151
}
5152
5153
done:
5154
return cpu;
5155
}
5156
5157
#ifdef CONFIG_RFS_ACCEL
5158
5159
/**
5160
* rps_may_expire_flow - check whether an RFS hardware filter may be removed
5161
* @dev: Device on which the filter was set
5162
* @rxq_index: RX queue index
5163
* @flow_id: Flow ID passed to ndo_rx_flow_steer()
5164
* @filter_id: Filter ID returned by ndo_rx_flow_steer()
5165
*
5166
* Drivers that implement ndo_rx_flow_steer() should periodically call
5167
* this function for each installed filter and remove the filters for
5168
* which it returns %true.
5169
*/
5170
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
5171
u32 flow_id, u16 filter_id)
5172
{
5173
struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
5174
struct rps_dev_flow_table *flow_table;
5175
struct rps_dev_flow *rflow;
5176
bool expire = true;
5177
5178
rcu_read_lock();
5179
flow_table = rcu_dereference(rxqueue->rps_flow_table);
5180
if (flow_table && flow_id < (1UL << flow_table->log)) {
5181
unsigned int cpu;
5182
5183
rflow = &flow_table->flows[flow_id];
5184
cpu = READ_ONCE(rflow->cpu);
5185
if (READ_ONCE(rflow->filter) == filter_id &&
5186
rps_flow_is_active(rflow, flow_table, cpu))
5187
expire = false;
5188
}
5189
rcu_read_unlock();
5190
return expire;
5191
}
5192
EXPORT_SYMBOL(rps_may_expire_flow);
5193
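/* Illustrative driver-side expiry scan (sketch; the filter bookkeeping is
 * hypothetical): a driver implementing ndo_rx_flow_steer() periodically
 * walks its installed ARFS filters and removes the stale ones:
 *
 *      if (rps_may_expire_flow(netdev, filter->rxq_index,
 *                              filter->flow_id, filter->filter_id))
 *              foo_remove_filter(netdev, filter);
 */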
5194
#endif /* CONFIG_RFS_ACCEL */
5195
5196
/* Called from hardirq (IPI) context */
5197
static void rps_trigger_softirq(void *data)
5198
{
5199
struct softnet_data *sd = data;
5200
5201
____napi_schedule(sd, &sd->backlog);
5202
/* Pairs with READ_ONCE() in softnet_seq_show() */
5203
WRITE_ONCE(sd->received_rps, sd->received_rps + 1);
5204
}
5205
5206
#endif /* CONFIG_RPS */
5207
5208
/* Called from hardirq (IPI) context */
5209
static void trigger_rx_softirq(void *data)
5210
{
5211
struct softnet_data *sd = data;
5212
5213
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
5214
smp_store_release(&sd->defer_ipi_scheduled, 0);
5215
}
5216
5217
/*
5218
* After we queued a packet into sd->input_pkt_queue,
5219
* we need to make sure this queue is serviced soon.
5220
*
5221
* - If this is another cpu queue, link it to our rps_ipi_list,
5222
* and make sure we will process rps_ipi_list from net_rx_action().
5223
*
5224
* - If this is our own queue, NAPI schedule our backlog.
5225
* Note that this also raises NET_RX_SOFTIRQ.
5226
*/
5227
static void napi_schedule_rps(struct softnet_data *sd)
5228
{
5229
struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
5230
5231
#ifdef CONFIG_RPS
5232
if (sd != mysd) {
5233
if (use_backlog_threads()) {
5234
__napi_schedule_irqoff(&sd->backlog);
5235
return;
5236
}
5237
5238
sd->rps_ipi_next = mysd->rps_ipi_list;
5239
mysd->rps_ipi_list = sd;
5240
5241
/* If not called from net_rx_action() or napi_threaded_poll()
5242
* we have to raise NET_RX_SOFTIRQ.
5243
*/
5244
if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
5245
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
5246
return;
5247
}
5248
#endif /* CONFIG_RPS */
5249
__napi_schedule_irqoff(&mysd->backlog);
5250
}
5251
5252
void kick_defer_list_purge(unsigned int cpu)
5253
{
5254
struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5255
unsigned long flags;
5256
5257
if (use_backlog_threads()) {
5258
backlog_lock_irq_save(sd, &flags);
5259
5260
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
5261
__napi_schedule_irqoff(&sd->backlog);
5262
5263
backlog_unlock_irq_restore(sd, &flags);
5264
5265
} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
5266
smp_call_function_single_async(cpu, &sd->defer_csd);
5267
}
5268
}
5269
5270
#ifdef CONFIG_NET_FLOW_LIMIT
5271
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
5272
#endif
5273
5274
static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen,
5275
int max_backlog)
5276
{
5277
#ifdef CONFIG_NET_FLOW_LIMIT
5278
unsigned int old_flow, new_flow;
5279
const struct softnet_data *sd;
5280
struct sd_flow_limit *fl;
5281
5282
if (likely(qlen < (max_backlog >> 1)))
5283
return false;
5284
5285
sd = this_cpu_ptr(&softnet_data);
5286
5287
rcu_read_lock();
5288
fl = rcu_dereference(sd->flow_limit);
5289
if (fl) {
5290
new_flow = hash_32(skb_get_hash(skb), fl->log_buckets);
5291
old_flow = fl->history[fl->history_head];
5292
fl->history[fl->history_head] = new_flow;
5293
5294
fl->history_head++;
5295
fl->history_head &= FLOW_LIMIT_HISTORY - 1;
5296
5297
if (likely(fl->buckets[old_flow]))
5298
fl->buckets[old_flow]--;
5299
5300
if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
5301
/* Pairs with READ_ONCE() in softnet_seq_show() */
5302
WRITE_ONCE(fl->count, fl->count + 1);
5303
rcu_read_unlock();
5304
return true;
5305
}
5306
}
5307
rcu_read_unlock();
5308
#endif
5309
return false;
5310
}
5311
5312
/*
5313
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
5314
* queue (may be a remote CPU queue).
5315
*/
5316
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
5317
unsigned int *qtail)
5318
{
5319
enum skb_drop_reason reason;
5320
struct softnet_data *sd;
5321
unsigned long flags;
5322
unsigned int qlen;
5323
int max_backlog;
5324
u32 tail;
5325
5326
reason = SKB_DROP_REASON_DEV_READY;
5327
if (unlikely(!netif_running(skb->dev)))
5328
goto bad_dev;
5329
5330
sd = &per_cpu(softnet_data, cpu);
5331
5332
qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
5333
max_backlog = READ_ONCE(net_hotdata.max_backlog);
5334
if (unlikely(qlen > max_backlog) ||
5335
skb_flow_limit(skb, qlen, max_backlog))
5336
goto cpu_backlog_drop;
5337
backlog_lock_irq_save(sd, &flags);
5338
qlen = skb_queue_len(&sd->input_pkt_queue);
5339
if (likely(qlen <= max_backlog)) {
5340
if (!qlen) {
5341
/* Schedule NAPI for backlog device. We can use
5342
* a non-atomic operation as we own the queue lock.
5343
*/
5344
if (!__test_and_set_bit(NAPI_STATE_SCHED,
5345
&sd->backlog.state))
5346
napi_schedule_rps(sd);
5347
}
5348
__skb_queue_tail(&sd->input_pkt_queue, skb);
5349
tail = rps_input_queue_tail_incr(sd);
5350
backlog_unlock_irq_restore(sd, &flags);
5351
5352
/* save the tail outside of the critical section */
5353
rps_input_queue_tail_save(qtail, tail);
5354
return NET_RX_SUCCESS;
5355
}
5356
5357
backlog_unlock_irq_restore(sd, &flags);
5358
5359
cpu_backlog_drop:
5360
reason = SKB_DROP_REASON_CPU_BACKLOG;
5361
numa_drop_add(&sd->drop_counters, 1);
5362
bad_dev:
5363
dev_core_stats_rx_dropped_inc(skb->dev);
5364
kfree_skb_reason(skb, reason);
5365
return NET_RX_DROP;
5366
}
5367
5368
static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
5369
{
5370
struct net_device *dev = skb->dev;
5371
struct netdev_rx_queue *rxqueue;
5372
5373
rxqueue = dev->_rx;
5374
5375
if (skb_rx_queue_recorded(skb)) {
5376
u16 index = skb_get_rx_queue(skb);
5377
5378
if (unlikely(index >= dev->real_num_rx_queues)) {
5379
WARN_ONCE(dev->real_num_rx_queues > 1,
5380
"%s received packet on queue %u, but number "
5381
"of RX queues is %u\n",
5382
dev->name, index, dev->real_num_rx_queues);
5383
5384
return rxqueue; /* Return first rxqueue */
5385
}
5386
rxqueue += index;
5387
}
5388
return rxqueue;
5389
}
5390
5391
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
5392
const struct bpf_prog *xdp_prog)
5393
{
5394
void *orig_data, *orig_data_end, *hard_start;
5395
struct netdev_rx_queue *rxqueue;
5396
bool orig_bcast, orig_host;
5397
u32 mac_len, frame_sz;
5398
__be16 orig_eth_type;
5399
struct ethhdr *eth;
5400
u32 metalen, act;
5401
int off;
5402
5403
/* The XDP program wants to see the packet starting at the MAC
5404
* header.
5405
*/
5406
mac_len = skb->data - skb_mac_header(skb);
5407
hard_start = skb->data - skb_headroom(skb);
5408
5409
/* The SKB "head" area always has tailroom for skb_shared_info */
5410
frame_sz = (void *)skb_end_pointer(skb) - hard_start;
5411
frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
5412
5413
rxqueue = netif_get_rxqueue(skb);
5414
xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
5415
xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
5416
skb_headlen(skb) + mac_len, true);
5417
if (skb_is_nonlinear(skb)) {
5418
skb_shinfo(skb)->xdp_frags_size = skb->data_len;
5419
xdp_buff_set_frags_flag(xdp);
5420
} else {
5421
xdp_buff_clear_frags_flag(xdp);
5422
}
5423
5424
orig_data_end = xdp->data_end;
5425
orig_data = xdp->data;
5426
eth = (struct ethhdr *)xdp->data;
5427
orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
5428
orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
5429
orig_eth_type = eth->h_proto;
5430
5431
act = bpf_prog_run_xdp(xdp_prog, xdp);
5432
5433
/* check if bpf_xdp_adjust_head was used */
5434
off = xdp->data - orig_data;
5435
if (off) {
5436
if (off > 0)
5437
__skb_pull(skb, off);
5438
else if (off < 0)
5439
__skb_push(skb, -off);
5440
5441
skb->mac_header += off;
5442
skb_reset_network_header(skb);
5443
}
5444
5445
/* check if bpf_xdp_adjust_tail was used */
5446
off = xdp->data_end - orig_data_end;
5447
if (off != 0) {
5448
skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
5449
skb->len += off; /* positive on grow, negative on shrink */
5450
}
5451
5452
/* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
5453
* (e.g. bpf_xdp_adjust_tail), so we need to update data_len here.
5454
*/
5455
if (xdp_buff_has_frags(xdp))
5456
skb->data_len = skb_shinfo(skb)->xdp_frags_size;
5457
else
5458
skb->data_len = 0;
5459
5460
/* check if XDP changed the eth hdr such that the SKB needs an update */
5461
eth = (struct ethhdr *)xdp->data;
5462
if ((orig_eth_type != eth->h_proto) ||
5463
(orig_host != ether_addr_equal_64bits(eth->h_dest,
5464
skb->dev->dev_addr)) ||
5465
(orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
5466
__skb_push(skb, ETH_HLEN);
5467
skb->pkt_type = PACKET_HOST;
5468
skb->protocol = eth_type_trans(skb, skb->dev);
5469
}
5470
5471
/* Redirect/Tx gives an L2 packet; code that will reuse the skb must __skb_pull
5472
* before calling us again on redirect path. We do not call do_redirect
5473
* as we leave that up to the caller.
5474
*
5475
* Caller is responsible for managing lifetime of skb (i.e. calling
5476
* kfree_skb in response to actions it cannot handle/XDP_DROP).
5477
*/
5478
switch (act) {
5479
case XDP_REDIRECT:
5480
case XDP_TX:
5481
__skb_push(skb, mac_len);
5482
break;
5483
case XDP_PASS:
5484
metalen = xdp->data - xdp->data_meta;
5485
if (metalen)
5486
skb_metadata_set(skb, metalen);
5487
break;
5488
}
5489
5490
return act;
5491
}
5492
5493
static int
5494
netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
5495
{
5496
struct sk_buff *skb = *pskb;
5497
int err, hroom, troom;
5498
5499
local_lock_nested_bh(&system_page_pool.bh_lock);
5500
err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog);
5501
local_unlock_nested_bh(&system_page_pool.bh_lock);
5502
if (!err)
5503
return 0;
5504
5505
/* In case we have to go down the path and also linearize,
5506
* then let's do the pskb_expand_head() work just once here.
5507
*/
5508
hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
5509
troom = skb->tail + skb->data_len - skb->end;
5510
err = pskb_expand_head(skb,
5511
hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
5512
troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
5513
if (err)
5514
return err;
5515
5516
return skb_linearize(skb);
5517
}
5518
5519
static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
5520
struct xdp_buff *xdp,
5521
const struct bpf_prog *xdp_prog)
5522
{
5523
struct sk_buff *skb = *pskb;
5524
u32 mac_len, act = XDP_DROP;
5525
5526
/* Reinjected packets coming from act_mirred or similar should
5527
* not get XDP generic processing.
5528
*/
5529
if (skb_is_redirected(skb))
5530
return XDP_PASS;
5531
5532
/* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
5533
* bytes. This is the guarantee that native XDP also provides,
5534
* thus we need to do it here as well.
5535
*/
5536
mac_len = skb->data - skb_mac_header(skb);
5537
__skb_push(skb, mac_len);
5538
5539
if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
5540
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
5541
if (netif_skb_check_for_xdp(pskb, xdp_prog))
5542
goto do_drop;
5543
}
5544
5545
__skb_pull(*pskb, mac_len);
5546
5547
act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
5548
switch (act) {
5549
case XDP_REDIRECT:
5550
case XDP_TX:
5551
case XDP_PASS:
5552
break;
5553
default:
5554
bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
5555
fallthrough;
5556
case XDP_ABORTED:
5557
trace_xdp_exception((*pskb)->dev, xdp_prog, act);
5558
fallthrough;
5559
case XDP_DROP:
5560
do_drop:
5561
kfree_skb(*pskb);
5562
break;
5563
}
5564
5565
return act;
5566
}
5567
5568
/* When doing generic XDP we have to bypass the qdisc layer and the
5569
* network taps in order to match in-driver XDP behavior. This also means
5570
* that XDP packets are able to starve other packets going through a qdisc,
5571
* and DDoS attacks will be more effective. In-driver XDP uses dedicated TX
5572
* queues, so they do not have this starvation issue.
5573
*/
5574
void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog)
5575
{
5576
struct net_device *dev = skb->dev;
5577
struct netdev_queue *txq;
5578
bool free_skb = true;
5579
int cpu, rc;
5580
5581
txq = netdev_core_pick_tx(dev, skb, NULL);
5582
cpu = smp_processor_id();
5583
HARD_TX_LOCK(dev, txq, cpu);
5584
if (!netif_xmit_frozen_or_drv_stopped(txq)) {
5585
rc = netdev_start_xmit(skb, dev, txq, 0);
5586
if (dev_xmit_complete(rc))
5587
free_skb = false;
5588
}
5589
HARD_TX_UNLOCK(dev, txq);
5590
if (free_skb) {
5591
trace_xdp_exception(dev, xdp_prog, XDP_TX);
5592
dev_core_stats_tx_dropped_inc(dev);
5593
kfree_skb(skb);
5594
}
5595
}
5596
5597
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
5598
5599
int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb)
5600
{
5601
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
5602
5603
if (xdp_prog) {
5604
struct xdp_buff xdp;
5605
u32 act;
5606
int err;
5607
5608
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
5609
act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
5610
if (act != XDP_PASS) {
5611
switch (act) {
5612
case XDP_REDIRECT:
5613
err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
5614
&xdp, xdp_prog);
5615
if (err)
5616
goto out_redir;
5617
break;
5618
case XDP_TX:
5619
generic_xdp_tx(*pskb, xdp_prog);
5620
break;
5621
}
5622
bpf_net_ctx_clear(bpf_net_ctx);
5623
return XDP_DROP;
5624
}
5625
bpf_net_ctx_clear(bpf_net_ctx);
5626
}
5627
return XDP_PASS;
5628
out_redir:
5629
bpf_net_ctx_clear(bpf_net_ctx);
5630
kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
5631
return XDP_DROP;
5632
}
5633
EXPORT_SYMBOL_GPL(do_xdp_generic);
5634
5635
static int netif_rx_internal(struct sk_buff *skb)
5636
{
5637
int ret;
5638
5639
net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5640
5641
trace_netif_rx(skb);
5642
5643
#ifdef CONFIG_RPS
5644
if (static_branch_unlikely(&rps_needed)) {
5645
struct rps_dev_flow voidflow, *rflow = &voidflow;
5646
int cpu;
5647
5648
rcu_read_lock();
5649
5650
cpu = get_rps_cpu(skb->dev, skb, &rflow);
5651
if (cpu < 0)
5652
cpu = smp_processor_id();
5653
5654
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5655
5656
rcu_read_unlock();
5657
} else
5658
#endif
5659
{
5660
unsigned int qtail;
5661
5662
ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
5663
}
5664
return ret;
5665
}
5666
5667
/**
5668
* __netif_rx - Slightly optimized version of netif_rx
5669
* @skb: buffer to post
5670
*
5671
* This behaves as netif_rx except that it does not disable bottom halves.
5672
* As a result this function may only be invoked from the interrupt context
5673
* (either hard or soft interrupt).
5674
*/
5675
int __netif_rx(struct sk_buff *skb)
5676
{
5677
int ret;
5678
5679
lockdep_assert_once(hardirq_count() | softirq_count());
5680
5681
trace_netif_rx_entry(skb);
5682
ret = netif_rx_internal(skb);
5683
trace_netif_rx_exit(ret);
5684
return ret;
5685
}
5686
EXPORT_SYMBOL(__netif_rx);
5687
5688
/**
5689
* netif_rx - post buffer to the network code
5690
* @skb: buffer to post
5691
*
5692
* This function receives a packet from a device driver and queues it for
5693
* the upper (protocol) levels to process via the backlog NAPI device. It
5694
* always succeeds. The buffer may be dropped during processing for
5695
* congestion control or by the protocol layers.
5696
* The network buffer is passed via the backlog NAPI device. Modern NIC
5697
* driver should use NAPI and GRO.
5698
* This function can used from interrupt and from process context. The
5699
* caller from process context must not disable interrupts before invoking
5700
* this function.
5701
*
5702
* return values:
5703
* NET_RX_SUCCESS (no congestion)
5704
* NET_RX_DROP (packet was dropped)
5705
*
5706
*/
5707
int netif_rx(struct sk_buff *skb)
5708
{
5709
bool need_bh_off = !(hardirq_count() | softirq_count());
5710
int ret;
5711
5712
if (need_bh_off)
5713
local_bh_disable();
5714
trace_netif_rx_entry(skb);
5715
ret = netif_rx_internal(skb);
5716
trace_netif_rx_exit(ret);
5717
if (need_bh_off)
5718
local_bh_enable();
5719
return ret;
5720
}
5721
EXPORT_SYMBOL(netif_rx);
5722
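/* Illustrative legacy receive path (sketch): a non-NAPI driver builds the
 * skb in its interrupt handler and hands it to the stack:
 *
 *      skb->protocol = eth_type_trans(skb, dev);
 *      netif_rx(skb);
 */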
5723
static __latent_entropy void net_tx_action(void)
5724
{
5725
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5726
5727
if (sd->completion_queue) {
5728
struct sk_buff *clist;
5729
5730
local_irq_disable();
5731
clist = sd->completion_queue;
5732
sd->completion_queue = NULL;
5733
local_irq_enable();
5734
5735
while (clist) {
5736
struct sk_buff *skb = clist;
5737
5738
clist = clist->next;
5739
5740
WARN_ON(refcount_read(&skb->users));
5741
if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
5742
trace_consume_skb(skb, net_tx_action);
5743
else
5744
trace_kfree_skb(skb, net_tx_action,
5745
get_kfree_skb_cb(skb)->reason, NULL);
5746
5747
if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
5748
__kfree_skb(skb);
5749
else
5750
__napi_kfree_skb(skb,
5751
get_kfree_skb_cb(skb)->reason);
5752
}
5753
}
5754
5755
if (sd->output_queue) {
5756
struct Qdisc *head;
5757
5758
local_irq_disable();
5759
head = sd->output_queue;
5760
sd->output_queue = NULL;
5761
sd->output_queue_tailp = &sd->output_queue;
5762
local_irq_enable();
5763
5764
rcu_read_lock();
5765
5766
while (head) {
5767
spinlock_t *root_lock = NULL;
5768
struct sk_buff *to_free;
5769
struct Qdisc *q = head;
5770
5771
head = head->next_sched;
5772
5773
/* We need to make sure head->next_sched is read
5774
* before clearing __QDISC_STATE_SCHED
5775
*/
5776
smp_mb__before_atomic();
5777
5778
if (!(q->flags & TCQ_F_NOLOCK)) {
5779
root_lock = qdisc_lock(q);
5780
spin_lock(root_lock);
5781
} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
5782
&q->state))) {
5783
/* There is a synchronize_net() between
5784
* STATE_DEACTIVATED flag being set and
5785
* qdisc_reset()/some_qdisc_is_busy() in
5786
* dev_deactivate(), so we can safely bail out
5787
* early here to avoid data race between
5788
* qdisc_deactivate() and some_qdisc_is_busy()
5789
* for lockless qdisc.
5790
*/
5791
clear_bit(__QDISC_STATE_SCHED, &q->state);
5792
continue;
5793
}
5794
5795
clear_bit(__QDISC_STATE_SCHED, &q->state);
5796
to_free = qdisc_run(q);
5797
if (root_lock)
5798
spin_unlock(root_lock);
5799
tcf_kfree_skb_list(to_free);
5800
}
5801
5802
rcu_read_unlock();
5803
}
5804
5805
xfrm_dev_backlog(sd);
5806
}
5807
5808
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
5809
/* This hook is defined here for ATM LANE */
5810
int (*br_fdb_test_addr_hook)(struct net_device *dev,
5811
unsigned char *addr) __read_mostly;
5812
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
5813
#endif
5814
5815
/**
5816
* netdev_is_rx_handler_busy - check if receive handler is registered
5817
* @dev: device to check
5818
*
5819
* Check if a receive handler is already registered for a given device.
5820
* Return true if there is one.
5821
*
5822
* The caller must hold the rtnl_mutex.
5823
*/
5824
bool netdev_is_rx_handler_busy(struct net_device *dev)
5825
{
5826
ASSERT_RTNL();
5827
return dev && rtnl_dereference(dev->rx_handler);
5828
}
5829
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5830
5831
/**
5832
* netdev_rx_handler_register - register receive handler
5833
* @dev: device to register a handler for
5834
* @rx_handler: receive handler to register
5835
* @rx_handler_data: data pointer that is used by rx handler
5836
*
5837
* Register a receive handler for a device. This handler will then be
5838
* called from __netif_receive_skb. A negative errno code is returned
5839
* on a failure.
5840
*
5841
* The caller must hold the rtnl_mutex.
5842
*
5843
* For a general description of rx_handler, see enum rx_handler_result.
5844
*/
5845
int netdev_rx_handler_register(struct net_device *dev,
5846
rx_handler_func_t *rx_handler,
5847
void *rx_handler_data)
5848
{
5849
if (netdev_is_rx_handler_busy(dev))
5850
return -EBUSY;
5851
5852
if (dev->priv_flags & IFF_NO_RX_HANDLER)
5853
return -EINVAL;
5854
5855
/* Note: rx_handler_data must be set before rx_handler */
5856
rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5857
rcu_assign_pointer(dev->rx_handler, rx_handler);
5858
5859
return 0;
5860
}
5861
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
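/* Illustrative sketch, not part of the kernel build: how an upper device
 * (bridge/team style) might attach an rx_handler to a lower device under
 * RTNL. struct my_port, its upper_dev member and my_upper_attach_port()
 * are hypothetical.
 */
#if 0
static rx_handler_result_t my_upper_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);

	/* Redirect the frame to the upper device and let
	 * __netif_receive_skb_core() run another round for it.
	 */
	skb->dev = port->upper_dev;
	return RX_HANDLER_ANOTHER;
}

static int my_upper_attach_port(struct net_device *lower, struct my_port *port)
{
	ASSERT_RTNL();
	/* Returns -EBUSY if another handler already owns the device. */
	return netdev_rx_handler_register(lower, my_upper_handle_frame, port);
}
#endif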
5862
5863
/**
5864
* netdev_rx_handler_unregister - unregister receive handler
5865
* @dev: device to unregister a handler from
5866
*
5867
* Unregister a receive handler from a device.
5868
*
5869
* The caller must hold the rtnl_mutex.
5870
*/
5871
void netdev_rx_handler_unregister(struct net_device *dev)
5872
{
5873
5874
ASSERT_RTNL();
5875
RCU_INIT_POINTER(dev->rx_handler, NULL);
5876
/* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
* section is guaranteed to see a non-NULL rx_handler_data
* as well.
*/
5880
synchronize_net();
5881
RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5882
}
5883
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5884
5885
/*
5886
* Limit the use of PFMEMALLOC reserves to those protocols that implement
5887
* the special handling of PFMEMALLOC skbs.
5888
*/
5889
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5890
{
5891
switch (skb->protocol) {
5892
case htons(ETH_P_ARP):
5893
case htons(ETH_P_IP):
5894
case htons(ETH_P_IPV6):
5895
case htons(ETH_P_8021Q):
5896
case htons(ETH_P_8021AD):
5897
return true;
5898
default:
5899
return false;
5900
}
5901
}
5902
5903
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5904
int *ret, struct net_device *orig_dev)
5905
{
5906
if (nf_hook_ingress_active(skb)) {
5907
int ingress_retval;
5908
5909
if (unlikely(*pt_prev)) {
5910
*ret = deliver_skb(skb, *pt_prev, orig_dev);
5911
*pt_prev = NULL;
5912
}
5913
5914
rcu_read_lock();
5915
ingress_retval = nf_hook_ingress(skb);
5916
rcu_read_unlock();
5917
return ingress_retval;
5918
}
5919
return 0;
5920
}
5921
5922
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5923
struct packet_type **ppt_prev)
5924
{
5925
enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO;
5926
struct packet_type *ptype, *pt_prev;
5927
rx_handler_func_t *rx_handler;
5928
struct sk_buff *skb = *pskb;
5929
struct net_device *orig_dev;
5930
bool deliver_exact = false;
5931
int ret = NET_RX_DROP;
5932
__be16 type;
5933
5934
net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5935
5936
trace_netif_receive_skb(skb);
5937
5938
orig_dev = skb->dev;
5939
5940
skb_reset_network_header(skb);
5941
#if !defined(CONFIG_DEBUG_NET)
5942
/* We plan to no longer reset the transport header here.
5943
* Give some time to fuzzers and dev builds to catch bugs
5944
* in network stacks.
5945
*/
5946
if (!skb_transport_header_was_set(skb))
5947
skb_reset_transport_header(skb);
5948
#endif
5949
skb_reset_mac_len(skb);
5950
5951
pt_prev = NULL;
5952
5953
another_round:
5954
skb->skb_iif = skb->dev->ifindex;
5955
5956
__this_cpu_inc(softnet_data.processed);
5957
5958
if (static_branch_unlikely(&generic_xdp_needed_key)) {
5959
int ret2;
5960
5961
migrate_disable();
5962
ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
5963
&skb);
5964
migrate_enable();
5965
5966
if (ret2 != XDP_PASS) {
5967
ret = NET_RX_DROP;
5968
goto out;
5969
}
5970
}
5971
5972
if (eth_type_vlan(skb->protocol)) {
5973
skb = skb_vlan_untag(skb);
5974
if (unlikely(!skb))
5975
goto out;
5976
}
5977
5978
if (skb_skip_tc_classify(skb))
5979
goto skip_classify;
5980
5981
if (pfmemalloc)
5982
goto skip_taps;
5983
5984
list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
5985
list) {
5986
if (unlikely(pt_prev))
5987
ret = deliver_skb(skb, pt_prev, orig_dev);
5988
pt_prev = ptype;
5989
}
5990
5991
list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5992
if (unlikely(pt_prev))
5993
ret = deliver_skb(skb, pt_prev, orig_dev);
5994
pt_prev = ptype;
5995
}
5996
5997
skip_taps:
5998
#ifdef CONFIG_NET_INGRESS
5999
if (static_branch_unlikely(&ingress_needed_key)) {
6000
bool another = false;
6001
6002
nf_skip_egress(skb, true);
6003
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
6004
&another);
6005
if (another)
6006
goto another_round;
6007
if (!skb)
6008
goto out;
6009
6010
nf_skip_egress(skb, false);
6011
if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
6012
goto out;
6013
}
6014
#endif
6015
skb_reset_redirect(skb);
6016
skip_classify:
6017
if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) {
6018
drop_reason = SKB_DROP_REASON_PFMEMALLOC;
6019
goto drop;
6020
}
6021
6022
if (skb_vlan_tag_present(skb)) {
6023
if (unlikely(pt_prev)) {
6024
ret = deliver_skb(skb, pt_prev, orig_dev);
6025
pt_prev = NULL;
6026
}
6027
if (vlan_do_receive(&skb))
6028
goto another_round;
6029
else if (unlikely(!skb))
6030
goto out;
6031
}
6032
6033
rx_handler = rcu_dereference(skb->dev->rx_handler);
6034
if (rx_handler) {
6035
if (unlikely(pt_prev)) {
6036
ret = deliver_skb(skb, pt_prev, orig_dev);
6037
pt_prev = NULL;
6038
}
6039
switch (rx_handler(&skb)) {
6040
case RX_HANDLER_CONSUMED:
6041
ret = NET_RX_SUCCESS;
6042
goto out;
6043
case RX_HANDLER_ANOTHER:
6044
goto another_round;
6045
case RX_HANDLER_EXACT:
6046
deliver_exact = true;
6047
break;
6048
case RX_HANDLER_PASS:
6049
break;
6050
default:
6051
BUG();
6052
}
6053
}
6054
6055
if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
6056
check_vlan_id:
6057
if (skb_vlan_tag_get_id(skb)) {
6058
/* Vlan id is non 0 and vlan_do_receive() above couldn't
6059
* find vlan device.
6060
*/
6061
skb->pkt_type = PACKET_OTHERHOST;
6062
} else if (eth_type_vlan(skb->protocol)) {
6063
/* Outer header is 802.1P with vlan 0, inner header is
6064
* 802.1Q or 802.1AD and vlan_do_receive() above could
6065
* not find vlan dev for vlan id 0.
6066
*/
6067
__vlan_hwaccel_clear_tag(skb);
6068
skb = skb_vlan_untag(skb);
6069
if (unlikely(!skb))
6070
goto out;
6071
if (vlan_do_receive(&skb))
6072
/* After stripping off 802.1P header with vlan 0
6073
* vlan dev is found for inner header.
6074
*/
6075
goto another_round;
6076
else if (unlikely(!skb))
6077
goto out;
6078
else
6079
/* We have stripped outer 802.1P vlan 0 header.
6080
* But could not find vlan dev.
6081
* check again for vlan id to set OTHERHOST.
6082
*/
6083
goto check_vlan_id;
6084
}
6085
/* Note: we might in the future use prio bits
6086
* and set skb->priority like in vlan_do_receive()
6087
* For the time being, just ignore Priority Code Point
6088
*/
6089
__vlan_hwaccel_clear_tag(skb);
6090
}
6091
6092
type = skb->protocol;
6093
6094
/* deliver only exact match when indicated */
6095
if (likely(!deliver_exact)) {
6096
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
6097
&ptype_base[ntohs(type) &
6098
PTYPE_HASH_MASK]);
6099
6100
/* orig_dev and skb->dev could belong to different netns;
6101
* Even in such case we need to traverse only the list
6102
* coming from skb->dev, as the ptype owner (packet socket)
6103
* will use dev_net(skb->dev) to do namespace filtering.
6104
*/
6105
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
6106
&dev_net_rcu(skb->dev)->ptype_specific);
6107
}
6108
6109
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
6110
&orig_dev->ptype_specific);
6111
6112
if (unlikely(skb->dev != orig_dev)) {
6113
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
6114
&skb->dev->ptype_specific);
6115
}
6116
6117
if (pt_prev) {
6118
*ppt_prev = pt_prev;
6119
} else {
6120
drop:
6121
if (!deliver_exact)
6122
dev_core_stats_rx_dropped_inc(skb->dev);
6123
else
6124
dev_core_stats_rx_nohandler_inc(skb->dev);
6125
6126
kfree_skb_reason(skb, drop_reason);
6127
/* Jamal, now you will not be able to escape explaining
* to me how you were going to use this. :-)
*/
6130
ret = NET_RX_DROP;
6131
}
6132
6133
out:
6134
/* The invariant here is that if *ppt_prev is not NULL
6135
* then skb should also be non-NULL.
6136
*
6137
* Apparently *ppt_prev assignment above holds this invariant due to
6138
* skb dereferencing near it.
6139
*/
6140
*pskb = skb;
6141
return ret;
6142
}
6143
6144
static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
6145
{
6146
struct net_device *orig_dev = skb->dev;
6147
struct packet_type *pt_prev = NULL;
6148
int ret;
6149
6150
ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
6151
if (pt_prev)
6152
ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
6153
skb->dev, pt_prev, orig_dev);
6154
return ret;
6155
}
6156
6157
/**
6158
* netif_receive_skb_core - special purpose version of netif_receive_skb
6159
* @skb: buffer to process
6160
*
6161
* More direct receive version of netif_receive_skb(). It should
6162
* only be used by callers that have a need to skip RPS and Generic XDP.
6163
* Caller must also take care of handling if ``(page_is_)pfmemalloc``.
6164
*
6165
* This function may only be called from softirq context and interrupts
6166
* should be enabled.
6167
*
6168
* Return values (usually ignored):
6169
* NET_RX_SUCCESS: no congestion
6170
* NET_RX_DROP: packet was dropped
6171
*/
6172
int netif_receive_skb_core(struct sk_buff *skb)
6173
{
6174
int ret;
6175
6176
rcu_read_lock();
6177
ret = __netif_receive_skb_one_core(skb, false);
6178
rcu_read_unlock();
6179
6180
return ret;
6181
}
6182
EXPORT_SYMBOL(netif_receive_skb_core);
6183
6184
static inline void __netif_receive_skb_list_ptype(struct list_head *head,
6185
struct packet_type *pt_prev,
6186
struct net_device *orig_dev)
6187
{
6188
struct sk_buff *skb, *next;
6189
6190
if (!pt_prev)
6191
return;
6192
if (list_empty(head))
6193
return;
6194
if (pt_prev->list_func != NULL)
6195
INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
6196
ip_list_rcv, head, pt_prev, orig_dev);
6197
else
6198
list_for_each_entry_safe(skb, next, head, list) {
6199
skb_list_del_init(skb);
6200
pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
6201
}
6202
}
6203
6204
static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
6205
{
6206
/* Fast-path assumptions:
6207
* - There is no RX handler.
6208
* - Only one packet_type matches.
6209
* If either of these fails, we will end up doing some per-packet
6210
* processing in-line, then handling the 'last ptype' for the whole
6211
* sublist. This can't cause out-of-order delivery to any single ptype,
6212
* because the 'last ptype' must be constant across the sublist, and all
6213
* other ptypes are handled per-packet.
6214
*/
6215
/* Current (common) ptype of sublist */
6216
struct packet_type *pt_curr = NULL;
6217
/* Current (common) orig_dev of sublist */
6218
struct net_device *od_curr = NULL;
6219
struct sk_buff *skb, *next;
6220
LIST_HEAD(sublist);
6221
6222
list_for_each_entry_safe(skb, next, head, list) {
6223
struct net_device *orig_dev = skb->dev;
6224
struct packet_type *pt_prev = NULL;
6225
6226
skb_list_del_init(skb);
6227
__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
6228
if (!pt_prev)
6229
continue;
6230
if (pt_curr != pt_prev || od_curr != orig_dev) {
6231
/* dispatch old sublist */
6232
__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
6233
/* start new sublist */
6234
INIT_LIST_HEAD(&sublist);
6235
pt_curr = pt_prev;
6236
od_curr = orig_dev;
6237
}
6238
list_add_tail(&skb->list, &sublist);
6239
}
6240
6241
/* dispatch final sublist */
6242
__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
6243
}
6244
6245
static int __netif_receive_skb(struct sk_buff *skb)
6246
{
6247
int ret;
6248
6249
if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
6250
unsigned int noreclaim_flag;
6251
6252
/*
6253
* PFMEMALLOC skbs are special, they should
6254
* - be delivered to SOCK_MEMALLOC sockets only
6255
* - stay away from userspace
6256
* - have bounded memory usage
6257
*
6258
* Use PF_MEMALLOC as this saves us from propagating the allocation
6259
* context down to all allocation sites.
6260
*/
6261
noreclaim_flag = memalloc_noreclaim_save();
6262
ret = __netif_receive_skb_one_core(skb, true);
6263
memalloc_noreclaim_restore(noreclaim_flag);
6264
} else
6265
ret = __netif_receive_skb_one_core(skb, false);
6266
6267
return ret;
6268
}
6269
6270
static void __netif_receive_skb_list(struct list_head *head)
6271
{
6272
unsigned long noreclaim_flag = 0;
6273
struct sk_buff *skb, *next;
6274
bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
6275
6276
list_for_each_entry_safe(skb, next, head, list) {
6277
if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
6278
struct list_head sublist;
6279
6280
/* Handle the previous sublist */
6281
list_cut_before(&sublist, head, &skb->list);
6282
if (!list_empty(&sublist))
6283
__netif_receive_skb_list_core(&sublist, pfmemalloc);
6284
pfmemalloc = !pfmemalloc;
6285
/* See comments in __netif_receive_skb */
6286
if (pfmemalloc)
6287
noreclaim_flag = memalloc_noreclaim_save();
6288
else
6289
memalloc_noreclaim_restore(noreclaim_flag);
6290
}
6291
}
6292
/* Handle the remaining sublist */
6293
if (!list_empty(head))
6294
__netif_receive_skb_list_core(head, pfmemalloc);
6295
/* Restore pflags */
6296
if (pfmemalloc)
6297
memalloc_noreclaim_restore(noreclaim_flag);
6298
}
6299
6300
static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
6301
{
6302
struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
6303
struct bpf_prog *new = xdp->prog;
6304
int ret = 0;
6305
6306
switch (xdp->command) {
6307
case XDP_SETUP_PROG:
6308
rcu_assign_pointer(dev->xdp_prog, new);
6309
if (old)
6310
bpf_prog_put(old);
6311
6312
if (old && !new) {
6313
static_branch_dec(&generic_xdp_needed_key);
6314
} else if (new && !old) {
6315
static_branch_inc(&generic_xdp_needed_key);
6316
netif_disable_lro(dev);
6317
dev_disable_gro_hw(dev);
6318
}
6319
break;
6320
6321
default:
6322
ret = -EINVAL;
6323
break;
6324
}
6325
6326
return ret;
6327
}
6328
6329
static int netif_receive_skb_internal(struct sk_buff *skb)
6330
{
6331
int ret;
6332
6333
net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
6334
6335
if (skb_defer_rx_timestamp(skb))
6336
return NET_RX_SUCCESS;
6337
6338
rcu_read_lock();
6339
#ifdef CONFIG_RPS
6340
if (static_branch_unlikely(&rps_needed)) {
6341
struct rps_dev_flow voidflow, *rflow = &voidflow;
6342
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
6343
6344
if (cpu >= 0) {
6345
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
6346
rcu_read_unlock();
6347
return ret;
6348
}
6349
}
6350
#endif
6351
ret = __netif_receive_skb(skb);
6352
rcu_read_unlock();
6353
return ret;
6354
}
6355
6356
void netif_receive_skb_list_internal(struct list_head *head)
6357
{
6358
struct sk_buff *skb, *next;
6359
LIST_HEAD(sublist);
6360
6361
list_for_each_entry_safe(skb, next, head, list) {
6362
net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
6363
skb);
6364
skb_list_del_init(skb);
6365
if (!skb_defer_rx_timestamp(skb))
6366
list_add_tail(&skb->list, &sublist);
6367
}
6368
list_splice_init(&sublist, head);
6369
6370
rcu_read_lock();
6371
#ifdef CONFIG_RPS
6372
if (static_branch_unlikely(&rps_needed)) {
6373
list_for_each_entry_safe(skb, next, head, list) {
6374
struct rps_dev_flow voidflow, *rflow = &voidflow;
6375
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
6376
6377
if (cpu >= 0) {
6378
/* Will be handled, remove from list */
6379
skb_list_del_init(skb);
6380
enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
6381
}
6382
}
6383
}
6384
#endif
6385
__netif_receive_skb_list(head);
6386
rcu_read_unlock();
6387
}
6388
6389
/**
6390
* netif_receive_skb - process receive buffer from network
6391
* @skb: buffer to process
6392
*
6393
* netif_receive_skb() is the main receive data processing function.
6394
* It always succeeds. The buffer may be dropped during processing
6395
* for congestion control or by the protocol layers.
6396
*
6397
* This function may only be called from softirq context and interrupts
6398
* should be enabled.
6399
*
6400
* Return values (usually ignored):
6401
* NET_RX_SUCCESS: no congestion
6402
* NET_RX_DROP: packet was dropped
6403
*/
6404
int netif_receive_skb(struct sk_buff *skb)
6405
{
6406
int ret;
6407
6408
trace_netif_receive_skb_entry(skb);
6409
6410
ret = netif_receive_skb_internal(skb);
6411
trace_netif_receive_skb_exit(ret);
6412
6413
return ret;
6414
}
6415
EXPORT_SYMBOL(netif_receive_skb);
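/* Illustrative sketch, not part of the kernel build: a minimal NAPI poll
 * callback delivering frames with netif_receive_skb() from softirq context.
 * Real drivers usually prefer napi_gro_receive(); struct my_ring and
 * my_ring_next_frame() are hypothetical.
 */
#if 0
static int my_rx_poll(struct napi_struct *napi, int budget)
{
	struct my_ring *ring = container_of(napi, struct my_ring, napi);
	int work = 0;

	while (work < budget) {
		struct sk_buff *skb = my_ring_next_frame(ring);

		if (!skb)
			break;
		skb->protocol = eth_type_trans(skb, ring->netdev);
		netif_receive_skb(skb);
		work++;
	}
	if (work < budget)
		napi_complete_done(napi, work);
	return work;
}
#endif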
6416
6417
/**
6418
* netif_receive_skb_list - process many receive buffers from network
6419
* @head: list of skbs to process.
6420
*
6421
* Since the return value of netif_receive_skb() is normally ignored, and
6422
* wouldn't be meaningful for a list, this function returns void.
6423
*
6424
* This function may only be called from softirq context and interrupts
6425
* should be enabled.
6426
*/
6427
void netif_receive_skb_list(struct list_head *head)
6428
{
6429
struct sk_buff *skb;
6430
6431
if (list_empty(head))
6432
return;
6433
if (trace_netif_receive_skb_list_entry_enabled()) {
6434
list_for_each_entry(skb, head, list)
6435
trace_netif_receive_skb_list_entry(skb);
6436
}
6437
netif_receive_skb_list_internal(head);
6438
trace_netif_receive_skb_list_exit(0);
6439
}
6440
EXPORT_SYMBOL(netif_receive_skb_list);
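/* Illustrative sketch, not part of the kernel build: batching received
 * frames and handing them to the stack in one netif_receive_skb_list()
 * call. struct my_ring and my_ring_next_frame() are hypothetical.
 */
#if 0
static void my_rx_batch(struct my_ring *ring, int budget)
{
	LIST_HEAD(rx_list);
	struct sk_buff *skb;
	int work = 0;

	while (work < budget && (skb = my_ring_next_frame(ring))) {
		skb->protocol = eth_type_trans(skb, ring->netdev);
		list_add_tail(&skb->list, &rx_list);
		work++;
	}
	/* netif_receive_skb_list() returns immediately on an empty list. */
	netif_receive_skb_list(&rx_list);
}
#endif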
6441
6442
/* Network device is going away, flush any packets still pending */
6443
static void flush_backlog(struct work_struct *work)
6444
{
6445
struct sk_buff *skb, *tmp;
6446
struct sk_buff_head list;
6447
struct softnet_data *sd;
6448
6449
__skb_queue_head_init(&list);
6450
local_bh_disable();
6451
sd = this_cpu_ptr(&softnet_data);
6452
6453
backlog_lock_irq_disable(sd);
6454
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6455
if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
6456
__skb_unlink(skb, &sd->input_pkt_queue);
6457
__skb_queue_tail(&list, skb);
6458
rps_input_queue_head_incr(sd);
6459
}
6460
}
6461
backlog_unlock_irq_enable(sd);
6462
6463
local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
6464
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
6465
if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
6466
__skb_unlink(skb, &sd->process_queue);
6467
__skb_queue_tail(&list, skb);
6468
rps_input_queue_head_incr(sd);
6469
}
6470
}
6471
local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
6472
local_bh_enable();
6473
6474
__skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY);
6475
}
6476
6477
static bool flush_required(int cpu)
6478
{
6479
#if IS_ENABLED(CONFIG_RPS)
6480
struct softnet_data *sd = &per_cpu(softnet_data, cpu);
6481
bool do_flush;
6482
6483
backlog_lock_irq_disable(sd);
6484
6485
/* as insertion into process_queue happens with the rps lock held,
6486
* process_queue access may race only with dequeue
6487
*/
6488
do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
6489
!skb_queue_empty_lockless(&sd->process_queue);
6490
backlog_unlock_irq_enable(sd);
6491
6492
return do_flush;
6493
#endif
6494
/* without RPS we can't safely check input_pkt_queue: during a
6495
* concurrent remote skb_queue_splice() we can detect as empty both
6496
* input_pkt_queue and process_queue even if the latter could end up
6497
* containing a lot of packets.
6498
*/
6499
return true;
6500
}
6501
6502
struct flush_backlogs {
6503
cpumask_t flush_cpus;
6504
struct work_struct w[];
6505
};
6506
6507
static struct flush_backlogs *flush_backlogs_alloc(void)
6508
{
6509
return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids),
6510
GFP_KERNEL);
6511
}
6512
6513
static struct flush_backlogs *flush_backlogs_fallback;
6514
static DEFINE_MUTEX(flush_backlogs_mutex);
6515
6516
static void flush_all_backlogs(void)
6517
{
6518
struct flush_backlogs *ptr = flush_backlogs_alloc();
6519
unsigned int cpu;
6520
6521
if (!ptr) {
6522
mutex_lock(&flush_backlogs_mutex);
6523
ptr = flush_backlogs_fallback;
6524
}
6525
cpumask_clear(&ptr->flush_cpus);
6526
6527
cpus_read_lock();
6528
6529
for_each_online_cpu(cpu) {
6530
if (flush_required(cpu)) {
6531
INIT_WORK(&ptr->w[cpu], flush_backlog);
6532
queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]);
6533
__cpumask_set_cpu(cpu, &ptr->flush_cpus);
6534
}
6535
}
6536
6537
/* We can have in-flight packets on the cpus we are not flushing;
* synchronize_net() in unregister_netdevice_many() will take care of
* them.
*/
6541
for_each_cpu(cpu, &ptr->flush_cpus)
6542
flush_work(&ptr->w[cpu]);
6543
6544
cpus_read_unlock();
6545
6546
if (ptr != flush_backlogs_fallback)
6547
kfree(ptr);
6548
else
6549
mutex_unlock(&flush_backlogs_mutex);
6550
}
6551
6552
static void net_rps_send_ipi(struct softnet_data *remsd)
6553
{
6554
#ifdef CONFIG_RPS
6555
while (remsd) {
6556
struct softnet_data *next = remsd->rps_ipi_next;
6557
6558
if (cpu_online(remsd->cpu))
6559
smp_call_function_single_async(remsd->cpu, &remsd->csd);
6560
remsd = next;
6561
}
6562
#endif
6563
}
6564
6565
/*
6566
* net_rps_action_and_irq_enable sends any pending IPI's for rps.
6567
* Note: called with local irq disabled, but exits with local irq enabled.
6568
*/
6569
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
6570
{
6571
#ifdef CONFIG_RPS
6572
struct softnet_data *remsd = sd->rps_ipi_list;
6573
6574
if (!use_backlog_threads() && remsd) {
6575
sd->rps_ipi_list = NULL;
6576
6577
local_irq_enable();
6578
6579
/* Send pending IPI's to kick RPS processing on remote cpus. */
6580
net_rps_send_ipi(remsd);
6581
} else
6582
#endif
6583
local_irq_enable();
6584
}
6585
6586
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
6587
{
6588
#ifdef CONFIG_RPS
6589
return !use_backlog_threads() && sd->rps_ipi_list;
6590
#else
6591
return false;
6592
#endif
6593
}
6594
6595
static int process_backlog(struct napi_struct *napi, int quota)
6596
{
6597
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6598
bool again = true;
6599
int work = 0;
6600
6601
/* Check if we have pending IPIs; it's better to send them now
* rather than waiting for net_rx_action() to end.
*/
6604
if (sd_has_rps_ipi_waiting(sd)) {
6605
local_irq_disable();
6606
net_rps_action_and_irq_enable(sd);
6607
}
6608
6609
napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
6610
while (again) {
6611
struct sk_buff *skb;
6612
6613
local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
6614
while ((skb = __skb_dequeue(&sd->process_queue))) {
6615
local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
6616
rcu_read_lock();
6617
__netif_receive_skb(skb);
6618
rcu_read_unlock();
6619
if (++work >= quota) {
6620
rps_input_queue_head_add(sd, work);
6621
return work;
6622
}
6623
6624
local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
6625
}
6626
local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
6627
6628
backlog_lock_irq_disable(sd);
6629
if (skb_queue_empty(&sd->input_pkt_queue)) {
6630
/*
* Inline a custom version of __napi_complete().
* Only the current CPU owns and manipulates this napi,
* and NAPI_STATE_SCHED is the only possible flag set
* on backlog.
* We can use a plain write instead of clear_bit(),
* and we don't need an smp_mb() memory barrier.
*/
6638
napi->state &= NAPIF_STATE_THREADED;
6639
again = false;
6640
} else {
6641
local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
6642
skb_queue_splice_tail_init(&sd->input_pkt_queue,
6643
&sd->process_queue);
6644
local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
6645
}
6646
backlog_unlock_irq_enable(sd);
6647
}
6648
6649
if (work)
6650
rps_input_queue_head_add(sd, work);
6651
return work;
6652
}
6653
6654
/**
6655
* __napi_schedule - schedule for receive
6656
* @n: entry to schedule
6657
*
6658
* The entry's receive function will be scheduled to run.
6659
* Consider using __napi_schedule_irqoff() if hard irqs are masked.
6660
*/
6661
void __napi_schedule(struct napi_struct *n)
6662
{
6663
unsigned long flags;
6664
6665
local_irq_save(flags);
6666
____napi_schedule(this_cpu_ptr(&softnet_data), n);
6667
local_irq_restore(flags);
6668
}
6669
EXPORT_SYMBOL(__napi_schedule);
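/* Illustrative sketch, not part of the kernel build: the usual hard-irq
 * handler pattern for a NAPI driver, masking device interrupts and
 * scheduling the NAPI instance. struct my_ring and my_disable_irqs()
 * are hypothetical.
 */
#if 0
static irqreturn_t my_napi_isr(int irq, void *data)
{
	struct my_ring *ring = data;

	if (napi_schedule_prep(&ring->napi)) {
		/* Stop further device interrupts until the poll completes. */
		my_disable_irqs(ring);
		__napi_schedule_irqoff(&ring->napi);
	}
	return IRQ_HANDLED;
}
#endif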
6670
6671
/**
6672
* napi_schedule_prep - check if napi can be scheduled
6673
* @n: napi context
6674
*
6675
* Test if NAPI routine is already running, and if not mark
6676
* it as running. This is used as a condition variable to
6677
* ensure only one NAPI poll instance runs. We also make
6678
* sure there is no pending NAPI disable.
6679
*/
6680
bool napi_schedule_prep(struct napi_struct *n)
6681
{
6682
unsigned long new, val = READ_ONCE(n->state);
6683
6684
do {
6685
if (unlikely(val & NAPIF_STATE_DISABLE))
6686
return false;
6687
new = val | NAPIF_STATE_SCHED;
6688
6689
/* Sets STATE_MISSED bit if STATE_SCHED was already set.
* This was suggested by Alexander Duyck, as the compiler
* emits better code than:
6692
* if (val & NAPIF_STATE_SCHED)
6693
* new |= NAPIF_STATE_MISSED;
6694
*/
6695
new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6696
NAPIF_STATE_MISSED;
6697
} while (!try_cmpxchg(&n->state, &val, new));
6698
6699
return !(val & NAPIF_STATE_SCHED);
6700
}
6701
EXPORT_SYMBOL(napi_schedule_prep);
6702
6703
/**
6704
* __napi_schedule_irqoff - schedule for receive
6705
* @n: entry to schedule
6706
*
6707
* Variant of __napi_schedule() assuming hard irqs are masked.
6708
*
6709
* On PREEMPT_RT enabled kernels this maps to __napi_schedule()
6710
* because the interrupt disabled assumption might not be true
6711
* due to force-threaded interrupts and spinlock substitution.
6712
*/
6713
void __napi_schedule_irqoff(struct napi_struct *n)
6714
{
6715
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6716
____napi_schedule(this_cpu_ptr(&softnet_data), n);
6717
else
6718
__napi_schedule(n);
6719
}
6720
EXPORT_SYMBOL(__napi_schedule_irqoff);
6721
6722
bool napi_complete_done(struct napi_struct *n, int work_done)
6723
{
6724
unsigned long flags, val, new, timeout = 0;
6725
bool ret = true;
6726
6727
/*
6728
* 1) Don't let napi dequeue from the cpu poll list
6729
* just in case it's running on a different cpu.
6730
* 2) If we are busy polling, do nothing here, we have
6731
* the guarantee we will be called later.
6732
*/
6733
if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6734
NAPIF_STATE_IN_BUSY_POLL)))
6735
return false;
6736
6737
if (work_done) {
6738
if (n->gro.bitmask)
6739
timeout = napi_get_gro_flush_timeout(n);
6740
n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
6741
}
6742
if (n->defer_hard_irqs_count > 0) {
6743
n->defer_hard_irqs_count--;
6744
timeout = napi_get_gro_flush_timeout(n);
6745
if (timeout)
6746
ret = false;
6747
}
6748
6749
/*
6750
* When the NAPI instance uses a timeout and keeps postponing
6751
* it, we need to bound somehow the time packets are kept in
6752
* the GRO layer.
6753
*/
6754
gro_flush_normal(&n->gro, !!timeout);
6755
6756
if (unlikely(!list_empty(&n->poll_list))) {
6757
/* If n->poll_list is not empty, we need to mask irqs */
6758
local_irq_save(flags);
6759
list_del_init(&n->poll_list);
6760
local_irq_restore(flags);
6761
}
6762
WRITE_ONCE(n->list_owner, -1);
6763
6764
val = READ_ONCE(n->state);
6765
do {
6766
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6767
6768
new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
6769
NAPIF_STATE_SCHED_THREADED |
6770
NAPIF_STATE_PREFER_BUSY_POLL);
6771
6772
/* If STATE_MISSED was set, leave STATE_SCHED set,
6773
* because we will call napi->poll() one more time.
6774
* This C code was suggested by Alexander Duyck to help gcc.
6775
*/
6776
new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6777
NAPIF_STATE_SCHED;
6778
} while (!try_cmpxchg(&n->state, &val, new));
6779
6780
if (unlikely(val & NAPIF_STATE_MISSED)) {
6781
__napi_schedule(n);
6782
return false;
6783
}
6784
6785
if (timeout)
6786
hrtimer_start(&n->timer, ns_to_ktime(timeout),
6787
HRTIMER_MODE_REL_PINNED);
6788
return ret;
6789
}
6790
EXPORT_SYMBOL(napi_complete_done);
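/* Illustrative sketch, not part of the kernel build: the canonical end of a
 * NAPI poll routine. If the budget was not exhausted, try to complete the
 * NAPI instance and only then re-enable device interrupts. struct my_ring,
 * my_clean_rx() and my_enable_irqs() are hypothetical.
 */
#if 0
static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_ring *ring = container_of(napi, struct my_ring, napi);
	int work = my_clean_rx(ring, budget);

	/* napi_complete_done() may return false (deferred irqs or a GRO
	 * flush timeout); in that case interrupts stay masked.
	 */
	if (work < budget && napi_complete_done(napi, work))
		my_enable_irqs(ring);
	return work;
}
#endif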
6791
6792
static void skb_defer_free_flush(void)
6793
{
6794
struct llist_node *free_list;
6795
struct sk_buff *skb, *next;
6796
struct skb_defer_node *sdn;
6797
int node;
6798
6799
for_each_node(node) {
6800
sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node;
6801
6802
if (llist_empty(&sdn->defer_list))
6803
continue;
6804
atomic_long_set(&sdn->defer_count, 0);
6805
free_list = llist_del_all(&sdn->defer_list);
6806
6807
llist_for_each_entry_safe(skb, next, free_list, ll_node) {
6808
prefetch(next);
6809
napi_consume_skb(skb, 1);
6810
}
6811
}
6812
}
6813
6814
#if defined(CONFIG_NET_RX_BUSY_POLL)
6815
6816
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
6817
{
6818
if (!skip_schedule) {
6819
gro_normal_list(&napi->gro);
6820
__napi_schedule(napi);
6821
return;
6822
}
6823
6824
/* Flush too old packets. If HZ < 1000, flush all packets */
6825
gro_flush_normal(&napi->gro, HZ >= 1000);
6826
6827
clear_bit(NAPI_STATE_SCHED, &napi->state);
6828
}
6829
6830
enum {
6831
NAPI_F_PREFER_BUSY_POLL = 1,
6832
NAPI_F_END_ON_RESCHED = 2,
6833
};
6834
6835
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
6836
unsigned flags, u16 budget)
6837
{
6838
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
6839
bool skip_schedule = false;
6840
unsigned long timeout;
6841
int rc;
6842
6843
/* Busy polling means there is a high chance device driver hard irq
6844
* could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6845
* set in napi_schedule_prep().
6846
* Since we are about to call napi->poll() once more, we can safely
6847
* clear NAPI_STATE_MISSED.
6848
*
6849
* Note: x86 could use a single "lock and ..." instruction
6850
* to perform these two clear_bit()
6851
*/
6852
clear_bit(NAPI_STATE_MISSED, &napi->state);
6853
clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6854
6855
local_bh_disable();
6856
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
6857
6858
if (flags & NAPI_F_PREFER_BUSY_POLL) {
6859
napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
6860
timeout = napi_get_gro_flush_timeout(napi);
6861
if (napi->defer_hard_irqs_count && timeout) {
6862
hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
6863
skip_schedule = true;
6864
}
6865
}
6866
6867
/* All we really want here is to re-enable device interrupts.
6868
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
6869
*/
6870
rc = napi->poll(napi, budget);
6871
/* We can't gro_normal_list() here, because napi->poll() might have
6872
* rearmed the napi (napi_complete_done()) in which case it could
6873
* already be running on another CPU.
6874
*/
6875
trace_napi_poll(napi, rc, budget);
6876
netpoll_poll_unlock(have_poll_lock);
6877
if (rc == budget)
6878
__busy_poll_stop(napi, skip_schedule);
6879
bpf_net_ctx_clear(bpf_net_ctx);
6880
local_bh_enable();
6881
}
6882
6883
static void __napi_busy_loop(unsigned int napi_id,
6884
bool (*loop_end)(void *, unsigned long),
6885
void *loop_end_arg, unsigned flags, u16 budget)
6886
{
6887
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6888
int (*napi_poll)(struct napi_struct *napi, int budget);
6889
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
6890
void *have_poll_lock = NULL;
6891
struct napi_struct *napi;
6892
6893
WARN_ON_ONCE(!rcu_read_lock_held());
6894
6895
restart:
6896
napi_poll = NULL;
6897
6898
napi = napi_by_id(napi_id);
6899
if (!napi)
6900
return;
6901
6902
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6903
preempt_disable();
6904
for (;;) {
6905
int work = 0;
6906
6907
local_bh_disable();
6908
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
6909
if (!napi_poll) {
6910
unsigned long val = READ_ONCE(napi->state);
6911
6912
/* If multiple threads are competing for this napi,
6913
* we avoid dirtying napi->state as much as we can.
6914
*/
6915
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6916
NAPIF_STATE_IN_BUSY_POLL)) {
6917
if (flags & NAPI_F_PREFER_BUSY_POLL)
6918
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6919
goto count;
6920
}
6921
if (cmpxchg(&napi->state, val,
6922
val | NAPIF_STATE_IN_BUSY_POLL |
6923
NAPIF_STATE_SCHED) != val) {
6924
if (flags & NAPI_F_PREFER_BUSY_POLL)
6925
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6926
goto count;
6927
}
6928
have_poll_lock = netpoll_poll_lock(napi);
6929
napi_poll = napi->poll;
6930
}
6931
work = napi_poll(napi, budget);
6932
trace_napi_poll(napi, work, budget);
6933
gro_normal_list(&napi->gro);
6934
count:
6935
if (work > 0)
6936
__NET_ADD_STATS(dev_net(napi->dev),
6937
LINUX_MIB_BUSYPOLLRXPACKETS, work);
6938
skb_defer_free_flush();
6939
bpf_net_ctx_clear(bpf_net_ctx);
6940
local_bh_enable();
6941
6942
if (!loop_end || loop_end(loop_end_arg, start_time))
6943
break;
6944
6945
if (unlikely(need_resched())) {
6946
if (flags & NAPI_F_END_ON_RESCHED)
6947
break;
6948
if (napi_poll)
6949
busy_poll_stop(napi, have_poll_lock, flags, budget);
6950
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6951
preempt_enable();
6952
rcu_read_unlock();
6953
cond_resched();
6954
rcu_read_lock();
6955
if (loop_end(loop_end_arg, start_time))
6956
return;
6957
goto restart;
6958
}
6959
cpu_relax();
6960
}
6961
if (napi_poll)
6962
busy_poll_stop(napi, have_poll_lock, flags, budget);
6963
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6964
preempt_enable();
6965
}
6966
6967
void napi_busy_loop_rcu(unsigned int napi_id,
6968
bool (*loop_end)(void *, unsigned long),
6969
void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6970
{
6971
unsigned flags = NAPI_F_END_ON_RESCHED;
6972
6973
if (prefer_busy_poll)
6974
flags |= NAPI_F_PREFER_BUSY_POLL;
6975
6976
__napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
6977
}
6978
6979
void napi_busy_loop(unsigned int napi_id,
6980
bool (*loop_end)(void *, unsigned long),
6981
void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6982
{
6983
unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
6984
6985
rcu_read_lock();
6986
__napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
6987
rcu_read_unlock();
6988
}
6989
EXPORT_SYMBOL(napi_busy_loop);
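/* Illustrative sketch, not part of the kernel build: busy polling a NAPI
 * instance until a caller-defined condition is met or a deadline passes,
 * in the spirit of the socket busy-poll code. struct my_waiter and its
 * fields are hypothetical; the napi_id would typically come from an skb
 * or socket.
 */
#if 0
struct my_waiter {
	bool done;
	unsigned long deadline;		/* in jiffies */
};

static bool my_loop_end(void *arg, unsigned long start_time)
{
	struct my_waiter *w = arg;

	return READ_ONCE(w->done) || time_after(jiffies, w->deadline);
}

static void my_busy_wait(unsigned int napi_id, struct my_waiter *w)
{
	/* prefer_busy_poll=false, poll budget of 64 packets per round. */
	napi_busy_loop(napi_id, my_loop_end, w, false, 64);
}
#endif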
6990
6991
void napi_suspend_irqs(unsigned int napi_id)
6992
{
6993
struct napi_struct *napi;
6994
6995
rcu_read_lock();
6996
napi = napi_by_id(napi_id);
6997
if (napi) {
6998
unsigned long timeout = napi_get_irq_suspend_timeout(napi);
6999
7000
if (timeout)
7001
hrtimer_start(&napi->timer, ns_to_ktime(timeout),
7002
HRTIMER_MODE_REL_PINNED);
7003
}
7004
rcu_read_unlock();
7005
}
7006
7007
void napi_resume_irqs(unsigned int napi_id)
7008
{
7009
struct napi_struct *napi;
7010
7011
rcu_read_lock();
7012
napi = napi_by_id(napi_id);
7013
if (napi) {
7014
/* If irq_suspend_timeout is set to 0 between the call to
7015
* napi_suspend_irqs and now, the original value still
7016
* determines the safety timeout as intended and napi_watchdog
7017
* will resume irq processing.
7018
*/
7019
if (napi_get_irq_suspend_timeout(napi)) {
7020
local_bh_disable();
7021
napi_schedule(napi);
7022
local_bh_enable();
7023
}
7024
}
7025
rcu_read_unlock();
7026
}
7027
7028
#endif /* CONFIG_NET_RX_BUSY_POLL */
7029
7030
static void __napi_hash_add_with_id(struct napi_struct *napi,
7031
unsigned int napi_id)
7032
{
7033
napi->gro.cached_napi_id = napi_id;
7034
7035
WRITE_ONCE(napi->napi_id, napi_id);
7036
hlist_add_head_rcu(&napi->napi_hash_node,
7037
&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
7038
}
7039
7040
static void napi_hash_add_with_id(struct napi_struct *napi,
7041
unsigned int napi_id)
7042
{
7043
unsigned long flags;
7044
7045
spin_lock_irqsave(&napi_hash_lock, flags);
7046
WARN_ON_ONCE(napi_by_id(napi_id));
7047
__napi_hash_add_with_id(napi, napi_id);
7048
spin_unlock_irqrestore(&napi_hash_lock, flags);
7049
}
7050
7051
static void napi_hash_add(struct napi_struct *napi)
7052
{
7053
unsigned long flags;
7054
7055
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
7056
return;
7057
7058
spin_lock_irqsave(&napi_hash_lock, flags);
7059
7060
/* 0..NR_CPUS range is reserved for sender_cpu use */
7061
do {
7062
if (unlikely(!napi_id_valid(++napi_gen_id)))
7063
napi_gen_id = MIN_NAPI_ID;
7064
} while (napi_by_id(napi_gen_id));
7065
7066
__napi_hash_add_with_id(napi, napi_gen_id);
7067
7068
spin_unlock_irqrestore(&napi_hash_lock, flags);
7069
}
7070
7071
/* Warning: the caller is responsible for making sure an RCU grace period
* is respected before freeing the memory containing @napi.
*/
7074
static void napi_hash_del(struct napi_struct *napi)
7075
{
7076
unsigned long flags;
7077
7078
spin_lock_irqsave(&napi_hash_lock, flags);
7079
7080
hlist_del_init_rcu(&napi->napi_hash_node);
7081
7082
spin_unlock_irqrestore(&napi_hash_lock, flags);
7083
}
7084
7085
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
7086
{
7087
struct napi_struct *napi;
7088
7089
napi = container_of(timer, struct napi_struct, timer);
7090
7091
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
7092
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
7093
*/
7094
if (!napi_disable_pending(napi) &&
7095
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
7096
clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
7097
__napi_schedule_irqoff(napi);
7098
}
7099
7100
return HRTIMER_NORESTART;
7101
}
7102
7103
static void napi_stop_kthread(struct napi_struct *napi)
7104
{
7105
unsigned long val, new;
7106
7107
/* Wait until the napi STATE_THREADED is unset. */
7108
while (true) {
7109
val = READ_ONCE(napi->state);
7110
7111
/* If the napi kthread owns this napi or the napi is idle,
* STATE_THREADED can be unset here.
*/
7114
if ((val & NAPIF_STATE_SCHED_THREADED) ||
7115
!(val & NAPIF_STATE_SCHED)) {
7116
new = val & (~(NAPIF_STATE_THREADED |
7117
NAPIF_STATE_THREADED_BUSY_POLL));
7118
} else {
7119
msleep(20);
7120
continue;
7121
}
7122
7123
if (try_cmpxchg(&napi->state, &val, new))
7124
break;
7125
}
7126
7127
/* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by
7128
* the kthread.
7129
*/
7130
while (true) {
7131
if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state))
7132
break;
7133
7134
msleep(20);
7135
}
7136
7137
kthread_stop(napi->thread);
7138
napi->thread = NULL;
7139
}
7140
7141
static void napi_set_threaded_state(struct napi_struct *napi,
7142
enum netdev_napi_threaded threaded_mode)
7143
{
7144
bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED;
7145
bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL;
7146
7147
assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
7148
assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll);
7149
}
7150
7151
int napi_set_threaded(struct napi_struct *napi,
7152
enum netdev_napi_threaded threaded)
7153
{
7154
if (threaded) {
7155
if (!napi->thread) {
7156
int err = napi_kthread_create(napi);
7157
7158
if (err)
7159
return err;
7160
}
7161
}
7162
7163
if (napi->config)
7164
napi->config->threaded = threaded;
7165
7166
/* Setting/unsetting threaded mode on a napi might not immediately
7167
* take effect, if the current napi instance is actively being
7168
* polled. In this case, the switch between threaded mode and
7169
* softirq mode will happen in the next round of napi_schedule().
7170
* This should not cause hiccups/stalls to the live traffic.
7171
*/
7172
if (!threaded && napi->thread) {
7173
napi_stop_kthread(napi);
7174
} else {
7175
/* Make sure kthread is created before THREADED bit is set. */
7176
smp_mb__before_atomic();
7177
napi_set_threaded_state(napi, threaded);
7178
}
7179
7180
return 0;
7181
}
7182
7183
int netif_set_threaded(struct net_device *dev,
7184
enum netdev_napi_threaded threaded)
7185
{
7186
struct napi_struct *napi;
7187
int i, err = 0;
7188
7189
netdev_assert_locked_or_invisible(dev);
7190
7191
if (threaded) {
7192
list_for_each_entry(napi, &dev->napi_list, dev_list) {
7193
if (!napi->thread) {
7194
err = napi_kthread_create(napi);
7195
if (err) {
7196
threaded = NETDEV_NAPI_THREADED_DISABLED;
7197
break;
7198
}
7199
}
7200
}
7201
}
7202
7203
WRITE_ONCE(dev->threaded, threaded);
7204
7205
/* The error should not occur as the kthreads are already created. */
7206
list_for_each_entry(napi, &dev->napi_list, dev_list)
7207
WARN_ON_ONCE(napi_set_threaded(napi, threaded));
7208
7209
/* Override the config for all NAPIs even if currently not listed */
7210
for (i = 0; i < dev->num_napi_configs; i++)
7211
dev->napi_config[i].threaded = threaded;
7212
7213
return err;
7214
}
7215
7216
/**
7217
* netif_threaded_enable() - enable threaded NAPIs
7218
* @dev: net_device instance
7219
*
7220
* Enable threaded mode for the NAPI instances of the device. This may be useful
7221
* for devices where multiple NAPI instances get scheduled by a single
7222
* interrupt. Threaded NAPI allows moving the NAPI processing to cores other
7223
* than the core where IRQ is mapped.
7224
*
7225
* This function should be called before @dev is registered.
7226
*/
7227
void netif_threaded_enable(struct net_device *dev)
7228
{
7229
WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED));
7230
}
7231
EXPORT_SYMBOL(netif_threaded_enable);
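/* Illustrative sketch, not part of the kernel build: opting all NAPIs of a
 * device into threaded mode from a hypothetical probe path, before the
 * device is registered as recommended above.
 */
#if 0
static int my_probe_register(struct net_device *dev)
{
	netif_threaded_enable(dev);	/* must precede register_netdev() */
	return register_netdev(dev);
}
#endif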
7232
7233
/**
7234
* netif_queue_set_napi - Associate queue with the napi
7235
* @dev: device to which NAPI and queue belong
7236
* @queue_index: Index of queue
7237
* @type: queue type as RX or TX
7238
* @napi: NAPI context, pass NULL to clear previously set NAPI
7239
*
7240
* Set queue with its corresponding napi context. This should be done after
7241
* registering the NAPI handler for the queue-vector and the queues have been
7242
* mapped to the corresponding interrupt vector.
7243
*/
7244
void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
7245
enum netdev_queue_type type, struct napi_struct *napi)
7246
{
7247
struct netdev_rx_queue *rxq;
7248
struct netdev_queue *txq;
7249
7250
if (WARN_ON_ONCE(napi && !napi->dev))
7251
return;
7252
netdev_ops_assert_locked_or_invisible(dev);
7253
7254
switch (type) {
7255
case NETDEV_QUEUE_TYPE_RX:
7256
rxq = __netif_get_rx_queue(dev, queue_index);
7257
rxq->napi = napi;
7258
return;
7259
case NETDEV_QUEUE_TYPE_TX:
7260
txq = netdev_get_tx_queue(dev, queue_index);
7261
txq->napi = napi;
7262
return;
7263
default:
7264
return;
7265
}
7266
}
7267
EXPORT_SYMBOL(netif_queue_set_napi);
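/* Illustrative sketch, not part of the kernel build: a driver associating
 * its combined RX/TX rings with their NAPI instances so the mapping is
 * visible via the netdev netlink API. struct my_priv and its ring layout
 * are hypothetical; the locking rules noted above still apply.
 */
#if 0
static void my_map_queues(struct my_priv *priv)
{
	int i;

	for (i = 0; i < priv->num_rings; i++) {
		netif_queue_set_napi(priv->netdev, i, NETDEV_QUEUE_TYPE_RX,
				     &priv->rings[i].napi);
		netif_queue_set_napi(priv->netdev, i, NETDEV_QUEUE_TYPE_TX,
				     &priv->rings[i].napi);
	}
}
#endif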
7268
7269
static void
7270
netif_napi_irq_notify(struct irq_affinity_notify *notify,
7271
const cpumask_t *mask)
7272
{
7273
struct napi_struct *napi =
7274
container_of(notify, struct napi_struct, notify);
7275
#ifdef CONFIG_RFS_ACCEL
7276
struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
7277
int err;
7278
#endif
7279
7280
if (napi->config && napi->dev->irq_affinity_auto)
7281
cpumask_copy(&napi->config->affinity_mask, mask);
7282
7283
#ifdef CONFIG_RFS_ACCEL
7284
if (napi->dev->rx_cpu_rmap_auto) {
7285
err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask);
7286
if (err)
7287
netdev_warn(napi->dev, "RMAP update failed (%d)\n",
7288
err);
7289
}
7290
#endif
7291
}
7292
7293
#ifdef CONFIG_RFS_ACCEL
7294
static void netif_napi_affinity_release(struct kref *ref)
7295
{
7296
struct napi_struct *napi =
7297
container_of(ref, struct napi_struct, notify.kref);
7298
struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
7299
7300
netdev_assert_locked(napi->dev);
7301
WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER,
7302
&napi->state));
7303
7304
if (!napi->dev->rx_cpu_rmap_auto)
7305
return;
7306
rmap->obj[napi->napi_rmap_idx] = NULL;
7307
napi->napi_rmap_idx = -1;
7308
cpu_rmap_put(rmap);
7309
}
7310
7311
int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
7312
{
7313
if (dev->rx_cpu_rmap_auto)
7314
return 0;
7315
7316
dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
7317
if (!dev->rx_cpu_rmap)
7318
return -ENOMEM;
7319
7320
dev->rx_cpu_rmap_auto = true;
7321
return 0;
7322
}
7323
EXPORT_SYMBOL(netif_enable_cpu_rmap);
7324
7325
static void netif_del_cpu_rmap(struct net_device *dev)
7326
{
7327
struct cpu_rmap *rmap = dev->rx_cpu_rmap;
7328
7329
if (!dev->rx_cpu_rmap_auto)
7330
return;
7331
7332
/* Free the rmap */
7333
cpu_rmap_put(rmap);
7334
dev->rx_cpu_rmap = NULL;
7335
dev->rx_cpu_rmap_auto = false;
7336
}
7337
7338
#else
7339
static void netif_napi_affinity_release(struct kref *ref)
7340
{
7341
}
7342
7343
int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
7344
{
7345
return 0;
7346
}
7347
EXPORT_SYMBOL(netif_enable_cpu_rmap);
7348
7349
static void netif_del_cpu_rmap(struct net_device *dev)
7350
{
7351
}
7352
#endif
7353
7354
void netif_set_affinity_auto(struct net_device *dev)
7355
{
7356
unsigned int i, maxqs, numa;
7357
7358
maxqs = max(dev->num_tx_queues, dev->num_rx_queues);
7359
numa = dev_to_node(&dev->dev);
7360
7361
for (i = 0; i < maxqs; i++)
7362
cpumask_set_cpu(cpumask_local_spread(i, numa),
7363
&dev->napi_config[i].affinity_mask);
7364
7365
dev->irq_affinity_auto = true;
7366
}
7367
EXPORT_SYMBOL(netif_set_affinity_auto);
7368
7369
void netif_napi_set_irq_locked(struct napi_struct *napi, int irq)
7370
{
7371
int rc;
7372
7373
netdev_assert_locked_or_invisible(napi->dev);
7374
7375
if (napi->irq == irq)
7376
return;
7377
7378
/* Remove existing resources */
7379
if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
7380
irq_set_affinity_notifier(napi->irq, NULL);
7381
7382
napi->irq = irq;
7383
if (irq < 0 ||
7384
(!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto))
7385
return;
7386
7387
/* Abort for buggy drivers */
7388
if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config))
7389
return;
7390
7391
#ifdef CONFIG_RFS_ACCEL
7392
if (napi->dev->rx_cpu_rmap_auto) {
7393
rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi);
7394
if (rc < 0)
7395
return;
7396
7397
cpu_rmap_get(napi->dev->rx_cpu_rmap);
7398
napi->napi_rmap_idx = rc;
7399
}
7400
#endif
7401
7402
/* Use core IRQ notifier */
7403
napi->notify.notify = netif_napi_irq_notify;
7404
napi->notify.release = netif_napi_affinity_release;
7405
rc = irq_set_affinity_notifier(irq, &napi->notify);
7406
if (rc) {
7407
netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n",
7408
rc);
7409
goto put_rmap;
7410
}
7411
7412
set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state);
7413
return;
7414
7415
put_rmap:
7416
#ifdef CONFIG_RFS_ACCEL
7417
if (napi->dev->rx_cpu_rmap_auto) {
7418
napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL;
7419
cpu_rmap_put(napi->dev->rx_cpu_rmap);
7420
napi->napi_rmap_idx = -1;
7421
}
7422
#endif
7423
napi->notify.notify = NULL;
7424
napi->notify.release = NULL;
7425
}
7426
EXPORT_SYMBOL(netif_napi_set_irq_locked);
7427
7428
static void napi_restore_config(struct napi_struct *n)
7429
{
7430
n->defer_hard_irqs = n->config->defer_hard_irqs;
7431
n->gro_flush_timeout = n->config->gro_flush_timeout;
7432
n->irq_suspend_timeout = n->config->irq_suspend_timeout;
7433
7434
if (n->dev->irq_affinity_auto &&
7435
test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state))
7436
irq_set_affinity(n->irq, &n->config->affinity_mask);
7437
7438
/* a NAPI ID might be stored in the config, if so use it. if not, use
7439
* napi_hash_add to generate one for us.
7440
*/
7441
if (n->config->napi_id) {
7442
napi_hash_add_with_id(n, n->config->napi_id);
7443
} else {
7444
napi_hash_add(n);
7445
n->config->napi_id = n->napi_id;
7446
}
7447
7448
WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded));
7449
}
7450
7451
static void napi_save_config(struct napi_struct *n)
7452
{
7453
n->config->defer_hard_irqs = n->defer_hard_irqs;
7454
n->config->gro_flush_timeout = n->gro_flush_timeout;
7455
n->config->irq_suspend_timeout = n->irq_suspend_timeout;
7456
napi_hash_del(n);
7457
}
7458
7459
/* Netlink wants the NAPI list to be sorted by ID; if adding a NAPI which will
* inherit an existing ID, try to insert it at the right position.
*/
7462
static void
7463
netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
7464
{
7465
unsigned int new_id, pos_id;
7466
struct list_head *higher;
7467
struct napi_struct *pos;
7468
7469
new_id = UINT_MAX;
7470
if (napi->config && napi->config->napi_id)
7471
new_id = napi->config->napi_id;
7472
7473
higher = &dev->napi_list;
7474
list_for_each_entry(pos, &dev->napi_list, dev_list) {
7475
if (napi_id_valid(pos->napi_id))
7476
pos_id = pos->napi_id;
7477
else if (pos->config)
7478
pos_id = pos->config->napi_id;
7479
else
7480
pos_id = UINT_MAX;
7481
7482
if (pos_id <= new_id)
7483
break;
7484
higher = &pos->dev_list;
7485
}
7486
list_add_rcu(&napi->dev_list, higher); /* adds after higher */
7487
}
7488
7489
/* Double check that napi_get_frags() allocates skbs with
7490
* skb->head being backed by slab, not a page fragment.
7491
* This is to make sure bug fixed in 3226b158e67c
7492
* ("net: avoid 32 x truesize under-estimation for tiny skbs")
7493
* does not accidentally come back.
7494
*/
7495
static void napi_get_frags_check(struct napi_struct *napi)
7496
{
7497
struct sk_buff *skb;
7498
7499
local_bh_disable();
7500
skb = napi_get_frags(napi);
7501
WARN_ON_ONCE(skb && skb->head_frag);
7502
napi_free_frags(napi);
7503
local_bh_enable();
7504
}
7505
7506
void netif_napi_add_weight_locked(struct net_device *dev,
7507
struct napi_struct *napi,
7508
int (*poll)(struct napi_struct *, int),
7509
int weight)
7510
{
7511
netdev_assert_locked(dev);
7512
if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
7513
return;
7514
7515
INIT_LIST_HEAD(&napi->poll_list);
7516
INIT_HLIST_NODE(&napi->napi_hash_node);
7517
hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
7518
gro_init(&napi->gro);
7519
napi->skb = NULL;
7520
napi->poll = poll;
7521
if (weight > NAPI_POLL_WEIGHT)
7522
netdev_err_once(dev, "%s() called with weight %d\n", __func__,
7523
weight);
7524
napi->weight = weight;
7525
napi->dev = dev;
7526
#ifdef CONFIG_NETPOLL
7527
napi->poll_owner = -1;
7528
#endif
7529
napi->list_owner = -1;
7530
set_bit(NAPI_STATE_SCHED, &napi->state);
7531
set_bit(NAPI_STATE_NPSVC, &napi->state);
7532
netif_napi_dev_list_add(dev, napi);
7533
7534
/* default settings from sysfs are applied to all NAPIs. any per-NAPI
7535
* configuration will be loaded in napi_enable
7536
*/
7537
napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
7538
napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
7539
7540
napi_get_frags_check(napi);
7541
/* Create kthread for this napi if dev->threaded is set.
7542
* Clear dev->threaded if kthread creation failed so that
7543
* threaded mode will not be enabled in napi_enable().
7544
*/
7545
if (napi_get_threaded_config(dev, napi))
7546
if (napi_kthread_create(napi))
7547
dev->threaded = NETDEV_NAPI_THREADED_DISABLED;
7548
netif_napi_set_irq_locked(napi, -1);
7549
}
7550
EXPORT_SYMBOL(netif_napi_add_weight_locked);
7551
7552
void napi_disable_locked(struct napi_struct *n)
7553
{
7554
unsigned long val, new;
7555
7556
might_sleep();
7557
netdev_assert_locked(n->dev);
7558
7559
set_bit(NAPI_STATE_DISABLE, &n->state);
7560
7561
val = READ_ONCE(n->state);
7562
do {
7563
while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
7564
usleep_range(20, 200);
7565
val = READ_ONCE(n->state);
7566
}
7567
7568
new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
7569
new &= ~(NAPIF_STATE_THREADED |
7570
NAPIF_STATE_THREADED_BUSY_POLL |
7571
NAPIF_STATE_PREFER_BUSY_POLL);
7572
} while (!try_cmpxchg(&n->state, &val, new));
7573
7574
hrtimer_cancel(&n->timer);
7575
7576
if (n->config)
7577
napi_save_config(n);
7578
else
7579
napi_hash_del(n);
7580
7581
clear_bit(NAPI_STATE_DISABLE, &n->state);
7582
}
7583
EXPORT_SYMBOL(napi_disable_locked);
7584
7585
/**
7586
* napi_disable() - prevent NAPI from scheduling
7587
* @n: NAPI context
7588
*
7589
* Stop NAPI from being scheduled on this context.
7590
* Waits till any outstanding processing completes.
7591
* Takes netdev_lock() for associated net_device.
7592
*/
7593
void napi_disable(struct napi_struct *n)
7594
{
7595
netdev_lock(n->dev);
7596
napi_disable_locked(n);
7597
netdev_unlock(n->dev);
7598
}
7599
EXPORT_SYMBOL(napi_disable);
7600
7601
void napi_enable_locked(struct napi_struct *n)
7602
{
7603
unsigned long new, val = READ_ONCE(n->state);
7604
7605
if (n->config)
7606
napi_restore_config(n);
7607
else
7608
napi_hash_add(n);
7609
7610
do {
7611
BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
7612
7613
new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
7614
if (n->dev->threaded && n->thread)
7615
new |= NAPIF_STATE_THREADED;
7616
} while (!try_cmpxchg(&n->state, &val, new));
7617
}
7618
EXPORT_SYMBOL(napi_enable_locked);
7619
7620
/**
7621
* napi_enable() - enable NAPI scheduling
7622
* @n: NAPI context
7623
*
7624
* Enable scheduling of a NAPI instance.
7625
* Must be paired with napi_disable().
7626
* Takes netdev_lock() for associated net_device.
7627
*/
7628
void napi_enable(struct napi_struct *n)
7629
{
7630
netdev_lock(n->dev);
7631
napi_enable_locked(n);
7632
netdev_unlock(n->dev);
7633
}
7634
EXPORT_SYMBOL(napi_enable);
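/* Illustrative sketch, not part of the kernel build: the usual pairing of
 * napi_enable()/napi_disable() in a driver's ndo_open/ndo_stop. struct
 * my_priv, my_start_hw() and my_stop_hw() are hypothetical.
 */
#if 0
static int my_open(struct net_device *dev)
{
	struct my_priv *priv = netdev_priv(dev);

	napi_enable(&priv->ring.napi);	/* before the device can interrupt */
	my_start_hw(priv);
	return 0;
}

static int my_stop(struct net_device *dev)
{
	struct my_priv *priv = netdev_priv(dev);

	my_stop_hw(priv);		/* quiesce interrupts and scheduling */
	napi_disable(&priv->ring.napi);	/* waits for in-flight polls */
	return 0;
}
#endif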
7635
7636
/* Must be called in process context */
7637
void __netif_napi_del_locked(struct napi_struct *napi)
7638
{
7639
netdev_assert_locked(napi->dev);
7640
7641
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
7642
return;
7643
7644
/* Make sure NAPI is disabled (or was never enabled). */
7645
WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
7646
7647
if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
7648
irq_set_affinity_notifier(napi->irq, NULL);
7649
7650
if (napi->config) {
7651
napi->index = -1;
7652
napi->config = NULL;
7653
}
7654
7655
list_del_rcu(&napi->dev_list);
7656
napi_free_frags(napi);
7657
7658
gro_cleanup(&napi->gro);
7659
7660
if (napi->thread) {
7661
kthread_stop(napi->thread);
7662
napi->thread = NULL;
7663
}
7664
}
7665
EXPORT_SYMBOL(__netif_napi_del_locked);
7666
7667
static int __napi_poll(struct napi_struct *n, bool *repoll)
7668
{
7669
int work, weight;
7670
7671
weight = n->weight;
7672
7673
/* This NAPI_STATE_SCHED test is for avoiding a race
7674
* with netpoll's poll_napi(). Only the entity which
7675
* obtains the lock and sees NAPI_STATE_SCHED set will
7676
* actually make the ->poll() call. Therefore we avoid
7677
* accidentally calling ->poll() when NAPI is not scheduled.
7678
*/
7679
work = 0;
7680
if (napi_is_scheduled(n)) {
7681
work = n->poll(n, weight);
7682
trace_napi_poll(n, work, weight);
7683
7684
xdp_do_check_flushed(n);
7685
}
7686
7687
if (unlikely(work > weight))
7688
netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
7689
n->poll, work, weight);
7690
7691
if (likely(work < weight))
7692
return work;
7693
7694
/* Drivers must not modify the NAPI state if they
7695
* consume the entire weight. In such cases this code
7696
* still "owns" the NAPI instance and therefore can
7697
* move the instance around on the list at-will.
7698
*/
7699
if (unlikely(napi_disable_pending(n))) {
7700
napi_complete(n);
7701
return work;
7702
}
7703
7704
/* The NAPI context has more processing work, but busy-polling
7705
* is preferred. Exit early.
7706
*/
7707
if (napi_prefer_busy_poll(n)) {
7708
if (napi_complete_done(n, work)) {
7709
/* If timeout is not set, we need to make sure
7710
* that the NAPI is re-scheduled.
7711
*/
7712
napi_schedule(n);
7713
}
7714
return work;
7715
}
7716
7717
/* Flush too old packets. If HZ < 1000, flush all packets */
7718
gro_flush_normal(&n->gro, HZ >= 1000);
7719
7720
/* Some drivers may have called napi_schedule
7721
* prior to exhausting their budget.
7722
*/
7723
if (unlikely(!list_empty(&n->poll_list))) {
7724
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
7725
n->dev ? n->dev->name : "backlog");
7726
return work;
7727
}
7728
7729
*repoll = true;
7730
7731
return work;
7732
}
7733
7734
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
7735
{
7736
bool do_repoll = false;
7737
void *have;
7738
int work;
7739
7740
list_del_init(&n->poll_list);
7741
7742
have = netpoll_poll_lock(n);
7743
7744
work = __napi_poll(n, &do_repoll);
7745
7746
if (do_repoll) {
7747
#if defined(CONFIG_DEBUG_NET)
7748
if (unlikely(!napi_is_scheduled(n)))
7749
pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n",
7750
n->dev->name, n->poll);
7751
#endif
7752
list_add_tail(&n->poll_list, repoll);
7753
}
7754
netpoll_poll_unlock(have);
7755
7756
return work;
7757
}
7758
7759
static int napi_thread_wait(struct napi_struct *napi)
7760
{
7761
set_current_state(TASK_INTERRUPTIBLE);
7762
7763
while (!kthread_should_stop()) {
7764
/* Testing SCHED_THREADED bit here to make sure the current
7765
* kthread owns this napi and could poll on this napi.
7766
* Testing SCHED bit is not enough because SCHED bit might be
7767
* set by some other busy poll thread or by napi_disable().
7768
*/
7769
if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
7770
WARN_ON(!list_empty(&napi->poll_list));
7771
__set_current_state(TASK_RUNNING);
7772
return 0;
7773
}
7774
7775
schedule();
7776
set_current_state(TASK_INTERRUPTIBLE);
7777
}
7778
__set_current_state(TASK_RUNNING);
7779
7780
return -1;
7781
}
7782
7783
static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll)
7784
{
7785
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
7786
struct softnet_data *sd;
7787
unsigned long last_qs = jiffies;
7788
7789
for (;;) {
7790
bool repoll = false;
7791
void *have;
7792
7793
local_bh_disable();
7794
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
7795
7796
sd = this_cpu_ptr(&softnet_data);
7797
sd->in_napi_threaded_poll = true;
7798
7799
have = netpoll_poll_lock(napi);
7800
__napi_poll(napi, &repoll);
7801
netpoll_poll_unlock(have);
7802
7803
sd->in_napi_threaded_poll = false;
7804
barrier();
7805
7806
if (sd_has_rps_ipi_waiting(sd)) {
7807
local_irq_disable();
7808
net_rps_action_and_irq_enable(sd);
7809
}
7810
skb_defer_free_flush();
7811
bpf_net_ctx_clear(bpf_net_ctx);
7812
7813
/* When busy poll is enabled, the old packets are not flushed in
7814
* napi_complete_done. So flush them here.
7815
*/
7816
if (busy_poll)
7817
gro_flush_normal(&napi->gro, HZ >= 1000);
7818
local_bh_enable();
7819
7820
/* Call cond_resched here to avoid watchdog warnings. */
7821
if (repoll || busy_poll) {
7822
rcu_softirq_qs_periodic(last_qs);
7823
cond_resched();
7824
}
7825
7826
if (!repoll)
7827
break;
7828
}
7829
}
7830
7831
static int napi_threaded_poll(void *data)
7832
{
7833
struct napi_struct *napi = data;
7834
bool want_busy_poll;
7835
bool in_busy_poll;
7836
unsigned long val;
7837
7838
while (!napi_thread_wait(napi)) {
7839
val = READ_ONCE(napi->state);
7840
7841
want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL;
7842
in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL;
7843
7844
if (unlikely(val & NAPIF_STATE_DISABLE))
7845
want_busy_poll = false;
7846
7847
if (want_busy_poll != in_busy_poll)
7848
assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state,
7849
want_busy_poll);
7850
7851
napi_threaded_poll_loop(napi, want_busy_poll);
7852
}
7853
7854
return 0;
7855
}
7856
7857
static __latent_entropy void net_rx_action(void)
7858
{
7859
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
7860
unsigned long time_limit = jiffies +
7861
usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
7862
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
7863
int budget = READ_ONCE(net_hotdata.netdev_budget);
7864
LIST_HEAD(list);
7865
LIST_HEAD(repoll);
7866
7867
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
7868
start:
7869
sd->in_net_rx_action = true;
7870
local_irq_disable();
7871
list_splice_init(&sd->poll_list, &list);
7872
local_irq_enable();
7873
7874
for (;;) {
7875
struct napi_struct *n;
7876
7877
skb_defer_free_flush();
7878
7879
if (list_empty(&list)) {
7880
if (list_empty(&repoll)) {
7881
sd->in_net_rx_action = false;
7882
barrier();
7883
/* We need to check if ____napi_schedule()
7884
* had refilled poll_list while
7885
* sd->in_net_rx_action was true.
7886
*/
7887
if (!list_empty(&sd->poll_list))
7888
goto start;
7889
if (!sd_has_rps_ipi_waiting(sd))
7890
goto end;
7891
}
7892
break;
7893
}
7894
7895
n = list_first_entry(&list, struct napi_struct, poll_list);
7896
budget -= napi_poll(n, &repoll);
7897
7898
/* If softirq window is exhausted then punt.
7899
* Allow this to run for up to 2 jiffies, which allows
7900
* an average latency of 1.5/HZ.
7901
*/
7902
if (unlikely(budget <= 0 ||
7903
time_after_eq(jiffies, time_limit))) {
7904
/* Pairs with READ_ONCE() in softnet_seq_show() */
7905
WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1);
7906
break;
7907
}
7908
}
7909
7910
local_irq_disable();
7911
7912
list_splice_tail_init(&sd->poll_list, &list);
7913
list_splice_tail(&repoll, &list);
7914
list_splice(&list, &sd->poll_list);
7915
if (!list_empty(&sd->poll_list))
7916
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
7917
else
7918
sd->in_net_rx_action = false;
7919
7920
net_rps_action_and_irq_enable(sd);
7921
end:
7922
bpf_net_ctx_clear(bpf_net_ctx);
7923
}
7924
7925
struct netdev_adjacent {
7926
struct net_device *dev;
7927
netdevice_tracker dev_tracker;
7928
7929
/* upper master flag, there can only be one master device per list */
7930
bool master;
7931
7932
/* lookup ignore flag */
7933
bool ignore;
7934
7935
/* counter for the number of times this device was added to us */
7936
u16 ref_nr;
7937
7938
/* private field for the users */
7939
void *private;
7940
7941
struct list_head list;
7942
struct rcu_head rcu;
7943
};
7944
7945
static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
7946
struct list_head *adj_list)
7947
{
7948
struct netdev_adjacent *adj;
7949
7950
list_for_each_entry(adj, adj_list, list) {
7951
if (adj->dev == adj_dev)
7952
return adj;
7953
}
7954
return NULL;
7955
}
7956
7957
static int ____netdev_has_upper_dev(struct net_device *upper_dev,
7958
struct netdev_nested_priv *priv)
7959
{
7960
struct net_device *dev = (struct net_device *)priv->data;
7961
7962
return upper_dev == dev;
7963
}
7964
7965
/**
7966
* netdev_has_upper_dev - Check if device is linked to an upper device
7967
* @dev: device
7968
* @upper_dev: upper device to check
7969
*
7970
* Find out if a device is linked to specified upper device and return true
7971
* in case it is. Note that this checks only immediate upper device,
7972
* not through a complete stack of devices. The caller must hold the RTNL lock.
7973
*/
7974
bool netdev_has_upper_dev(struct net_device *dev,
7975
struct net_device *upper_dev)
7976
{
7977
struct netdev_nested_priv priv = {
7978
.data = (void *)upper_dev,
7979
};
7980
7981
ASSERT_RTNL();
7982
7983
return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
7984
&priv);
7985
}
7986
EXPORT_SYMBOL(netdev_has_upper_dev);
7987
7988
/**
7989
* netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
7990
* @dev: device
7991
* @upper_dev: upper device to check
7992
*
7993
* Find out if a device is linked to specified upper device and return true
7994
* in case it is. Note that this checks the entire upper device chain.
7995
* The caller must hold the RCU read lock.
7996
*/
7997
7998
bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
7999
struct net_device *upper_dev)
8000
{
8001
struct netdev_nested_priv priv = {
8002
.data = (void *)upper_dev,
8003
};
8004
8005
return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
8006
&priv);
8007
}
8008
EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
8009
8010
/**
8011
* netdev_has_any_upper_dev - Check if device is linked to some device
8012
* @dev: device
8013
*
8014
* Find out if a device is linked to an upper device and return true in case
8015
* it is. The caller must hold the RTNL lock.
8016
*/
8017
bool netdev_has_any_upper_dev(struct net_device *dev)
8018
{
8019
ASSERT_RTNL();
8020
8021
return !list_empty(&dev->adj_list.upper);
8022
}
8023
EXPORT_SYMBOL(netdev_has_any_upper_dev);
8024
8025
/**
8026
* netdev_master_upper_dev_get - Get master upper device
8027
* @dev: device
8028
*
8029
* Find a master upper device and return pointer to it or NULL in case
8030
* it's not there. The caller must hold the RTNL lock.
8031
*/
8032
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
8033
{
8034
struct netdev_adjacent *upper;
8035
8036
ASSERT_RTNL();
8037
8038
if (list_empty(&dev->adj_list.upper))
8039
return NULL;
8040
8041
upper = list_first_entry(&dev->adj_list.upper,
8042
struct netdev_adjacent, list);
8043
if (likely(upper->master))
8044
return upper->dev;
8045
return NULL;
8046
}
8047
EXPORT_SYMBOL(netdev_master_upper_dev_get);
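/*
 * Illustrative sketch, not part of this file: a typical RTNL-protected
 * "who is my master?" query built on the helper above; the bond check
 * is just one example of what a caller might do with the result.
 */
static bool example_is_bond_slave(struct net_device *dev)
{
	struct net_device *master;

	ASSERT_RTNL();

	master = netdev_master_upper_dev_get(dev);
	return master && netif_is_bond_master(master);
}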
8048
8049
static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
8050
{
8051
struct netdev_adjacent *upper;
8052
8053
ASSERT_RTNL();
8054
8055
if (list_empty(&dev->adj_list.upper))
8056
return NULL;
8057
8058
upper = list_first_entry(&dev->adj_list.upper,
8059
struct netdev_adjacent, list);
8060
if (likely(upper->master) && !upper->ignore)
8061
return upper->dev;
8062
return NULL;
8063
}
8064
8065
/**
8066
* netdev_has_any_lower_dev - Check if device is linked to some device
8067
* @dev: device
8068
*
8069
* Find out if a device is linked to a lower device and return true in case
8070
* it is. The caller must hold the RTNL lock.
8071
*/
8072
static bool netdev_has_any_lower_dev(struct net_device *dev)
8073
{
8074
ASSERT_RTNL();
8075
8076
return !list_empty(&dev->adj_list.lower);
8077
}
8078
8079
void *netdev_adjacent_get_private(struct list_head *adj_list)
8080
{
8081
struct netdev_adjacent *adj;
8082
8083
adj = list_entry(adj_list, struct netdev_adjacent, list);
8084
8085
return adj->private;
8086
}
8087
EXPORT_SYMBOL(netdev_adjacent_get_private);
8088
8089
/**
8090
* netdev_upper_get_next_dev_rcu - Get the next dev from upper list
8091
* @dev: device
8092
* @iter: list_head ** of the current position
8093
*
8094
* Gets the next device from the dev's upper list, starting from iter
8095
* position. The caller must hold RCU read lock.
8096
*/
8097
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
8098
struct list_head **iter)
8099
{
8100
struct netdev_adjacent *upper;
8101
8102
WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
8103
8104
upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
8105
8106
if (&upper->list == &dev->adj_list.upper)
8107
return NULL;
8108
8109
*iter = &upper->list;
8110
8111
return upper->dev;
8112
}
8113
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
8114
8115
static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
8116
struct list_head **iter,
8117
bool *ignore)
8118
{
8119
struct netdev_adjacent *upper;
8120
8121
upper = list_entry((*iter)->next, struct netdev_adjacent, list);
8122
8123
if (&upper->list == &dev->adj_list.upper)
8124
return NULL;
8125
8126
*iter = &upper->list;
8127
*ignore = upper->ignore;
8128
8129
return upper->dev;
8130
}
8131
8132
static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
8133
struct list_head **iter)
8134
{
8135
struct netdev_adjacent *upper;
8136
8137
WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
8138
8139
upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
8140
8141
if (&upper->list == &dev->adj_list.upper)
8142
return NULL;
8143
8144
*iter = &upper->list;
8145
8146
return upper->dev;
8147
}
8148
8149
static int __netdev_walk_all_upper_dev(struct net_device *dev,
8150
int (*fn)(struct net_device *dev,
8151
struct netdev_nested_priv *priv),
8152
struct netdev_nested_priv *priv)
8153
{
8154
struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
8155
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
8156
int ret, cur = 0;
8157
bool ignore;
8158
8159
now = dev;
8160
iter = &dev->adj_list.upper;
8161
8162
while (1) {
8163
if (now != dev) {
8164
ret = fn(now, priv);
8165
if (ret)
8166
return ret;
8167
}
8168
8169
next = NULL;
8170
while (1) {
8171
udev = __netdev_next_upper_dev(now, &iter, &ignore);
8172
if (!udev)
8173
break;
8174
if (ignore)
8175
continue;
8176
8177
next = udev;
8178
niter = &udev->adj_list.upper;
8179
dev_stack[cur] = now;
8180
iter_stack[cur++] = iter;
8181
break;
8182
}
8183
8184
if (!next) {
8185
if (!cur)
8186
return 0;
8187
next = dev_stack[--cur];
8188
niter = iter_stack[cur];
8189
}
8190
8191
now = next;
8192
iter = niter;
8193
}
8194
8195
return 0;
8196
}
8197
8198
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
8199
int (*fn)(struct net_device *dev,
8200
struct netdev_nested_priv *priv),
8201
struct netdev_nested_priv *priv)
8202
{
8203
struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
8204
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
8205
int ret, cur = 0;
8206
8207
now = dev;
8208
iter = &dev->adj_list.upper;
8209
8210
while (1) {
8211
if (now != dev) {
8212
ret = fn(now, priv);
8213
if (ret)
8214
return ret;
8215
}
8216
8217
next = NULL;
8218
while (1) {
8219
udev = netdev_next_upper_dev_rcu(now, &iter);
8220
if (!udev)
8221
break;
8222
8223
next = udev;
8224
niter = &udev->adj_list.upper;
8225
dev_stack[cur] = now;
8226
iter_stack[cur++] = iter;
8227
break;
8228
}
8229
8230
if (!next) {
8231
if (!cur)
8232
return 0;
8233
next = dev_stack[--cur];
8234
niter = iter_stack[cur];
8235
}
8236
8237
now = next;
8238
iter = niter;
8239
}
8240
8241
return 0;
8242
}
8243
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
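/*
 * Illustrative sketch, not part of this file: using the walker above
 * to answer "is there a bridge anywhere above this device?".  The
 * callback stops the walk by returning non-zero, just like
 * ____netdev_has_upper_dev() does.  The example_* names are
 * hypothetical.
 */
static int example_upper_is_bridge(struct net_device *upper,
				   struct netdev_nested_priv *priv)
{
	bool *found = (bool *)priv->data;

	if (netif_is_bridge_master(upper)) {
		*found = true;
		return 1;		/* stop the walk early */
	}
	return 0;			/* keep walking */
}

static bool example_has_bridge_upper(struct net_device *dev)
{
	bool found = false;
	struct netdev_nested_priv priv = {
		.data = (void *)&found,
	};

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, example_upper_is_bridge, &priv);
	rcu_read_unlock();

	return found;
}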
8244
8245
static bool __netdev_has_upper_dev(struct net_device *dev,
8246
struct net_device *upper_dev)
8247
{
8248
struct netdev_nested_priv priv = {
8249
.flags = 0,
8250
.data = (void *)upper_dev,
8251
};
8252
8253
ASSERT_RTNL();
8254
8255
return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
8256
&priv);
8257
}
8258
8259
/**
8260
* netdev_lower_get_next_private - Get the next ->private from the
8261
* lower neighbour list
8262
* @dev: device
8263
* @iter: list_head ** of the current position
8264
*
8265
* Gets the next netdev_adjacent->private from the dev's lower neighbour
8266
* list, starting from iter position. The caller must either hold the
8267
* RTNL lock or its own locking that guarantees that the neighbour lower
8268
* list will remain unchanged.
8269
*/
8270
void *netdev_lower_get_next_private(struct net_device *dev,
8271
struct list_head **iter)
8272
{
8273
struct netdev_adjacent *lower;
8274
8275
lower = list_entry(*iter, struct netdev_adjacent, list);
8276
8277
if (&lower->list == &dev->adj_list.lower)
8278
return NULL;
8279
8280
*iter = lower->list.next;
8281
8282
return lower->private;
8283
}
8284
EXPORT_SYMBOL(netdev_lower_get_next_private);
8285
8286
/**
8287
* netdev_lower_get_next_private_rcu - Get the next ->private from the
8288
* lower neighbour list, RCU
8289
* variant
8290
* @dev: device
8291
* @iter: list_head ** of the current position
8292
*
8293
* Gets the next netdev_adjacent->private from the dev's lower neighbour
8294
* list, starting from iter position. The caller must hold RCU read lock.
8295
*/
8296
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
8297
struct list_head **iter)
8298
{
8299
struct netdev_adjacent *lower;
8300
8301
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
8302
8303
lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
8304
8305
if (&lower->list == &dev->adj_list.lower)
8306
return NULL;
8307
8308
*iter = &lower->list;
8309
8310
return lower->private;
8311
}
8312
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
8313
8314
/**
8315
* netdev_lower_get_next - Get the next device from the lower neighbour
8316
* list
8317
* @dev: device
8318
* @iter: list_head ** of the current position
8319
*
8320
* Gets the next netdev_adjacent from the dev's lower neighbour
8321
* list, starting from iter position. The caller must hold RTNL lock or
8322
* its own locking that guarantees that the neighbour lower
8323
* list will remain unchanged.
8324
*/
8325
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
8326
{
8327
struct netdev_adjacent *lower;
8328
8329
lower = list_entry(*iter, struct netdev_adjacent, list);
8330
8331
if (&lower->list == &dev->adj_list.lower)
8332
return NULL;
8333
8334
*iter = lower->list.next;
8335
8336
return lower->dev;
8337
}
8338
EXPORT_SYMBOL(netdev_lower_get_next);
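/*
 * Illustrative sketch, not part of this file: iterating over the
 * immediate lower devices with the netdev_for_each_lower_dev()
 * helper from netdevice.h, which is built on the accessor above.
 * It must run under RTNL (or equivalent) so the list cannot change.
 */
static void example_dump_lowers(struct net_device *dev)
{
	struct net_device *ldev;
	struct list_head *iter;

	ASSERT_RTNL();

	netdev_for_each_lower_dev(dev, ldev, iter)
		netdev_dbg(dev, "lower device: %s\n", ldev->name);
}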
8339
8340
static struct net_device *netdev_next_lower_dev(struct net_device *dev,
8341
struct list_head **iter)
8342
{
8343
struct netdev_adjacent *lower;
8344
8345
lower = list_entry((*iter)->next, struct netdev_adjacent, list);
8346
8347
if (&lower->list == &dev->adj_list.lower)
8348
return NULL;
8349
8350
*iter = &lower->list;
8351
8352
return lower->dev;
8353
}
8354
8355
static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
8356
struct list_head **iter,
8357
bool *ignore)
8358
{
8359
struct netdev_adjacent *lower;
8360
8361
lower = list_entry((*iter)->next, struct netdev_adjacent, list);
8362
8363
if (&lower->list == &dev->adj_list.lower)
8364
return NULL;
8365
8366
*iter = &lower->list;
8367
*ignore = lower->ignore;
8368
8369
return lower->dev;
8370
}
8371
8372
int netdev_walk_all_lower_dev(struct net_device *dev,
8373
int (*fn)(struct net_device *dev,
8374
struct netdev_nested_priv *priv),
8375
struct netdev_nested_priv *priv)
8376
{
8377
struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
8378
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
8379
int ret, cur = 0;
8380
8381
now = dev;
8382
iter = &dev->adj_list.lower;
8383
8384
while (1) {
8385
if (now != dev) {
8386
ret = fn(now, priv);
8387
if (ret)
8388
return ret;
8389
}
8390
8391
next = NULL;
8392
while (1) {
8393
ldev = netdev_next_lower_dev(now, &iter);
8394
if (!ldev)
8395
break;
8396
8397
next = ldev;
8398
niter = &ldev->adj_list.lower;
8399
dev_stack[cur] = now;
8400
iter_stack[cur++] = iter;
8401
break;
8402
}
8403
8404
if (!next) {
8405
if (!cur)
8406
return 0;
8407
next = dev_stack[--cur];
8408
niter = iter_stack[cur];
8409
}
8410
8411
now = next;
8412
iter = niter;
8413
}
8414
8415
return 0;
8416
}
8417
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
8418
8419
static int __netdev_walk_all_lower_dev(struct net_device *dev,
8420
int (*fn)(struct net_device *dev,
8421
struct netdev_nested_priv *priv),
8422
struct netdev_nested_priv *priv)
8423
{
8424
struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
8425
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
8426
int ret, cur = 0;
8427
bool ignore;
8428
8429
now = dev;
8430
iter = &dev->adj_list.lower;
8431
8432
while (1) {
8433
if (now != dev) {
8434
ret = fn(now, priv);
8435
if (ret)
8436
return ret;
8437
}
8438
8439
next = NULL;
8440
while (1) {
8441
ldev = __netdev_next_lower_dev(now, &iter, &ignore);
8442
if (!ldev)
8443
break;
8444
if (ignore)
8445
continue;
8446
8447
next = ldev;
8448
niter = &ldev->adj_list.lower;
8449
dev_stack[cur] = now;
8450
iter_stack[cur++] = iter;
8451
break;
8452
}
8453
8454
if (!next) {
8455
if (!cur)
8456
return 0;
8457
next = dev_stack[--cur];
8458
niter = iter_stack[cur];
8459
}
8460
8461
now = next;
8462
iter = niter;
8463
}
8464
8465
return 0;
8466
}
8467
8468
struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
8469
struct list_head **iter)
8470
{
8471
struct netdev_adjacent *lower;
8472
8473
lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
8474
if (&lower->list == &dev->adj_list.lower)
8475
return NULL;
8476
8477
*iter = &lower->list;
8478
8479
return lower->dev;
8480
}
8481
EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
8482
8483
static u8 __netdev_upper_depth(struct net_device *dev)
8484
{
8485
struct net_device *udev;
8486
struct list_head *iter;
8487
u8 max_depth = 0;
8488
bool ignore;
8489
8490
for (iter = &dev->adj_list.upper,
8491
udev = __netdev_next_upper_dev(dev, &iter, &ignore);
8492
udev;
8493
udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
8494
if (ignore)
8495
continue;
8496
if (max_depth < udev->upper_level)
8497
max_depth = udev->upper_level;
8498
}
8499
8500
return max_depth;
8501
}
8502
8503
static u8 __netdev_lower_depth(struct net_device *dev)
8504
{
8505
struct net_device *ldev;
8506
struct list_head *iter;
8507
u8 max_depth = 0;
8508
bool ignore;
8509
8510
for (iter = &dev->adj_list.lower,
8511
ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
8512
ldev;
8513
ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
8514
if (ignore)
8515
continue;
8516
if (max_depth < ldev->lower_level)
8517
max_depth = ldev->lower_level;
8518
}
8519
8520
return max_depth;
8521
}
8522
8523
static int __netdev_update_upper_level(struct net_device *dev,
8524
struct netdev_nested_priv *__unused)
8525
{
8526
dev->upper_level = __netdev_upper_depth(dev) + 1;
8527
return 0;
8528
}
8529
8530
#ifdef CONFIG_LOCKDEP
8531
static LIST_HEAD(net_unlink_list);
8532
8533
static void net_unlink_todo(struct net_device *dev)
8534
{
8535
if (list_empty(&dev->unlink_list))
8536
list_add_tail(&dev->unlink_list, &net_unlink_list);
8537
}
8538
#endif
8539
8540
static int __netdev_update_lower_level(struct net_device *dev,
8541
struct netdev_nested_priv *priv)
8542
{
8543
dev->lower_level = __netdev_lower_depth(dev) + 1;
8544
8545
#ifdef CONFIG_LOCKDEP
8546
if (!priv)
8547
return 0;
8548
8549
if (priv->flags & NESTED_SYNC_IMM)
8550
dev->nested_level = dev->lower_level - 1;
8551
if (priv->flags & NESTED_SYNC_TODO)
8552
net_unlink_todo(dev);
8553
#endif
8554
return 0;
8555
}
8556
8557
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
8558
int (*fn)(struct net_device *dev,
8559
struct netdev_nested_priv *priv),
8560
struct netdev_nested_priv *priv)
8561
{
8562
struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
8563
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
8564
int ret, cur = 0;
8565
8566
now = dev;
8567
iter = &dev->adj_list.lower;
8568
8569
while (1) {
8570
if (now != dev) {
8571
ret = fn(now, priv);
8572
if (ret)
8573
return ret;
8574
}
8575
8576
next = NULL;
8577
while (1) {
8578
ldev = netdev_next_lower_dev_rcu(now, &iter);
8579
if (!ldev)
8580
break;
8581
8582
next = ldev;
8583
niter = &ldev->adj_list.lower;
8584
dev_stack[cur] = now;
8585
iter_stack[cur++] = iter;
8586
break;
8587
}
8588
8589
if (!next) {
8590
if (!cur)
8591
return 0;
8592
next = dev_stack[--cur];
8593
niter = iter_stack[cur];
8594
}
8595
8596
now = next;
8597
iter = niter;
8598
}
8599
8600
return 0;
8601
}
8602
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
8603
8604
/**
8605
* netdev_lower_get_first_private_rcu - Get the first ->private from the
8606
* lower neighbour list, RCU
8607
* variant
8608
* @dev: device
8609
*
8610
* Gets the first netdev_adjacent->private from the dev's lower neighbour
8611
* list. The caller must hold RCU read lock.
8612
*/
8613
void *netdev_lower_get_first_private_rcu(struct net_device *dev)
8614
{
8615
struct netdev_adjacent *lower;
8616
8617
lower = list_first_or_null_rcu(&dev->adj_list.lower,
8618
struct netdev_adjacent, list);
8619
if (lower)
8620
return lower->private;
8621
return NULL;
8622
}
8623
EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
8624
8625
/**
8626
* netdev_master_upper_dev_get_rcu - Get master upper device
8627
* @dev: device
8628
*
8629
* Find a master upper device and return pointer to it or NULL in case
8630
* it's not there. The caller must hold the RCU read lock.
8631
*/
8632
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
8633
{
8634
struct netdev_adjacent *upper;
8635
8636
upper = list_first_or_null_rcu(&dev->adj_list.upper,
8637
struct netdev_adjacent, list);
8638
if (upper && likely(upper->master))
8639
return upper->dev;
8640
return NULL;
8641
}
8642
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
8643
8644
static int netdev_adjacent_sysfs_add(struct net_device *dev,
8645
struct net_device *adj_dev,
8646
struct list_head *dev_list)
8647
{
8648
char linkname[IFNAMSIZ+7];
8649
8650
sprintf(linkname, dev_list == &dev->adj_list.upper ?
8651
"upper_%s" : "lower_%s", adj_dev->name);
8652
return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
8653
linkname);
8654
}
8655
static void netdev_adjacent_sysfs_del(struct net_device *dev,
8656
char *name,
8657
struct list_head *dev_list)
8658
{
8659
char linkname[IFNAMSIZ+7];
8660
8661
sprintf(linkname, dev_list == &dev->adj_list.upper ?
8662
"upper_%s" : "lower_%s", name);
8663
sysfs_remove_link(&(dev->dev.kobj), linkname);
8664
}
8665
8666
static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
8667
struct net_device *adj_dev,
8668
struct list_head *dev_list)
8669
{
8670
return (dev_list == &dev->adj_list.upper ||
8671
dev_list == &dev->adj_list.lower) &&
8672
net_eq(dev_net(dev), dev_net(adj_dev));
8673
}
8674
8675
static int __netdev_adjacent_dev_insert(struct net_device *dev,
8676
struct net_device *adj_dev,
8677
struct list_head *dev_list,
8678
void *private, bool master)
8679
{
8680
struct netdev_adjacent *adj;
8681
int ret;
8682
8683
adj = __netdev_find_adj(adj_dev, dev_list);
8684
8685
if (adj) {
8686
adj->ref_nr += 1;
8687
pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
8688
dev->name, adj_dev->name, adj->ref_nr);
8689
8690
return 0;
8691
}
8692
8693
adj = kmalloc(sizeof(*adj), GFP_KERNEL);
8694
if (!adj)
8695
return -ENOMEM;
8696
8697
adj->dev = adj_dev;
8698
adj->master = master;
8699
adj->ref_nr = 1;
8700
adj->private = private;
8701
adj->ignore = false;
8702
netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
8703
8704
pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
8705
dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
8706
8707
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
8708
ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
8709
if (ret)
8710
goto free_adj;
8711
}
8712
8713
/* Ensure that master link is always the first item in list. */
8714
if (master) {
8715
ret = sysfs_create_link(&(dev->dev.kobj),
8716
&(adj_dev->dev.kobj), "master");
8717
if (ret)
8718
goto remove_symlinks;
8719
8720
list_add_rcu(&adj->list, dev_list);
8721
} else {
8722
list_add_tail_rcu(&adj->list, dev_list);
8723
}
8724
8725
return 0;
8726
8727
remove_symlinks:
8728
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
8729
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
8730
free_adj:
8731
netdev_put(adj_dev, &adj->dev_tracker);
8732
kfree(adj);
8733
8734
return ret;
8735
}
8736
8737
static void __netdev_adjacent_dev_remove(struct net_device *dev,
8738
struct net_device *adj_dev,
8739
u16 ref_nr,
8740
struct list_head *dev_list)
8741
{
8742
struct netdev_adjacent *adj;
8743
8744
pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
8745
dev->name, adj_dev->name, ref_nr);
8746
8747
adj = __netdev_find_adj(adj_dev, dev_list);
8748
8749
if (!adj) {
8750
pr_err("Adjacency does not exist for device %s from %s\n",
8751
dev->name, adj_dev->name);
8752
WARN_ON(1);
8753
return;
8754
}
8755
8756
if (adj->ref_nr > ref_nr) {
8757
pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
8758
dev->name, adj_dev->name, ref_nr,
8759
adj->ref_nr - ref_nr);
8760
adj->ref_nr -= ref_nr;
8761
return;
8762
}
8763
8764
if (adj->master)
8765
sysfs_remove_link(&(dev->dev.kobj), "master");
8766
8767
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
8768
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
8769
8770
list_del_rcu(&adj->list);
8771
pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
8772
adj_dev->name, dev->name, adj_dev->name);
8773
netdev_put(adj_dev, &adj->dev_tracker);
8774
kfree_rcu(adj, rcu);
8775
}
8776
8777
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
8778
struct net_device *upper_dev,
8779
struct list_head *up_list,
8780
struct list_head *down_list,
8781
void *private, bool master)
8782
{
8783
int ret;
8784
8785
ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
8786
private, master);
8787
if (ret)
8788
return ret;
8789
8790
ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
8791
private, false);
8792
if (ret) {
8793
__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
8794
return ret;
8795
}
8796
8797
return 0;
8798
}
8799
8800
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
8801
struct net_device *upper_dev,
8802
u16 ref_nr,
8803
struct list_head *up_list,
8804
struct list_head *down_list)
8805
{
8806
__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
8807
__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
8808
}
8809
8810
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
8811
struct net_device *upper_dev,
8812
void *private, bool master)
8813
{
8814
return __netdev_adjacent_dev_link_lists(dev, upper_dev,
8815
&dev->adj_list.upper,
8816
&upper_dev->adj_list.lower,
8817
private, master);
8818
}
8819
8820
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
8821
struct net_device *upper_dev)
8822
{
8823
__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
8824
&dev->adj_list.upper,
8825
&upper_dev->adj_list.lower);
8826
}
8827
8828
static int __netdev_upper_dev_link(struct net_device *dev,
8829
struct net_device *upper_dev, bool master,
8830
void *upper_priv, void *upper_info,
8831
struct netdev_nested_priv *priv,
8832
struct netlink_ext_ack *extack)
8833
{
8834
struct netdev_notifier_changeupper_info changeupper_info = {
8835
.info = {
8836
.dev = dev,
8837
.extack = extack,
8838
},
8839
.upper_dev = upper_dev,
8840
.master = master,
8841
.linking = true,
8842
.upper_info = upper_info,
8843
};
8844
struct net_device *master_dev;
8845
int ret = 0;
8846
8847
ASSERT_RTNL();
8848
8849
if (dev == upper_dev)
8850
return -EBUSY;
8851
8852
/* To prevent loops, make sure dev is not already an upper device of upper_dev. */
8853
if (__netdev_has_upper_dev(upper_dev, dev))
8854
return -EBUSY;
8855
8856
if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
8857
return -EMLINK;
8858
8859
if (!master) {
8860
if (__netdev_has_upper_dev(dev, upper_dev))
8861
return -EEXIST;
8862
} else {
8863
master_dev = __netdev_master_upper_dev_get(dev);
8864
if (master_dev)
8865
return master_dev == upper_dev ? -EEXIST : -EBUSY;
8866
}
8867
8868
ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
8869
&changeupper_info.info);
8870
ret = notifier_to_errno(ret);
8871
if (ret)
8872
return ret;
8873
8874
ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
8875
master);
8876
if (ret)
8877
return ret;
8878
8879
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
8880
&changeupper_info.info);
8881
ret = notifier_to_errno(ret);
8882
if (ret)
8883
goto rollback;
8884
8885
__netdev_update_upper_level(dev, NULL);
8886
__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
8887
8888
__netdev_update_lower_level(upper_dev, priv);
8889
__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
8890
priv);
8891
8892
return 0;
8893
8894
rollback:
8895
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
8896
8897
return ret;
8898
}
8899
8900
/**
8901
* netdev_upper_dev_link - Add a link to the upper device
8902
* @dev: device
8903
* @upper_dev: new upper device
8904
* @extack: netlink extended ack
8905
*
8906
* Adds a link to a device which is upper to this one. The caller must hold
8907
* the RTNL lock. On a failure a negative errno code is returned.
8908
* On success the reference counts are adjusted and the function
8909
* returns zero.
8910
*/
8911
int netdev_upper_dev_link(struct net_device *dev,
8912
struct net_device *upper_dev,
8913
struct netlink_ext_ack *extack)
8914
{
8915
struct netdev_nested_priv priv = {
8916
.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8917
.data = NULL,
8918
};
8919
8920
return __netdev_upper_dev_link(dev, upper_dev, false,
8921
NULL, NULL, &priv, extack);
8922
}
8923
EXPORT_SYMBOL(netdev_upper_dev_link);
8924
8925
/**
8926
* netdev_master_upper_dev_link - Add a master link to the upper device
8927
* @dev: device
8928
* @upper_dev: new upper device
8929
* @upper_priv: upper device private
8930
* @upper_info: upper info to be passed down via notifier
8931
* @extack: netlink extended ack
8932
*
8933
* Adds a link to a device which is upper to this one. In this case, only
8934
* one master upper device can be linked, although other non-master devices
8935
* might be linked as well. The caller must hold the RTNL lock.
8936
* On a failure a negative errno code is returned. On success the reference
8937
* counts are adjusted and the function returns zero.
8938
*/
8939
int netdev_master_upper_dev_link(struct net_device *dev,
8940
struct net_device *upper_dev,
8941
void *upper_priv, void *upper_info,
8942
struct netlink_ext_ack *extack)
8943
{
8944
struct netdev_nested_priv priv = {
8945
.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8946
.data = NULL,
8947
};
8948
8949
return __netdev_upper_dev_link(dev, upper_dev, true,
8950
upper_priv, upper_info, &priv, extack);
8951
}
8952
EXPORT_SYMBOL(netdev_master_upper_dev_link);
8953
8954
static void __netdev_upper_dev_unlink(struct net_device *dev,
8955
struct net_device *upper_dev,
8956
struct netdev_nested_priv *priv)
8957
{
8958
struct netdev_notifier_changeupper_info changeupper_info = {
8959
.info = {
8960
.dev = dev,
8961
},
8962
.upper_dev = upper_dev,
8963
.linking = false,
8964
};
8965
8966
ASSERT_RTNL();
8967
8968
changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
8969
8970
call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
8971
&changeupper_info.info);
8972
8973
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
8974
8975
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
8976
&changeupper_info.info);
8977
8978
__netdev_update_upper_level(dev, NULL);
8979
__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
8980
8981
__netdev_update_lower_level(upper_dev, priv);
8982
__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
8983
priv);
8984
}
8985
8986
/**
8987
* netdev_upper_dev_unlink - Removes a link to upper device
8988
* @dev: device
8989
* @upper_dev: upper device to remove the link to
8990
*
8991
* Removes a link to a device which is upper to this one. The caller must hold
8992
* the RTNL lock.
8993
*/
8994
void netdev_upper_dev_unlink(struct net_device *dev,
8995
struct net_device *upper_dev)
8996
{
8997
struct netdev_nested_priv priv = {
8998
.flags = NESTED_SYNC_TODO,
8999
.data = NULL,
9000
};
9001
9002
__netdev_upper_dev_unlink(dev, upper_dev, &priv);
9003
}
9004
EXPORT_SYMBOL(netdev_upper_dev_unlink);
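/*
 * Illustrative sketch, not part of this file: how a master-type driver
 * (bond/team/bridge-like) typically pairs the link and unlink calls
 * above.  The example_* names are hypothetical, and a real driver
 * passes its own upper_priv/upper_info instead of NULL.
 */
static int example_enslave(struct net_device *master_dev,
			   struct net_device *slave_dev,
			   void *slave_priv,
			   struct netlink_ext_ack *extack)
{
	ASSERT_RTNL();

	return netdev_master_upper_dev_link(slave_dev, master_dev,
					    slave_priv, NULL, extack);
}

static void example_release(struct net_device *master_dev,
			    struct net_device *slave_dev)
{
	ASSERT_RTNL();

	netdev_upper_dev_unlink(slave_dev, master_dev);
}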
9005
9006
static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
9007
struct net_device *lower_dev,
9008
bool val)
9009
{
9010
struct netdev_adjacent *adj;
9011
9012
adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
9013
if (adj)
9014
adj->ignore = val;
9015
9016
adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
9017
if (adj)
9018
adj->ignore = val;
9019
}
9020
9021
static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
9022
struct net_device *lower_dev)
9023
{
9024
__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
9025
}
9026
9027
static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
9028
struct net_device *lower_dev)
9029
{
9030
__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
9031
}
9032
9033
int netdev_adjacent_change_prepare(struct net_device *old_dev,
9034
struct net_device *new_dev,
9035
struct net_device *dev,
9036
struct netlink_ext_ack *extack)
9037
{
9038
struct netdev_nested_priv priv = {
9039
.flags = 0,
9040
.data = NULL,
9041
};
9042
int err;
9043
9044
if (!new_dev)
9045
return 0;
9046
9047
if (old_dev && new_dev != old_dev)
9048
netdev_adjacent_dev_disable(dev, old_dev);
9049
err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
9050
extack);
9051
if (err) {
9052
if (old_dev && new_dev != old_dev)
9053
netdev_adjacent_dev_enable(dev, old_dev);
9054
return err;
9055
}
9056
9057
return 0;
9058
}
9059
EXPORT_SYMBOL(netdev_adjacent_change_prepare);
9060
9061
void netdev_adjacent_change_commit(struct net_device *old_dev,
9062
struct net_device *new_dev,
9063
struct net_device *dev)
9064
{
9065
struct netdev_nested_priv priv = {
9066
.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
9067
.data = NULL,
9068
};
9069
9070
if (!new_dev || !old_dev)
9071
return;
9072
9073
if (new_dev == old_dev)
9074
return;
9075
9076
netdev_adjacent_dev_enable(dev, old_dev);
9077
__netdev_upper_dev_unlink(old_dev, dev, &priv);
9078
}
9079
EXPORT_SYMBOL(netdev_adjacent_change_commit);
9080
9081
void netdev_adjacent_change_abort(struct net_device *old_dev,
9082
struct net_device *new_dev,
9083
struct net_device *dev)
9084
{
9085
struct netdev_nested_priv priv = {
9086
.flags = 0,
9087
.data = NULL,
9088
};
9089
9090
if (!new_dev)
9091
return;
9092
9093
if (old_dev && new_dev != old_dev)
9094
netdev_adjacent_dev_enable(dev, old_dev);
9095
9096
__netdev_upper_dev_unlink(new_dev, dev, &priv);
9097
}
9098
EXPORT_SYMBOL(netdev_adjacent_change_abort);
9099
9100
/**
9101
* netdev_bonding_info_change - Dispatch event about slave change
9102
* @dev: device
9103
* @bonding_info: info to dispatch
9104
*
9105
* Send NETDEV_BONDING_INFO to netdev notifiers with info.
9106
* The caller must hold the RTNL lock.
9107
*/
9108
void netdev_bonding_info_change(struct net_device *dev,
9109
struct netdev_bonding_info *bonding_info)
9110
{
9111
struct netdev_notifier_bonding_info info = {
9112
.info.dev = dev,
9113
};
9114
9115
memcpy(&info.bonding_info, bonding_info,
9116
sizeof(struct netdev_bonding_info));
9117
call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
9118
&info.info);
9119
}
9120
EXPORT_SYMBOL(netdev_bonding_info_change);
9121
9122
static int netdev_offload_xstats_enable_l3(struct net_device *dev,
9123
struct netlink_ext_ack *extack)
9124
{
9125
struct netdev_notifier_offload_xstats_info info = {
9126
.info.dev = dev,
9127
.info.extack = extack,
9128
.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
9129
};
9130
int err;
9131
int rc;
9132
9133
dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
9134
GFP_KERNEL);
9135
if (!dev->offload_xstats_l3)
9136
return -ENOMEM;
9137
9138
rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
9139
NETDEV_OFFLOAD_XSTATS_DISABLE,
9140
&info.info);
9141
err = notifier_to_errno(rc);
9142
if (err)
9143
goto free_stats;
9144
9145
return 0;
9146
9147
free_stats:
9148
kfree(dev->offload_xstats_l3);
9149
dev->offload_xstats_l3 = NULL;
9150
return err;
9151
}
9152
9153
int netdev_offload_xstats_enable(struct net_device *dev,
9154
enum netdev_offload_xstats_type type,
9155
struct netlink_ext_ack *extack)
9156
{
9157
ASSERT_RTNL();
9158
9159
if (netdev_offload_xstats_enabled(dev, type))
9160
return -EALREADY;
9161
9162
switch (type) {
9163
case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
9164
return netdev_offload_xstats_enable_l3(dev, extack);
9165
}
9166
9167
WARN_ON(1);
9168
return -EINVAL;
9169
}
9170
EXPORT_SYMBOL(netdev_offload_xstats_enable);
9171
9172
static void netdev_offload_xstats_disable_l3(struct net_device *dev)
9173
{
9174
struct netdev_notifier_offload_xstats_info info = {
9175
.info.dev = dev,
9176
.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
9177
};
9178
9179
call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
9180
&info.info);
9181
kfree(dev->offload_xstats_l3);
9182
dev->offload_xstats_l3 = NULL;
9183
}
9184
9185
int netdev_offload_xstats_disable(struct net_device *dev,
9186
enum netdev_offload_xstats_type type)
9187
{
9188
ASSERT_RTNL();
9189
9190
if (!netdev_offload_xstats_enabled(dev, type))
9191
return -EALREADY;
9192
9193
switch (type) {
9194
case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
9195
netdev_offload_xstats_disable_l3(dev);
9196
return 0;
9197
}
9198
9199
WARN_ON(1);
9200
return -EINVAL;
9201
}
9202
EXPORT_SYMBOL(netdev_offload_xstats_disable);
9203
9204
static void netdev_offload_xstats_disable_all(struct net_device *dev)
9205
{
9206
netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
9207
}
9208
9209
static struct rtnl_hw_stats64 *
9210
netdev_offload_xstats_get_ptr(const struct net_device *dev,
9211
enum netdev_offload_xstats_type type)
9212
{
9213
switch (type) {
9214
case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
9215
return dev->offload_xstats_l3;
9216
}
9217
9218
WARN_ON(1);
9219
return NULL;
9220
}
9221
9222
bool netdev_offload_xstats_enabled(const struct net_device *dev,
9223
enum netdev_offload_xstats_type type)
9224
{
9225
ASSERT_RTNL();
9226
9227
return netdev_offload_xstats_get_ptr(dev, type);
9228
}
9229
EXPORT_SYMBOL(netdev_offload_xstats_enabled);
9230
9231
struct netdev_notifier_offload_xstats_ru {
9232
bool used;
9233
};
9234
9235
struct netdev_notifier_offload_xstats_rd {
9236
struct rtnl_hw_stats64 stats;
9237
bool used;
9238
};
9239
9240
static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
9241
const struct rtnl_hw_stats64 *src)
9242
{
9243
dest->rx_packets += src->rx_packets;
9244
dest->tx_packets += src->tx_packets;
9245
dest->rx_bytes += src->rx_bytes;
9246
dest->tx_bytes += src->tx_bytes;
9247
dest->rx_errors += src->rx_errors;
9248
dest->tx_errors += src->tx_errors;
9249
dest->rx_dropped += src->rx_dropped;
9250
dest->tx_dropped += src->tx_dropped;
9251
dest->multicast += src->multicast;
9252
}
9253
9254
static int netdev_offload_xstats_get_used(struct net_device *dev,
9255
enum netdev_offload_xstats_type type,
9256
bool *p_used,
9257
struct netlink_ext_ack *extack)
9258
{
9259
struct netdev_notifier_offload_xstats_ru report_used = {};
9260
struct netdev_notifier_offload_xstats_info info = {
9261
.info.dev = dev,
9262
.info.extack = extack,
9263
.type = type,
9264
.report_used = &report_used,
9265
};
9266
int rc;
9267
9268
WARN_ON(!netdev_offload_xstats_enabled(dev, type));
9269
rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
9270
&info.info);
9271
*p_used = report_used.used;
9272
return notifier_to_errno(rc);
9273
}
9274
9275
static int netdev_offload_xstats_get_stats(struct net_device *dev,
9276
enum netdev_offload_xstats_type type,
9277
struct rtnl_hw_stats64 *p_stats,
9278
bool *p_used,
9279
struct netlink_ext_ack *extack)
9280
{
9281
struct netdev_notifier_offload_xstats_rd report_delta = {};
9282
struct netdev_notifier_offload_xstats_info info = {
9283
.info.dev = dev,
9284
.info.extack = extack,
9285
.type = type,
9286
.report_delta = &report_delta,
9287
};
9288
struct rtnl_hw_stats64 *stats;
9289
int rc;
9290
9291
stats = netdev_offload_xstats_get_ptr(dev, type);
9292
if (WARN_ON(!stats))
9293
return -EINVAL;
9294
9295
rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
9296
&info.info);
9297
9298
/* Cache whatever we got, even if there was an error, otherwise the
9299
* successful stats retrievals would get lost.
9300
*/
9301
netdev_hw_stats64_add(stats, &report_delta.stats);
9302
9303
if (p_stats)
9304
*p_stats = *stats;
9305
*p_used = report_delta.used;
9306
9307
return notifier_to_errno(rc);
9308
}
9309
9310
int netdev_offload_xstats_get(struct net_device *dev,
9311
enum netdev_offload_xstats_type type,
9312
struct rtnl_hw_stats64 *p_stats, bool *p_used,
9313
struct netlink_ext_ack *extack)
9314
{
9315
ASSERT_RTNL();
9316
9317
if (p_stats)
9318
return netdev_offload_xstats_get_stats(dev, type, p_stats,
9319
p_used, extack);
9320
else
9321
return netdev_offload_xstats_get_used(dev, type, p_used,
9322
extack);
9323
}
9324
EXPORT_SYMBOL(netdev_offload_xstats_get);
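/*
 * Illustrative sketch, not part of this file: enabling L3 offload
 * xstats on a device and reading them back, using only the helpers
 * defined above.  Error handling is trimmed for brevity.
 */
static void example_query_l3_hw_stats(struct net_device *dev,
				      struct netlink_ext_ack *extack)
{
	struct rtnl_hw_stats64 stats;
	bool used;
	int err;

	ASSERT_RTNL();

	if (!netdev_offload_xstats_enabled(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3)) {
		err = netdev_offload_xstats_enable(dev,
						   NETDEV_OFFLOAD_XSTATS_TYPE_L3,
						   extack);
		if (err)
			return;
	}

	err = netdev_offload_xstats_get(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3,
					&stats, &used, extack);
	if (!err && used)
		netdev_dbg(dev, "hw rx packets: %llu\n", stats.rx_packets);
}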
9325
9326
void
9327
netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
9328
const struct rtnl_hw_stats64 *stats)
9329
{
9330
report_delta->used = true;
9331
netdev_hw_stats64_add(&report_delta->stats, stats);
9332
}
9333
EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
9334
9335
void
9336
netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
9337
{
9338
report_used->used = true;
9339
}
9340
EXPORT_SYMBOL(netdev_offload_xstats_report_used);
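/*
 * Illustrative sketch, not part of this file: the driver side of the
 * OFFLOAD_XSTATS notifiers, reporting hardware counters back through
 * the two helpers above.  example_read_hw_l3_stats() is a hypothetical
 * routine that fills in counters from the hardware.
 */
void example_read_hw_l3_stats(struct net_device *dev,
			      struct rtnl_hw_stats64 *stats);

static int example_netdevice_event(struct notifier_block *nb,
				   unsigned long event, void *ptr)
{
	struct netdev_notifier_offload_xstats_info *info = ptr;
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct rtnl_hw_stats64 stats = {};

	if (event != NETDEV_OFFLOAD_XSTATS_REPORT_USED &&
	    event != NETDEV_OFFLOAD_XSTATS_REPORT_DELTA)
		return NOTIFY_DONE;

	if (info->type != NETDEV_OFFLOAD_XSTATS_TYPE_L3)
		return NOTIFY_DONE;

	if (event == NETDEV_OFFLOAD_XSTATS_REPORT_USED) {
		netdev_offload_xstats_report_used(info->report_used);
	} else {
		example_read_hw_l3_stats(dev, &stats);
		netdev_offload_xstats_report_delta(info->report_delta, &stats);
	}

	return NOTIFY_OK;
}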
9341
9342
void netdev_offload_xstats_push_delta(struct net_device *dev,
9343
enum netdev_offload_xstats_type type,
9344
const struct rtnl_hw_stats64 *p_stats)
9345
{
9346
struct rtnl_hw_stats64 *stats;
9347
9348
ASSERT_RTNL();
9349
9350
stats = netdev_offload_xstats_get_ptr(dev, type);
9351
if (WARN_ON(!stats))
9352
return;
9353
9354
netdev_hw_stats64_add(stats, p_stats);
9355
}
9356
EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
9357
9358
/**
9359
* netdev_get_xmit_slave - Get the xmit slave of master device
9360
* @dev: device
9361
* @skb: The packet
9362
* @all_slaves: assume all the slaves are active
9363
*
9364
* The reference counters are not incremented so the caller must be
9365
* careful with locks. The caller must hold the RCU read lock.
9366
* %NULL is returned if no slave is found.
9367
*/
9368
9369
struct net_device *netdev_get_xmit_slave(struct net_device *dev,
9370
struct sk_buff *skb,
9371
bool all_slaves)
9372
{
9373
const struct net_device_ops *ops = dev->netdev_ops;
9374
9375
if (!ops->ndo_get_xmit_slave)
9376
return NULL;
9377
return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
9378
}
9379
EXPORT_SYMBOL(netdev_get_xmit_slave);
9380
9381
static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
9382
struct sock *sk)
9383
{
9384
const struct net_device_ops *ops = dev->netdev_ops;
9385
9386
if (!ops->ndo_sk_get_lower_dev)
9387
return NULL;
9388
return ops->ndo_sk_get_lower_dev(dev, sk);
9389
}
9390
9391
/**
9392
* netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
9393
* @dev: device
9394
* @sk: the socket
9395
*
9396
* %NULL is returned if no lower device is found.
9397
*/
9398
9399
struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
9400
struct sock *sk)
9401
{
9402
struct net_device *lower;
9403
9404
lower = netdev_sk_get_lower_dev(dev, sk);
9405
while (lower) {
9406
dev = lower;
9407
lower = netdev_sk_get_lower_dev(dev, sk);
9408
}
9409
9410
return dev;
9411
}
9412
EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
9413
9414
static void netdev_adjacent_add_links(struct net_device *dev)
9415
{
9416
struct netdev_adjacent *iter;
9417
9418
struct net *net = dev_net(dev);
9419
9420
list_for_each_entry(iter, &dev->adj_list.upper, list) {
9421
if (!net_eq(net, dev_net(iter->dev)))
9422
continue;
9423
netdev_adjacent_sysfs_add(iter->dev, dev,
9424
&iter->dev->adj_list.lower);
9425
netdev_adjacent_sysfs_add(dev, iter->dev,
9426
&dev->adj_list.upper);
9427
}
9428
9429
list_for_each_entry(iter, &dev->adj_list.lower, list) {
9430
if (!net_eq(net, dev_net(iter->dev)))
9431
continue;
9432
netdev_adjacent_sysfs_add(iter->dev, dev,
9433
&iter->dev->adj_list.upper);
9434
netdev_adjacent_sysfs_add(dev, iter->dev,
9435
&dev->adj_list.lower);
9436
}
9437
}
9438
9439
static void netdev_adjacent_del_links(struct net_device *dev)
9440
{
9441
struct netdev_adjacent *iter;
9442
9443
struct net *net = dev_net(dev);
9444
9445
list_for_each_entry(iter, &dev->adj_list.upper, list) {
9446
if (!net_eq(net, dev_net(iter->dev)))
9447
continue;
9448
netdev_adjacent_sysfs_del(iter->dev, dev->name,
9449
&iter->dev->adj_list.lower);
9450
netdev_adjacent_sysfs_del(dev, iter->dev->name,
9451
&dev->adj_list.upper);
9452
}
9453
9454
list_for_each_entry(iter, &dev->adj_list.lower, list) {
9455
if (!net_eq(net, dev_net(iter->dev)))
9456
continue;
9457
netdev_adjacent_sysfs_del(iter->dev, dev->name,
9458
&iter->dev->adj_list.upper);
9459
netdev_adjacent_sysfs_del(dev, iter->dev->name,
9460
&dev->adj_list.lower);
9461
}
9462
}
9463
9464
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
9465
{
9466
struct netdev_adjacent *iter;
9467
9468
struct net *net = dev_net(dev);
9469
9470
list_for_each_entry(iter, &dev->adj_list.upper, list) {
9471
if (!net_eq(net, dev_net(iter->dev)))
9472
continue;
9473
netdev_adjacent_sysfs_del(iter->dev, oldname,
9474
&iter->dev->adj_list.lower);
9475
netdev_adjacent_sysfs_add(iter->dev, dev,
9476
&iter->dev->adj_list.lower);
9477
}
9478
9479
list_for_each_entry(iter, &dev->adj_list.lower, list) {
9480
if (!net_eq(net, dev_net(iter->dev)))
9481
continue;
9482
netdev_adjacent_sysfs_del(iter->dev, oldname,
9483
&iter->dev->adj_list.upper);
9484
netdev_adjacent_sysfs_add(iter->dev, dev,
9485
&iter->dev->adj_list.upper);
9486
}
9487
}
9488
9489
void *netdev_lower_dev_get_private(struct net_device *dev,
9490
struct net_device *lower_dev)
9491
{
9492
struct netdev_adjacent *lower;
9493
9494
if (!lower_dev)
9495
return NULL;
9496
lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
9497
if (!lower)
9498
return NULL;
9499
9500
return lower->private;
9501
}
9502
EXPORT_SYMBOL(netdev_lower_dev_get_private);
9503
9504
9505
/**
9506
* netdev_lower_state_changed - Dispatch event about lower device state change
9507
* @lower_dev: device
9508
* @lower_state_info: state to dispatch
9509
*
9510
* Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
9511
* The caller must hold the RTNL lock.
9512
*/
9513
void netdev_lower_state_changed(struct net_device *lower_dev,
9514
void *lower_state_info)
9515
{
9516
struct netdev_notifier_changelowerstate_info changelowerstate_info = {
9517
.info.dev = lower_dev,
9518
};
9519
9520
ASSERT_RTNL();
9521
changelowerstate_info.lower_state_info = lower_state_info;
9522
call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
9523
&changelowerstate_info.info);
9524
}
9525
EXPORT_SYMBOL(netdev_lower_state_changed);
9526
9527
static void dev_change_rx_flags(struct net_device *dev, int flags)
9528
{
9529
const struct net_device_ops *ops = dev->netdev_ops;
9530
9531
if (ops->ndo_change_rx_flags)
9532
ops->ndo_change_rx_flags(dev, flags);
9533
}
9534
9535
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
9536
{
9537
unsigned int old_flags = dev->flags;
9538
unsigned int promiscuity, flags;
9539
kuid_t uid;
9540
kgid_t gid;
9541
9542
ASSERT_RTNL();
9543
9544
promiscuity = dev->promiscuity + inc;
9545
if (promiscuity == 0) {
9546
/*
9547
* Avoid overflow.
9548
* If inc would cause an overflow, leave promiscuity untouched and return an error.
9549
*/
9550
if (unlikely(inc > 0)) {
9551
netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
9552
return -EOVERFLOW;
9553
}
9554
flags = old_flags & ~IFF_PROMISC;
9555
} else {
9556
flags = old_flags | IFF_PROMISC;
9557
}
9558
WRITE_ONCE(dev->promiscuity, promiscuity);
9559
if (flags != old_flags) {
9560
WRITE_ONCE(dev->flags, flags);
9561
netdev_info(dev, "%s promiscuous mode\n",
9562
dev->flags & IFF_PROMISC ? "entered" : "left");
9563
if (audit_enabled) {
9564
current_uid_gid(&uid, &gid);
9565
audit_log(audit_context(), GFP_ATOMIC,
9566
AUDIT_ANOM_PROMISCUOUS,
9567
"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
9568
dev->name, (dev->flags & IFF_PROMISC),
9569
(old_flags & IFF_PROMISC),
9570
from_kuid(&init_user_ns, audit_get_loginuid(current)),
9571
from_kuid(&init_user_ns, uid),
9572
from_kgid(&init_user_ns, gid),
9573
audit_get_sessionid(current));
9574
}
9575
9576
dev_change_rx_flags(dev, IFF_PROMISC);
9577
}
9578
if (notify) {
9579
/* The ops lock is only required to ensure consistent locking
9580
* for `NETDEV_CHANGE` notifiers. This function is sometimes
9581
* called without the lock, even for devices that are ops
9582
* locked, such as in `dev_uc_sync_multiple` when using
9583
* bonding or teaming.
9584
*/
9585
netdev_ops_assert_locked(dev);
9586
__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
9587
}
9588
return 0;
9589
}
9590
9591
int netif_set_promiscuity(struct net_device *dev, int inc)
9592
{
9593
unsigned int old_flags = dev->flags;
9594
int err;
9595
9596
err = __dev_set_promiscuity(dev, inc, true);
9597
if (err < 0)
9598
return err;
9599
if (dev->flags != old_flags)
9600
dev_set_rx_mode(dev);
9601
return err;
9602
}
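/*
 * Illustrative sketch, not part of this file: promiscuity is a
 * reference count, so every +1 must eventually be paired with a -1.
 * A packet-tap style user bumps it while active and drops it on
 * teardown.  dev_set_promiscuity() is used here as the conventional
 * entry point for such callers (called under RTNL); struct
 * example_tap is hypothetical.
 */
struct example_tap {
	struct net_device *dev;
	bool promisc_on;
};

static int example_tap_start(struct example_tap *tap)
{
	int err;

	ASSERT_RTNL();

	err = dev_set_promiscuity(tap->dev, 1);
	if (err < 0)
		return err;

	tap->promisc_on = true;
	return 0;
}

static void example_tap_stop(struct example_tap *tap)
{
	ASSERT_RTNL();

	if (tap->promisc_on) {
		dev_set_promiscuity(tap->dev, -1);
		tap->promisc_on = false;
	}
}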
9603
9604
int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
9605
{
9606
unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
9607
unsigned int allmulti, flags;
9608
9609
ASSERT_RTNL();
9610
9611
allmulti = dev->allmulti + inc;
9612
if (allmulti == 0) {
9613
/*
9614
* Avoid overflow.
9615
* If inc would cause an overflow, leave allmulti untouched and return an error.
9616
*/
9617
if (unlikely(inc > 0)) {
9618
netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
9619
return -EOVERFLOW;
9620
}
9621
flags = old_flags & ~IFF_ALLMULTI;
9622
} else {
9623
flags = old_flags | IFF_ALLMULTI;
9624
}
9625
WRITE_ONCE(dev->allmulti, allmulti);
9626
if (flags != old_flags) {
9627
WRITE_ONCE(dev->flags, flags);
9628
netdev_info(dev, "%s allmulticast mode\n",
9629
dev->flags & IFF_ALLMULTI ? "entered" : "left");
9630
dev_change_rx_flags(dev, IFF_ALLMULTI);
9631
dev_set_rx_mode(dev);
9632
if (notify)
9633
__dev_notify_flags(dev, old_flags,
9634
dev->gflags ^ old_gflags, 0, NULL);
9635
}
9636
return 0;
9637
}
9638
9639
/*
9640
* Upload unicast and multicast address lists to device and
9641
* configure RX filtering. When the device doesn't support unicast
9642
* filtering it is put in promiscuous mode while unicast addresses
9643
* are present.
9644
*/
9645
void __dev_set_rx_mode(struct net_device *dev)
9646
{
9647
const struct net_device_ops *ops = dev->netdev_ops;
9648
9649
/* dev_open will call this function so the list will stay sane. */
9650
if (!(dev->flags&IFF_UP))
9651
return;
9652
9653
if (!netif_device_present(dev))
9654
return;
9655
9656
if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
9657
/* Unicast address changes may only happen under the rtnl,
9658
* therefore calling __dev_set_promiscuity here is safe.
9659
*/
9660
if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
9661
__dev_set_promiscuity(dev, 1, false);
9662
dev->uc_promisc = true;
9663
} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
9664
__dev_set_promiscuity(dev, -1, false);
9665
dev->uc_promisc = false;
9666
}
9667
}
9668
9669
if (ops->ndo_set_rx_mode)
9670
ops->ndo_set_rx_mode(dev);
9671
}
9672
9673
void dev_set_rx_mode(struct net_device *dev)
9674
{
9675
netif_addr_lock_bh(dev);
9676
__dev_set_rx_mode(dev);
9677
netif_addr_unlock_bh(dev);
9678
}
9679
9680
/**
9681
* netif_get_flags() - get flags reported to userspace
9682
* @dev: device
9683
*
9684
* Get the combination of flag bits exported through APIs to userspace.
9685
*/
9686
unsigned int netif_get_flags(const struct net_device *dev)
9687
{
9688
unsigned int flags;
9689
9690
flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
9691
IFF_ALLMULTI |
9692
IFF_RUNNING |
9693
IFF_LOWER_UP |
9694
IFF_DORMANT)) |
9695
(READ_ONCE(dev->gflags) & (IFF_PROMISC |
9696
IFF_ALLMULTI));
9697
9698
if (netif_running(dev)) {
9699
if (netif_oper_up(dev))
9700
flags |= IFF_RUNNING;
9701
if (netif_carrier_ok(dev))
9702
flags |= IFF_LOWER_UP;
9703
if (netif_dormant(dev))
9704
flags |= IFF_DORMANT;
9705
}
9706
9707
return flags;
9708
}
9709
EXPORT_SYMBOL(netif_get_flags);
9710
9711
int __dev_change_flags(struct net_device *dev, unsigned int flags,
9712
struct netlink_ext_ack *extack)
9713
{
9714
unsigned int old_flags = dev->flags;
9715
int ret;
9716
9717
ASSERT_RTNL();
9718
9719
/*
9720
* Set the flags on our device.
9721
*/
9722
9723
dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
9724
IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
9725
IFF_AUTOMEDIA)) |
9726
(dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
9727
IFF_ALLMULTI));
9728
9729
/*
9730
* Load in the correct multicast list now the flags have changed.
9731
*/
9732
9733
if ((old_flags ^ flags) & IFF_MULTICAST)
9734
dev_change_rx_flags(dev, IFF_MULTICAST);
9735
9736
dev_set_rx_mode(dev);
9737
9738
/*
9739
* Have we downed the interface. We handle IFF_UP ourselves
9740
* according to user attempts to set it, rather than blindly
9741
* setting it.
9742
*/
9743
9744
ret = 0;
9745
if ((old_flags ^ flags) & IFF_UP) {
9746
if (old_flags & IFF_UP)
9747
__dev_close(dev);
9748
else
9749
ret = __dev_open(dev, extack);
9750
}
9751
9752
if ((flags ^ dev->gflags) & IFF_PROMISC) {
9753
int inc = (flags & IFF_PROMISC) ? 1 : -1;
9754
old_flags = dev->flags;
9755
9756
dev->gflags ^= IFF_PROMISC;
9757
9758
if (__dev_set_promiscuity(dev, inc, false) >= 0)
9759
if (dev->flags != old_flags)
9760
dev_set_rx_mode(dev);
9761
}
9762
9763
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
9764
* is important. Some (broken) drivers set IFF_PROMISC when
9765
* IFF_ALLMULTI is requested, without asking us and without reporting it.
9766
*/
9767
if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
9768
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
9769
9770
dev->gflags ^= IFF_ALLMULTI;
9771
netif_set_allmulti(dev, inc, false);
9772
}
9773
9774
return ret;
9775
}
9776
9777
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
9778
unsigned int gchanges, u32 portid,
9779
const struct nlmsghdr *nlh)
9780
{
9781
unsigned int changes = dev->flags ^ old_flags;
9782
9783
if (gchanges)
9784
rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
9785
9786
if (changes & IFF_UP) {
9787
if (dev->flags & IFF_UP)
9788
call_netdevice_notifiers(NETDEV_UP, dev);
9789
else
9790
call_netdevice_notifiers(NETDEV_DOWN, dev);
9791
}
9792
9793
if (dev->flags & IFF_UP &&
9794
(changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
9795
struct netdev_notifier_change_info change_info = {
9796
.info = {
9797
.dev = dev,
9798
},
9799
.flags_changed = changes,
9800
};
9801
9802
call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
9803
}
9804
}
9805
9806
int netif_change_flags(struct net_device *dev, unsigned int flags,
9807
struct netlink_ext_ack *extack)
9808
{
9809
int ret;
9810
unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
9811
9812
ret = __dev_change_flags(dev, flags, extack);
9813
if (ret < 0)
9814
return ret;
9815
9816
changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
9817
__dev_notify_flags(dev, old_flags, changes, 0, NULL);
9818
return ret;
9819
}
9820
9821
int __netif_set_mtu(struct net_device *dev, int new_mtu)
9822
{
9823
const struct net_device_ops *ops = dev->netdev_ops;
9824
9825
if (ops->ndo_change_mtu)
9826
return ops->ndo_change_mtu(dev, new_mtu);
9827
9828
/* Pairs with all the lockless reads of dev->mtu in the stack */
9829
WRITE_ONCE(dev->mtu, new_mtu);
9830
return 0;
9831
}
9832
EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL");
9833
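/* Check that @new_mtu lies within the device's advertised MTU range,
 * reporting the reason through @extack on failure.
 */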
9834
int dev_validate_mtu(struct net_device *dev, int new_mtu,
9835
struct netlink_ext_ack *extack)
9836
{
9837
/* MTU must be positive, and in range */
9838
if (new_mtu < 0 || new_mtu < dev->min_mtu) {
9839
NL_SET_ERR_MSG(extack, "mtu less than device minimum");
9840
return -EINVAL;
9841
}
9842
9843
if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
9844
NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
9845
return -EINVAL;
9846
}
9847
return 0;
9848
}
9849
9850
/**
9851
* netif_set_mtu_ext() - Change maximum transfer unit
9852
* @dev: device
9853
* @new_mtu: new transfer unit
9854
* @extack: netlink extended ack
9855
*
9856
* Change the maximum transfer size of the network device.
9857
*
9858
* Return: 0 on success, -errno on failure.
9859
*/
9860
int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
9861
struct netlink_ext_ack *extack)
9862
{
9863
int err, orig_mtu;
9864
9865
netdev_ops_assert_locked(dev);
9866
9867
if (new_mtu == dev->mtu)
9868
return 0;
9869
9870
err = dev_validate_mtu(dev, new_mtu, extack);
9871
if (err)
9872
return err;
9873
9874
if (!netif_device_present(dev))
9875
return -ENODEV;
9876
9877
err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
9878
err = notifier_to_errno(err);
9879
if (err)
9880
return err;
9881
9882
orig_mtu = dev->mtu;
9883
err = __netif_set_mtu(dev, new_mtu);
9884
9885
if (!err) {
9886
err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
9887
orig_mtu);
9888
err = notifier_to_errno(err);
9889
if (err) {
9890
/* Setting the MTU back and notifying everyone again,
9891
* so that they have a chance to revert changes.
9892
*/
9893
__netif_set_mtu(dev, orig_mtu);
9894
call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
9895
new_mtu);
9896
}
9897
}
9898
return err;
9899
}
9900
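/* Convenience wrapper around netif_set_mtu_ext() that logs the extack
 * message (rate-limited) instead of handing it back to the caller.
 */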
9901
int netif_set_mtu(struct net_device *dev, int new_mtu)
9902
{
9903
struct netlink_ext_ack extack;
9904
int err;
9905
9906
memset(&extack, 0, sizeof(extack));
9907
err = netif_set_mtu_ext(dev, new_mtu, &extack);
9908
if (err && extack._msg)
9909
net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
9910
return err;
9911
}
9912
EXPORT_SYMBOL(netif_set_mtu);
9913
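/* Update dev->tx_queue_len, notify NETDEV_CHANGE_TX_QUEUE_LEN listeners
 * and resize the attached qdiscs, restoring the old length if either
 * step fails.
 */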
9914
int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
9915
{
9916
unsigned int orig_len = dev->tx_queue_len;
9917
int res;
9918
9919
if (new_len != (unsigned int)new_len)
9920
return -ERANGE;
9921
9922
if (new_len != orig_len) {
9923
WRITE_ONCE(dev->tx_queue_len, new_len);
9924
res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
9925
res = notifier_to_errno(res);
9926
if (res)
9927
goto err_rollback;
9928
res = dev_qdisc_change_tx_queue_len(dev);
9929
if (res)
9930
goto err_rollback;
9931
}
9932
9933
return 0;
9934
9935
err_rollback:
9936
netdev_err(dev, "refused to change device tx_queue_len\n");
9937
WRITE_ONCE(dev->tx_queue_len, orig_len);
9938
return res;
9939
}
9940
9941
void netif_set_group(struct net_device *dev, int new_group)
9942
{
9943
dev->group = new_group;
9944
}
9945
9946
/**
9947
* netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR.
9948
* @dev: device
9949
* @addr: new address
9950
* @extack: netlink extended ack
9951
*
9952
* Return: 0 on success, -errno on failure.
9953
*/
9954
int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr,
9955
struct netlink_ext_ack *extack)
9956
{
9957
struct netdev_notifier_pre_changeaddr_info info = {
9958
.info.dev = dev,
9959
.info.extack = extack,
9960
.dev_addr = addr,
9961
};
9962
int rc;
9963
9964
rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
9965
return notifier_to_errno(rc);
9966
}
9967
EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL");
9968
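/* Set a new hardware address through ndo_set_mac_address() after the
 * NETDEV_PRE_CHANGEADDR notifier has approved it, then announce
 * NETDEV_CHANGEADDR and feed the address to add_device_randomness().
 */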
9969
int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
9970
struct netlink_ext_ack *extack)
9971
{
9972
const struct net_device_ops *ops = dev->netdev_ops;
9973
int err;
9974
9975
if (!ops->ndo_set_mac_address)
9976
return -EOPNOTSUPP;
9977
if (ss->ss_family != dev->type)
9978
return -EINVAL;
9979
if (!netif_device_present(dev))
9980
return -ENODEV;
9981
err = netif_pre_changeaddr_notify(dev, ss->__data, extack);
9982
if (err)
9983
return err;
9984
if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) {
9985
err = ops->ndo_set_mac_address(dev, ss);
9986
if (err)
9987
return err;
9988
}
9989
dev->addr_assign_type = NET_ADDR_SET;
9990
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
9991
add_device_randomness(dev->dev_addr, dev->addr_len);
9992
return 0;
9993
}
9994
9995
DECLARE_RWSEM(dev_addr_sem);
9996
9997
/* "sa" is a true struct sockaddr with limited "sa_data" member. */
9998
int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
9999
{
10000
size_t size = sizeof(sa->sa_data);
10001
struct net_device *dev;
10002
int ret = 0;
10003
10004
down_read(&dev_addr_sem);
10005
rcu_read_lock();
10006
10007
dev = dev_get_by_name_rcu(net, dev_name);
10008
if (!dev) {
10009
ret = -ENODEV;
10010
goto unlock;
10011
}
10012
if (!dev->addr_len)
10013
memset(sa->sa_data, 0, size);
10014
else
10015
memcpy(sa->sa_data, dev->dev_addr,
10016
min_t(size_t, size, dev->addr_len));
10017
sa->sa_family = dev->type;
10018
10019
unlock:
10020
rcu_read_unlock();
10021
up_read(&dev_addr_sem);
10022
return ret;
10023
}
10024
EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL");
10025
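/* Ask the driver to force its carrier state via ndo_change_carrier();
 * -EOPNOTSUPP is returned when the driver does not implement the hook.
 */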
10026
int netif_change_carrier(struct net_device *dev, bool new_carrier)
10027
{
10028
const struct net_device_ops *ops = dev->netdev_ops;
10029
10030
if (!ops->ndo_change_carrier)
10031
return -EOPNOTSUPP;
10032
if (!netif_device_present(dev))
10033
return -ENODEV;
10034
return ops->ndo_change_carrier(dev, new_carrier);
10035
}
10036
10037
/**
10038
* dev_get_phys_port_id - Get device physical port ID
10039
* @dev: device
10040
* @ppid: port ID
10041
*
10042
* Get device physical port ID
10043
*/
10044
int dev_get_phys_port_id(struct net_device *dev,
10045
struct netdev_phys_item_id *ppid)
10046
{
10047
const struct net_device_ops *ops = dev->netdev_ops;
10048
10049
if (!ops->ndo_get_phys_port_id)
10050
return -EOPNOTSUPP;
10051
return ops->ndo_get_phys_port_id(dev, ppid);
10052
}
10053
10054
/**
10055
* dev_get_phys_port_name - Get device physical port name
10056
* @dev: device
10057
* @name: port name
10058
* @len: limit of bytes to copy to name
10059
*
10060
* Get device physical port name
10061
*/
10062
int dev_get_phys_port_name(struct net_device *dev,
10063
char *name, size_t len)
10064
{
10065
const struct net_device_ops *ops = dev->netdev_ops;
10066
int err;
10067
10068
if (ops->ndo_get_phys_port_name) {
10069
err = ops->ndo_get_phys_port_name(dev, name, len);
10070
if (err != -EOPNOTSUPP)
10071
return err;
10072
}
10073
return devlink_compat_phys_port_name_get(dev, name, len);
10074
}
10075
10076
/**
10077
* netif_get_port_parent_id() - Get the device's port parent identifier
10078
* @dev: network device
10079
* @ppid: pointer to a storage for the port's parent identifier
10080
* @recurse: allow/disallow recursion to lower devices
10081
*
10082
* Get the device's port parent identifier.
10083
*
10084
* Return: 0 on success, -errno on failure.
10085
*/
10086
int netif_get_port_parent_id(struct net_device *dev,
10087
struct netdev_phys_item_id *ppid, bool recurse)
10088
{
10089
const struct net_device_ops *ops = dev->netdev_ops;
10090
struct netdev_phys_item_id first = { };
10091
struct net_device *lower_dev;
10092
struct list_head *iter;
10093
int err;
10094
10095
if (ops->ndo_get_port_parent_id) {
10096
err = ops->ndo_get_port_parent_id(dev, ppid);
10097
if (err != -EOPNOTSUPP)
10098
return err;
10099
}
10100
10101
err = devlink_compat_switch_id_get(dev, ppid);
10102
if (!recurse || err != -EOPNOTSUPP)
10103
return err;
10104
10105
netdev_for_each_lower_dev(dev, lower_dev, iter) {
10106
err = netif_get_port_parent_id(lower_dev, ppid, true);
10107
if (err)
10108
break;
10109
if (!first.id_len)
10110
first = *ppid;
10111
else if (memcmp(&first, ppid, sizeof(*ppid)))
10112
return -EOPNOTSUPP;
10113
}
10114
10115
return err;
10116
}
10117
EXPORT_SYMBOL(netif_get_port_parent_id);
10118
10119
/**
10120
* netdev_port_same_parent_id - Indicate if two network devices have
10121
* the same port parent identifier
10122
* @a: first network device
10123
* @b: second network device
10124
*/
10125
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
10126
{
10127
struct netdev_phys_item_id a_id = { };
10128
struct netdev_phys_item_id b_id = { };
10129
10130
if (netif_get_port_parent_id(a, &a_id, true) ||
10131
netif_get_port_parent_id(b, &b_id, true))
10132
return false;
10133
10134
return netdev_phys_item_id_same(&a_id, &b_id);
10135
}
10136
EXPORT_SYMBOL(netdev_port_same_parent_id);
10137
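/* Generic proto_down handling for devices that set dev->change_proto_down:
 * mirror the requested state on the carrier and record it in
 * dev->proto_down.
 */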
10138
int netif_change_proto_down(struct net_device *dev, bool proto_down)
10139
{
10140
if (!dev->change_proto_down)
10141
return -EOPNOTSUPP;
10142
if (!netif_device_present(dev))
10143
return -ENODEV;
10144
if (proto_down)
10145
netif_carrier_off(dev);
10146
else
10147
netif_carrier_on(dev);
10148
WRITE_ONCE(dev->proto_down, proto_down);
10149
return 0;
10150
}
10151
10152
/**
10153
* netdev_change_proto_down_reason_locked - proto down reason
10154
*
10155
* @dev: device
10156
* @mask: proto down mask
10157
* @value: proto down value
10158
*/
10159
void netdev_change_proto_down_reason_locked(struct net_device *dev,
10160
unsigned long mask, u32 value)
10161
{
10162
u32 proto_down_reason;
10163
int b;
10164
10165
if (!mask) {
10166
proto_down_reason = value;
10167
} else {
10168
proto_down_reason = dev->proto_down_reason;
10169
for_each_set_bit(b, &mask, 32) {
10170
if (value & (1 << b))
10171
proto_down_reason |= BIT(b);
10172
else
10173
proto_down_reason &= ~BIT(b);
10174
}
10175
}
10176
WRITE_ONCE(dev->proto_down_reason, proto_down_reason);
10177
}
10178
10179
struct bpf_xdp_link {
10180
struct bpf_link link;
10181
struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
10182
int flags;
10183
};
10184
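/* Map the XDP_FLAGS_*_MODE attach flags onto an internal bpf_xdp_mode.
 * With no mode flag set, prefer native (driver) mode when ndo_bpf is
 * implemented and fall back to generic (skb) mode otherwise.
 */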
10185
static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
10186
{
10187
if (flags & XDP_FLAGS_HW_MODE)
10188
return XDP_MODE_HW;
10189
if (flags & XDP_FLAGS_DRV_MODE)
10190
return XDP_MODE_DRV;
10191
if (flags & XDP_FLAGS_SKB_MODE)
10192
return XDP_MODE_SKB;
10193
return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
10194
}
10195
10196
static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
10197
{
10198
switch (mode) {
10199
case XDP_MODE_SKB:
10200
return generic_xdp_install;
10201
case XDP_MODE_DRV:
10202
case XDP_MODE_HW:
10203
return dev->netdev_ops->ndo_bpf;
10204
default:
10205
return NULL;
10206
}
10207
}
10208
10209
static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
10210
enum bpf_xdp_mode mode)
10211
{
10212
return dev->xdp_state[mode].link;
10213
}
10214
10215
static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
10216
enum bpf_xdp_mode mode)
10217
{
10218
struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
10219
10220
if (link)
10221
return link->link.prog;
10222
return dev->xdp_state[mode].prog;
10223
}
10224
10225
u8 dev_xdp_prog_count(struct net_device *dev)
10226
{
10227
u8 count = 0;
10228
int i;
10229
10230
for (i = 0; i < __MAX_XDP_MODE; i++)
10231
if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
10232
count++;
10233
return count;
10234
}
10235
EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
10236
10237
u8 dev_xdp_sb_prog_count(struct net_device *dev)
10238
{
10239
u8 count = 0;
10240
int i;
10241
10242
for (i = 0; i < __MAX_XDP_MODE; i++)
10243
if (dev->xdp_state[i].prog &&
10244
!dev->xdp_state[i].prog->aux->xdp_has_frags)
10245
count++;
10246
return count;
10247
}
10248
10249
int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
10250
{
10251
if (!dev->netdev_ops->ndo_bpf)
10252
return -EOPNOTSUPP;
10253
10254
if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
10255
bpf->command == XDP_SETUP_PROG &&
10256
bpf->prog && !bpf->prog->aux->xdp_has_frags) {
10257
NL_SET_ERR_MSG(bpf->extack,
10258
"unable to propagate XDP to device using tcp-data-split");
10259
return -EBUSY;
10260
}
10261
10262
if (dev_get_min_mp_channel_count(dev)) {
10263
NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
10264
return -EBUSY;
10265
}
10266
10267
return dev->netdev_ops->ndo_bpf(dev, bpf);
10268
}
10269
EXPORT_SYMBOL_GPL(netif_xdp_propagate);
10270
10271
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
10272
{
10273
struct bpf_prog *prog = dev_xdp_prog(dev, mode);
10274
10275
return prog ? prog->aux->id : 0;
10276
}
10277
10278
static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
10279
struct bpf_xdp_link *link)
10280
{
10281
dev->xdp_state[mode].link = link;
10282
dev->xdp_state[mode].prog = NULL;
10283
}
10284
10285
static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
10286
struct bpf_prog *prog)
10287
{
10288
dev->xdp_state[mode].link = NULL;
10289
dev->xdp_state[mode].prog = prog;
10290
}
10291
10292
static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
10293
bpf_op_t bpf_op, struct netlink_ext_ack *extack,
10294
u32 flags, struct bpf_prog *prog)
10295
{
10296
struct netdev_bpf xdp;
10297
int err;
10298
10299
netdev_ops_assert_locked(dev);
10300
10301
if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
10302
prog && !prog->aux->xdp_has_frags) {
10303
NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split");
10304
return -EBUSY;
10305
}
10306
10307
if (dev_get_min_mp_channel_count(dev)) {
10308
NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
10309
return -EBUSY;
10310
}
10311
10312
memset(&xdp, 0, sizeof(xdp));
10313
xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
10314
xdp.extack = extack;
10315
xdp.flags = flags;
10316
xdp.prog = prog;
10317
10318
/* Drivers assume refcnt is already incremented (i.e., prog pointer is
10319
* "moved" into driver), so they don't increment it on their own, but
10320
* they do decrement refcnt when program is detached or replaced.
10321
* Given net_device also owns link/prog, we need to bump refcnt here
10322
* to prevent drivers from underflowing it.
10323
*/
10324
if (prog)
10325
bpf_prog_inc(prog);
10326
err = bpf_op(dev, &xdp);
10327
if (err) {
10328
if (prog)
10329
bpf_prog_put(prog);
10330
return err;
10331
}
10332
10333
if (mode != XDP_MODE_HW)
10334
bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
10335
10336
return 0;
10337
}
10338
10339
static void dev_xdp_uninstall(struct net_device *dev)
10340
{
10341
struct bpf_xdp_link *link;
10342
struct bpf_prog *prog;
10343
enum bpf_xdp_mode mode;
10344
bpf_op_t bpf_op;
10345
10346
ASSERT_RTNL();
10347
10348
for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
10349
prog = dev_xdp_prog(dev, mode);
10350
if (!prog)
10351
continue;
10352
10353
bpf_op = dev_xdp_bpf_op(dev, mode);
10354
if (!bpf_op)
10355
continue;
10356
10357
WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
10358
10359
/* auto-detach link from net device */
10360
link = dev_xdp_link(dev, mode);
10361
if (link)
10362
link->dev = NULL;
10363
else
10364
bpf_prog_put(prog);
10365
10366
dev_xdp_set_link(dev, mode, NULL);
10367
}
10368
}
10369
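/* Common attach path for both BPF links and plain prog fds: validate the
 * requested mode and flags, reject conflicting attachments, install the
 * program through the driver and record the new link/prog in dev->xdp_state.
 */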
10370
static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
10371
struct bpf_xdp_link *link, struct bpf_prog *new_prog,
10372
struct bpf_prog *old_prog, u32 flags)
10373
{
10374
unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
10375
struct bpf_prog *cur_prog;
10376
struct net_device *upper;
10377
struct list_head *iter;
10378
enum bpf_xdp_mode mode;
10379
bpf_op_t bpf_op;
10380
int err;
10381
10382
ASSERT_RTNL();
10383
10384
/* either link or prog attachment, never both */
10385
if (link && (new_prog || old_prog))
10386
return -EINVAL;
10387
/* link supports only XDP mode flags */
10388
if (link && (flags & ~XDP_FLAGS_MODES)) {
10389
NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
10390
return -EINVAL;
10391
}
10392
/* just one XDP mode bit should be set, zero defaults to drv/skb mode */
10393
if (num_modes > 1) {
10394
NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
10395
return -EINVAL;
10396
}
10397
/* avoid ambiguity if offload + drv/skb mode progs are both loaded */
10398
if (!num_modes && dev_xdp_prog_count(dev) > 1) {
10399
NL_SET_ERR_MSG(extack,
10400
"More than one program loaded, unset mode is ambiguous");
10401
return -EINVAL;
10402
}
10403
/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
10404
if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
10405
NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
10406
return -EINVAL;
10407
}
10408
10409
mode = dev_xdp_mode(dev, flags);
10410
/* can't replace attached link */
10411
if (dev_xdp_link(dev, mode)) {
10412
NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
10413
return -EBUSY;
10414
}
10415
10416
/* don't allow if an upper device already has a program */
10417
netdev_for_each_upper_dev_rcu(dev, upper, iter) {
10418
if (dev_xdp_prog_count(upper) > 0) {
10419
NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
10420
return -EEXIST;
10421
}
10422
}
10423
10424
cur_prog = dev_xdp_prog(dev, mode);
10425
/* can't replace attached prog with link */
10426
if (link && cur_prog) {
10427
NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
10428
return -EBUSY;
10429
}
10430
if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
10431
NL_SET_ERR_MSG(extack, "Active program does not match expected");
10432
return -EEXIST;
10433
}
10434
10435
/* put effective new program into new_prog */
10436
if (link)
10437
new_prog = link->link.prog;
10438
10439
if (new_prog) {
10440
bool offload = mode == XDP_MODE_HW;
10441
enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
10442
? XDP_MODE_DRV : XDP_MODE_SKB;
10443
10444
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
10445
NL_SET_ERR_MSG(extack, "XDP program already attached");
10446
return -EBUSY;
10447
}
10448
if (!offload && dev_xdp_prog(dev, other_mode)) {
10449
NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
10450
return -EEXIST;
10451
}
10452
if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
10453
NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
10454
return -EINVAL;
10455
}
10456
if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
10457
NL_SET_ERR_MSG(extack, "Program bound to different device");
10458
return -EINVAL;
10459
}
10460
if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
10461
NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
10462
return -EINVAL;
10463
}
10464
if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
10465
NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
10466
return -EINVAL;
10467
}
10468
if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
10469
NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
10470
return -EINVAL;
10471
}
10472
}
10473
10474
/* don't call drivers if the effective program didn't change */
10475
if (new_prog != cur_prog) {
10476
bpf_op = dev_xdp_bpf_op(dev, mode);
10477
if (!bpf_op) {
10478
NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
10479
return -EOPNOTSUPP;
10480
}
10481
10482
err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
10483
if (err)
10484
return err;
10485
}
10486
10487
if (link)
10488
dev_xdp_set_link(dev, mode, link);
10489
else
10490
dev_xdp_set_prog(dev, mode, new_prog);
10491
if (cur_prog)
10492
bpf_prog_put(cur_prog);
10493
10494
return 0;
10495
}
10496
10497
static int dev_xdp_attach_link(struct net_device *dev,
10498
struct netlink_ext_ack *extack,
10499
struct bpf_xdp_link *link)
10500
{
10501
return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
10502
}
10503
10504
static int dev_xdp_detach_link(struct net_device *dev,
10505
struct netlink_ext_ack *extack,
10506
struct bpf_xdp_link *link)
10507
{
10508
enum bpf_xdp_mode mode;
10509
bpf_op_t bpf_op;
10510
10511
ASSERT_RTNL();
10512
10513
mode = dev_xdp_mode(dev, link->flags);
10514
if (dev_xdp_link(dev, mode) != link)
10515
return -EINVAL;
10516
10517
bpf_op = dev_xdp_bpf_op(dev, mode);
10518
WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
10519
dev_xdp_set_link(dev, mode, NULL);
10520
return 0;
10521
}
10522
10523
static void bpf_xdp_link_release(struct bpf_link *link)
10524
{
10525
struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
10526
10527
rtnl_lock();
10528
10529
/* if racing with net_device's tear down, xdp_link->dev might be
10530
* already NULL, in which case link was already auto-detached
10531
*/
10532
if (xdp_link->dev) {
10533
netdev_lock_ops(xdp_link->dev);
10534
WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
10535
netdev_unlock_ops(xdp_link->dev);
10536
xdp_link->dev = NULL;
10537
}
10538
10539
rtnl_unlock();
10540
}
10541
10542
static int bpf_xdp_link_detach(struct bpf_link *link)
10543
{
10544
bpf_xdp_link_release(link);
10545
return 0;
10546
}
10547
10548
static void bpf_xdp_link_dealloc(struct bpf_link *link)
10549
{
10550
struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
10551
10552
kfree(xdp_link);
10553
}
10554
10555
static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
10556
struct seq_file *seq)
10557
{
10558
struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
10559
u32 ifindex = 0;
10560
10561
rtnl_lock();
10562
if (xdp_link->dev)
10563
ifindex = xdp_link->dev->ifindex;
10564
rtnl_unlock();
10565
10566
seq_printf(seq, "ifindex:\t%u\n", ifindex);
10567
}
10568
10569
static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
10570
struct bpf_link_info *info)
10571
{
10572
struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
10573
u32 ifindex = 0;
10574
10575
rtnl_lock();
10576
if (xdp_link->dev)
10577
ifindex = xdp_link->dev->ifindex;
10578
rtnl_unlock();
10579
10580
info->xdp.ifindex = ifindex;
10581
return 0;
10582
}
10583
10584
static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
10585
struct bpf_prog *old_prog)
10586
{
10587
struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
10588
enum bpf_xdp_mode mode;
10589
bpf_op_t bpf_op;
10590
int err = 0;
10591
10592
rtnl_lock();
10593
10594
/* link might have been auto-released already, so fail */
10595
if (!xdp_link->dev) {
10596
err = -ENOLINK;
10597
goto out_unlock;
10598
}
10599
10600
if (old_prog && link->prog != old_prog) {
10601
err = -EPERM;
10602
goto out_unlock;
10603
}
10604
old_prog = link->prog;
10605
if (old_prog->type != new_prog->type ||
10606
old_prog->expected_attach_type != new_prog->expected_attach_type) {
10607
err = -EINVAL;
10608
goto out_unlock;
10609
}
10610
10611
if (old_prog == new_prog) {
10612
/* no-op, don't disturb drivers */
10613
bpf_prog_put(new_prog);
10614
goto out_unlock;
10615
}
10616
10617
netdev_lock_ops(xdp_link->dev);
10618
mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
10619
bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
10620
err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
10621
xdp_link->flags, new_prog);
10622
netdev_unlock_ops(xdp_link->dev);
10623
if (err)
10624
goto out_unlock;
10625
10626
old_prog = xchg(&link->prog, new_prog);
10627
bpf_prog_put(old_prog);
10628
10629
out_unlock:
10630
rtnl_unlock();
10631
return err;
10632
}
10633
10634
static const struct bpf_link_ops bpf_xdp_link_lops = {
10635
.release = bpf_xdp_link_release,
10636
.dealloc = bpf_xdp_link_dealloc,
10637
.detach = bpf_xdp_link_detach,
10638
.show_fdinfo = bpf_xdp_link_show_fdinfo,
10639
.fill_link_info = bpf_xdp_link_fill_link_info,
10640
.update_prog = bpf_xdp_link_update,
10641
};
10642
10643
int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
10644
{
10645
struct net *net = current->nsproxy->net_ns;
10646
struct bpf_link_primer link_primer;
10647
struct netlink_ext_ack extack = {};
10648
struct bpf_xdp_link *link;
10649
struct net_device *dev;
10650
int err, fd;
10651
10652
rtnl_lock();
10653
dev = dev_get_by_index(net, attr->link_create.target_ifindex);
10654
if (!dev) {
10655
rtnl_unlock();
10656
return -EINVAL;
10657
}
10658
10659
link = kzalloc(sizeof(*link), GFP_USER);
10660
if (!link) {
10661
err = -ENOMEM;
10662
goto unlock;
10663
}
10664
10665
bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog,
10666
attr->link_create.attach_type);
10667
link->dev = dev;
10668
link->flags = attr->link_create.flags;
10669
10670
err = bpf_link_prime(&link->link, &link_primer);
10671
if (err) {
10672
kfree(link);
10673
goto unlock;
10674
}
10675
10676
netdev_lock_ops(dev);
10677
err = dev_xdp_attach_link(dev, &extack, link);
10678
netdev_unlock_ops(dev);
10679
rtnl_unlock();
10680
10681
if (err) {
10682
link->dev = NULL;
10683
bpf_link_cleanup(&link_primer);
10684
trace_bpf_xdp_link_attach_failed(extack._msg);
10685
goto out_put_dev;
10686
}
10687
10688
fd = bpf_link_settle(&link_primer);
10689
/* the link itself doesn't hold dev's refcnt, to keep shutdown simple */
10690
dev_put(dev);
10691
return fd;
10692
10693
unlock:
10694
rtnl_unlock();
10695
10696
out_put_dev:
10697
dev_put(dev);
10698
return err;
10699
}
10700
10701
/**
10702
* dev_change_xdp_fd - set or clear a bpf program for a device rx path
10703
* @dev: device
10704
* @extack: netlink extended ack
10705
* @fd: new program fd or negative value to clear
10706
* @expected_fd: old program fd that userspace expects to replace or clear
10707
* @flags: xdp-related flags
10708
*
10709
* Set or clear a bpf program for a device
10710
*/
10711
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
10712
int fd, int expected_fd, u32 flags)
10713
{
10714
enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
10715
struct bpf_prog *new_prog = NULL, *old_prog = NULL;
10716
int err;
10717
10718
ASSERT_RTNL();
10719
10720
if (fd >= 0) {
10721
new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
10722
mode != XDP_MODE_SKB);
10723
if (IS_ERR(new_prog))
10724
return PTR_ERR(new_prog);
10725
}
10726
10727
if (expected_fd >= 0) {
10728
old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
10729
mode != XDP_MODE_SKB);
10730
if (IS_ERR(old_prog)) {
10731
err = PTR_ERR(old_prog);
10732
old_prog = NULL;
10733
goto err_out;
10734
}
10735
}
10736
10737
err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
10738
10739
err_out:
10740
if (err && new_prog)
10741
bpf_prog_put(new_prog);
10742
if (old_prog)
10743
bpf_prog_put(old_prog);
10744
return err;
10745
}
10746
10747
u32 dev_get_min_mp_channel_count(const struct net_device *dev)
10748
{
10749
int i;
10750
10751
netdev_ops_assert_locked(dev);
10752
10753
for (i = dev->real_num_rx_queues - 1; i >= 0; i--)
10754
if (dev->_rx[i].mp_params.mp_priv)
10755
/* The channel count is the idx plus 1. */
10756
return i + 1;
10757
10758
return 0;
10759
}
10760
10761
/**
10762
* dev_index_reserve() - allocate an ifindex in a namespace
10763
* @net: the applicable net namespace
10764
* @ifindex: requested ifindex, pass %0 to get one allocated
10765
*
10766
* Allocate an ifindex for a new device. The caller must either use the ifindex
10767
* to store the device (via list_netdevice()) or call dev_index_release()
10768
* to give the index up.
10769
*
10770
* Return: a suitable unique value for a new device interface number or -errno.
10771
*/
10772
static int dev_index_reserve(struct net *net, u32 ifindex)
10773
{
10774
int err;
10775
10776
if (ifindex > INT_MAX) {
10777
DEBUG_NET_WARN_ON_ONCE(1);
10778
return -EINVAL;
10779
}
10780
10781
if (!ifindex)
10782
err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
10783
xa_limit_31b, &net->ifindex, GFP_KERNEL);
10784
else
10785
err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
10786
if (err < 0)
10787
return err;
10788
10789
return ifindex;
10790
}
10791
10792
static void dev_index_release(struct net *net, int ifindex)
10793
{
10794
/* Expect only unused indexes, unlist_netdevice() removes the used */
10795
WARN_ON(xa_erase(&net->dev_by_index, ifindex));
10796
}
10797
10798
static bool from_cleanup_net(void)
10799
{
10800
#ifdef CONFIG_NET_NS
10801
return current == READ_ONCE(cleanup_net_task);
10802
#else
10803
return false;
10804
#endif
10805
}
10806
10807
/* Delayed registration/unregistration */
10808
LIST_HEAD(net_todo_list);
10809
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
10810
atomic_t dev_unreg_count = ATOMIC_INIT(0);
10811
10812
static void net_set_todo(struct net_device *dev)
10813
{
10814
list_add_tail(&dev->todo_list, &net_todo_list);
10815
}
10816
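/* Drop NETIF_F_UPPER_DISABLES features from @features when an upper
 * device (e.g. a bonding master or bridge) has them disabled.
 */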
10817
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
10818
struct net_device *upper, netdev_features_t features)
10819
{
10820
netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
10821
netdev_features_t feature;
10822
int feature_bit;
10823
10824
for_each_netdev_feature(upper_disables, feature_bit) {
10825
feature = __NETIF_F_BIT(feature_bit);
10826
if (!(upper->wanted_features & feature)
10827
&& (features & feature)) {
10828
netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
10829
&feature, upper->name);
10830
features &= ~feature;
10831
}
10832
}
10833
10834
return features;
10835
}
10836
10837
static void netdev_sync_lower_features(struct net_device *upper,
10838
struct net_device *lower, netdev_features_t features)
10839
{
10840
netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
10841
netdev_features_t feature;
10842
int feature_bit;
10843
10844
for_each_netdev_feature(upper_disables, feature_bit) {
10845
feature = __NETIF_F_BIT(feature_bit);
10846
if (!(features & feature) && (lower->features & feature)) {
10847
netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
10848
&feature, lower->name);
10849
netdev_lock_ops(lower);
10850
lower->wanted_features &= ~feature;
10851
__netdev_update_features(lower);
10852
10853
if (unlikely(lower->features & feature))
10854
netdev_WARN(upper, "failed to disable %pNF on %s!\n",
10855
&feature, lower->name);
10856
else
10857
netdev_features_change(lower);
10858
netdev_unlock_ops(lower);
10859
}
10860
}
10861
}
10862
10863
static bool netdev_has_ip_or_hw_csum(netdev_features_t features)
10864
{
10865
netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
10866
bool ip_csum = (features & ip_csum_mask) == ip_csum_mask;
10867
bool hw_csum = features & NETIF_F_HW_CSUM;
10868
10869
return ip_csum || hw_csum;
10870
}
10871
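/* Resolve invalid or mutually exclusive feature combinations (checksum
 * vs. TSO, GSO vs. SG, RX-FCS vs. LRO/HW-GRO, ...) before the feature
 * set is applied to the device.
 */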
10872
static netdev_features_t netdev_fix_features(struct net_device *dev,
10873
netdev_features_t features)
10874
{
10875
/* Fix illegal checksum combinations */
10876
if ((features & NETIF_F_HW_CSUM) &&
10877
(features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
10878
netdev_warn(dev, "mixed HW and IP checksum settings.\n");
10879
features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
10880
}
10881
10882
/* TSO requires that SG is present as well. */
10883
if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
10884
netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
10885
features &= ~NETIF_F_ALL_TSO;
10886
}
10887
10888
if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
10889
!(features & NETIF_F_IP_CSUM)) {
10890
netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
10891
features &= ~NETIF_F_TSO;
10892
features &= ~NETIF_F_TSO_ECN;
10893
}
10894
10895
if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
10896
!(features & NETIF_F_IPV6_CSUM)) {
10897
netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
10898
features &= ~NETIF_F_TSO6;
10899
}
10900
10901
/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
10902
if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
10903
features &= ~NETIF_F_TSO_MANGLEID;
10904
10905
/* TSO ECN requires that TSO is present as well. */
10906
if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
10907
features &= ~NETIF_F_TSO_ECN;
10908
10909
/* Software GSO depends on SG. */
10910
if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
10911
netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
10912
features &= ~NETIF_F_GSO;
10913
}
10914
10915
/* GSO partial features require GSO partial be set */
10916
if ((features & dev->gso_partial_features) &&
10917
!(features & NETIF_F_GSO_PARTIAL)) {
10918
netdev_dbg(dev,
10919
"Dropping partially supported GSO features since no GSO partial.\n");
10920
features &= ~dev->gso_partial_features;
10921
}
10922
10923
if (!(features & NETIF_F_RXCSUM)) {
10924
/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
10925
* successfully merged by hardware must also have the
10926
* checksum verified by hardware. If the user does not
10927
* want to enable RXCSUM, logically, we should disable GRO_HW.
10928
*/
10929
if (features & NETIF_F_GRO_HW) {
10930
netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
10931
features &= ~NETIF_F_GRO_HW;
10932
}
10933
}
10934
10935
/* LRO/HW-GRO features cannot be combined with RX-FCS */
10936
if (features & NETIF_F_RXFCS) {
10937
if (features & NETIF_F_LRO) {
10938
netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
10939
features &= ~NETIF_F_LRO;
10940
}
10941
10942
if (features & NETIF_F_GRO_HW) {
10943
netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
10944
features &= ~NETIF_F_GRO_HW;
10945
}
10946
}
10947
10948
if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
10949
netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
10950
features &= ~NETIF_F_LRO;
10951
}
10952
10953
if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) {
10954
netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
10955
features &= ~NETIF_F_HW_TLS_TX;
10956
}
10957
10958
if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
10959
netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
10960
features &= ~NETIF_F_HW_TLS_RX;
10961
}
10962
10963
if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) {
10964
netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
10965
features &= ~NETIF_F_GSO_UDP_L4;
10966
}
10967
10968
return features;
10969
}
10970
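/* Recompute dev->features from wanted_features, driver constraints and
 * the upper/lower device hierarchy, push the result to the driver via
 * ndo_set_features() and return nonzero when the caller should emit a
 * features-change notification.
 */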
10971
int __netdev_update_features(struct net_device *dev)
10972
{
10973
struct net_device *upper, *lower;
10974
netdev_features_t features;
10975
struct list_head *iter;
10976
int err = -1;
10977
10978
ASSERT_RTNL();
10979
netdev_ops_assert_locked(dev);
10980
10981
features = netdev_get_wanted_features(dev);
10982
10983
if (dev->netdev_ops->ndo_fix_features)
10984
features = dev->netdev_ops->ndo_fix_features(dev, features);
10985
10986
/* driver might be less strict about feature dependencies */
10987
features = netdev_fix_features(dev, features);
10988
10989
/* some features can't be enabled if they're off on an upper device */
10990
netdev_for_each_upper_dev_rcu(dev, upper, iter)
10991
features = netdev_sync_upper_features(dev, upper, features);
10992
10993
if (dev->features == features)
10994
goto sync_lower;
10995
10996
netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
10997
&dev->features, &features);
10998
10999
if (dev->netdev_ops->ndo_set_features)
11000
err = dev->netdev_ops->ndo_set_features(dev, features);
11001
else
11002
err = 0;
11003
11004
if (unlikely(err < 0)) {
11005
netdev_err(dev,
11006
"set_features() failed (%d); wanted %pNF, left %pNF\n",
11007
err, &features, &dev->features);
11008
/* return non-0 since some features might have changed and
11009
* it's better to fire a spurious notification than miss it
11010
*/
11011
return -1;
11012
}
11013
11014
sync_lower:
11015
/* some features must be disabled on lower devices when disabled
11016
* on an upper device (think: bonding master or bridge)
11017
*/
11018
netdev_for_each_lower_dev(dev, lower, iter)
11019
netdev_sync_lower_features(dev, lower, features);
11020
11021
if (!err) {
11022
netdev_features_t diff = features ^ dev->features;
11023
11024
if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
11025
/* udp_tunnel_{get,drop}_rx_info both need
11026
* NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
11027
* device, or they won't do anything.
11028
* Thus we need to update dev->features
11029
* *before* calling udp_tunnel_get_rx_info,
11030
* but *after* calling udp_tunnel_drop_rx_info.
11031
*/
11032
udp_tunnel_nic_lock(dev);
11033
if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
11034
dev->features = features;
11035
udp_tunnel_get_rx_info(dev);
11036
} else {
11037
udp_tunnel_drop_rx_info(dev);
11038
}
11039
udp_tunnel_nic_unlock(dev);
11040
}
11041
11042
if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
11043
if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
11044
dev->features = features;
11045
err |= vlan_get_rx_ctag_filter_info(dev);
11046
} else {
11047
vlan_drop_rx_ctag_filter_info(dev);
11048
}
11049
}
11050
11051
if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
11052
if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
11053
dev->features = features;
11054
err |= vlan_get_rx_stag_filter_info(dev);
11055
} else {
11056
vlan_drop_rx_stag_filter_info(dev);
11057
}
11058
}
11059
11060
dev->features = features;
11061
}
11062
11063
return err < 0 ? 0 : 1;
11064
}
11065
11066
/**
11067
* netdev_update_features - recalculate device features
11068
* @dev: the device to check
11069
*
11070
* Recalculate dev->features set and send notifications if it
11071
* has changed. Should be called after driver or hardware dependent
11072
* conditions might have changed that influence the features.
11073
*/
11074
void netdev_update_features(struct net_device *dev)
11075
{
11076
if (__netdev_update_features(dev))
11077
netdev_features_change(dev);
11078
}
11079
EXPORT_SYMBOL(netdev_update_features);
11080
11081
/**
11082
* netdev_change_features - recalculate device features
11083
* @dev: the device to check
11084
*
11085
* Recalculate dev->features set and send notifications even
11086
* if they have not changed. Should be called instead of
11087
* netdev_update_features() if also dev->vlan_features might
11088
* have changed to allow the changes to be propagated to stacked
11089
* VLAN devices.
11090
*/
11091
void netdev_change_features(struct net_device *dev)
11092
{
11093
__netdev_update_features(dev);
11094
netdev_features_change(dev);
11095
}
11096
EXPORT_SYMBOL(netdev_change_features);
11097
11098
/**
11099
* netif_stacked_transfer_operstate - transfer operstate
11100
* @rootdev: the root or lower level device to transfer state from
11101
* @dev: the device to transfer operstate to
11102
*
11103
* Transfer operational state from root to device. This is normally
11104
* called when a stacking relationship exists between the root
11105
* device and the device (a leaf device).
11106
*/
11107
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
11108
struct net_device *dev)
11109
{
11110
if (rootdev->operstate == IF_OPER_DORMANT)
11111
netif_dormant_on(dev);
11112
else
11113
netif_dormant_off(dev);
11114
11115
if (rootdev->operstate == IF_OPER_TESTING)
11116
netif_testing_on(dev);
11117
else
11118
netif_testing_off(dev);
11119
11120
if (netif_carrier_ok(rootdev))
11121
netif_carrier_on(dev);
11122
else
11123
netif_carrier_off(dev);
11124
}
11125
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
11126
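/* Allocate dev->_rx and register an xdp_rxq_info for every RX queue,
 * unwinding the already-registered queues if a registration fails.
 */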
11127
static int netif_alloc_rx_queues(struct net_device *dev)
11128
{
11129
unsigned int i, count = dev->num_rx_queues;
11130
struct netdev_rx_queue *rx;
11131
size_t sz = count * sizeof(*rx);
11132
int err = 0;
11133
11134
BUG_ON(count < 1);
11135
11136
rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
11137
if (!rx)
11138
return -ENOMEM;
11139
11140
dev->_rx = rx;
11141
11142
for (i = 0; i < count; i++) {
11143
rx[i].dev = dev;
11144
11145
/* XDP RX-queue setup */
11146
err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
11147
if (err < 0)
11148
goto err_rxq_info;
11149
}
11150
return 0;
11151
11152
err_rxq_info:
11153
/* Rollback successful reg's and free other resources */
11154
while (i--)
11155
xdp_rxq_info_unreg(&rx[i].xdp_rxq);
11156
kvfree(dev->_rx);
11157
dev->_rx = NULL;
11158
return err;
11159
}
11160
11161
static void netif_free_rx_queues(struct net_device *dev)
11162
{
11163
unsigned int i, count = dev->num_rx_queues;
11164
11165
/* netif_alloc_rx_queues() failed; its error path already unregistered and freed everything */
11166
if (!dev->_rx)
11167
return;
11168
11169
for (i = 0; i < count; i++)
11170
xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
11171
11172
kvfree(dev->_rx);
11173
}
11174
11175
static void netdev_init_one_queue(struct net_device *dev,
11176
struct netdev_queue *queue, void *_unused)
11177
{
11178
/* Initialize queue lock */
11179
spin_lock_init(&queue->_xmit_lock);
11180
netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
11181
queue->xmit_lock_owner = -1;
11182
netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
11183
queue->dev = dev;
11184
#ifdef CONFIG_BQL
11185
dql_init(&queue->dql, HZ);
11186
#endif
11187
}
11188
11189
static void netif_free_tx_queues(struct net_device *dev)
11190
{
11191
kvfree(dev->_tx);
11192
}
11193
11194
static int netif_alloc_netdev_queues(struct net_device *dev)
11195
{
11196
unsigned int count = dev->num_tx_queues;
11197
struct netdev_queue *tx;
11198
size_t sz = count * sizeof(*tx);
11199
11200
if (count < 1 || count > 0xffff)
11201
return -EINVAL;
11202
11203
tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
11204
if (!tx)
11205
return -ENOMEM;
11206
11207
dev->_tx = tx;
11208
11209
netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
11210
spin_lock_init(&dev->tx_global_lock);
11211
11212
return 0;
11213
}
11214
11215
void netif_tx_stop_all_queues(struct net_device *dev)
11216
{
11217
unsigned int i;
11218
11219
for (i = 0; i < dev->num_tx_queues; i++) {
11220
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
11221
11222
netif_tx_stop_queue(txq);
11223
}
11224
}
11225
EXPORT_SYMBOL(netif_tx_stop_all_queues);
11226
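/* Allocate the per-CPU statistics block selected by dev->pcpu_stat_type
 * (lstats, tstats or dstats) during register_netdevice().
 */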
11227
static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
11228
{
11229
void __percpu *v;
11230
11231
/* Drivers implementing ndo_get_peer_dev must support tstat
11232
* accounting, so that skb_do_redirect() can bump the dev's
11233
* RX stats upon network namespace switch.
11234
*/
11235
if (dev->netdev_ops->ndo_get_peer_dev &&
11236
dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
11237
return -EOPNOTSUPP;
11238
11239
switch (dev->pcpu_stat_type) {
11240
case NETDEV_PCPU_STAT_NONE:
11241
return 0;
11242
case NETDEV_PCPU_STAT_LSTATS:
11243
v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
11244
break;
11245
case NETDEV_PCPU_STAT_TSTATS:
11246
v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
11247
break;
11248
case NETDEV_PCPU_STAT_DSTATS:
11249
v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
11250
break;
11251
default:
11252
return -EINVAL;
11253
}
11254
11255
return v ? 0 : -ENOMEM;
11256
}
11257
11258
static void netdev_do_free_pcpu_stats(struct net_device *dev)
11259
{
11260
switch (dev->pcpu_stat_type) {
11261
case NETDEV_PCPU_STAT_NONE:
11262
return;
11263
case NETDEV_PCPU_STAT_LSTATS:
11264
free_percpu(dev->lstats);
11265
break;
11266
case NETDEV_PCPU_STAT_TSTATS:
11267
free_percpu(dev->tstats);
11268
break;
11269
case NETDEV_PCPU_STAT_DSTATS:
11270
free_percpu(dev->dstats);
11271
break;
11272
}
11273
}
11274
11275
static void netdev_free_phy_link_topology(struct net_device *dev)
11276
{
11277
struct phy_link_topology *topo = dev->link_topo;
11278
11279
if (IS_ENABLED(CONFIG_PHYLIB) && topo) {
11280
xa_destroy(&topo->phys);
11281
kfree(topo);
11282
dev->link_topo = NULL;
11283
}
11284
}
11285
11286
/**
11287
* register_netdevice() - register a network device
11288
* @dev: device to register
11289
*
11290
* Take a prepared network device structure and make it externally accessible.
11291
* A %NETDEV_REGISTER message is sent to the netdev notifier chain.
11292
* Callers must hold the rtnl lock - you may want register_netdev()
11293
* instead of this.
11294
*/
11295
int register_netdevice(struct net_device *dev)
11296
{
11297
int ret;
11298
struct net *net = dev_net(dev);
11299
11300
BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
11301
NETDEV_FEATURE_COUNT);
11302
BUG_ON(dev_boot_phase);
11303
ASSERT_RTNL();
11304
11305
might_sleep();
11306
11307
/* When net_devices are persistent, this will be fatal. */
11308
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
11309
BUG_ON(!net);
11310
11311
ret = ethtool_check_ops(dev->ethtool_ops);
11312
if (ret)
11313
return ret;
11314
11315
/* rss ctx ID 0 is reserved for the default context, start from 1 */
11316
xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1);
11317
mutex_init(&dev->ethtool->rss_lock);
11318
11319
spin_lock_init(&dev->addr_list_lock);
11320
netdev_set_addr_lockdep_class(dev);
11321
11322
ret = dev_get_valid_name(net, dev, dev->name);
11323
if (ret < 0)
11324
goto out;
11325
11326
ret = -ENOMEM;
11327
dev->name_node = netdev_name_node_head_alloc(dev);
11328
if (!dev->name_node)
11329
goto out;
11330
11331
/* Init, if this function is available */
11332
if (dev->netdev_ops->ndo_init) {
11333
ret = dev->netdev_ops->ndo_init(dev);
11334
if (ret) {
11335
if (ret > 0)
11336
ret = -EIO;
11337
goto err_free_name;
11338
}
11339
}
11340
11341
if (((dev->hw_features | dev->features) &
11342
NETIF_F_HW_VLAN_CTAG_FILTER) &&
11343
(!dev->netdev_ops->ndo_vlan_rx_add_vid ||
11344
!dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
11345
netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
11346
ret = -EINVAL;
11347
goto err_uninit;
11348
}
11349
11350
ret = netdev_do_alloc_pcpu_stats(dev);
11351
if (ret)
11352
goto err_uninit;
11353
11354
ret = dev_index_reserve(net, dev->ifindex);
11355
if (ret < 0)
11356
goto err_free_pcpu;
11357
dev->ifindex = ret;
11358
11359
/* Transfer changeable features to wanted_features and enable
11360
* software offloads (GSO and GRO).
11361
*/
11362
dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
11363
dev->features |= NETIF_F_SOFT_FEATURES;
11364
11365
if (dev->udp_tunnel_nic_info) {
11366
dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
11367
dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
11368
}
11369
11370
dev->wanted_features = dev->features & dev->hw_features;
11371
11372
if (!(dev->flags & IFF_LOOPBACK))
11373
dev->hw_features |= NETIF_F_NOCACHE_COPY;
11374
11375
/* If IPv4 TCP segmentation offload is supported we should also
11376
* allow the device to enable segmenting the frame with the option
11377
* of ignoring a static IP ID value. This doesn't enable the
11378
* feature itself but allows the user to enable it later.
11379
*/
11380
if (dev->hw_features & NETIF_F_TSO)
11381
dev->hw_features |= NETIF_F_TSO_MANGLEID;
11382
if (dev->vlan_features & NETIF_F_TSO)
11383
dev->vlan_features |= NETIF_F_TSO_MANGLEID;
11384
if (dev->mpls_features & NETIF_F_TSO)
11385
dev->mpls_features |= NETIF_F_TSO_MANGLEID;
11386
if (dev->hw_enc_features & NETIF_F_TSO)
11387
dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
11388
11389
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
11390
*/
11391
dev->vlan_features |= NETIF_F_HIGHDMA;
11392
11393
/* Make NETIF_F_SG inheritable to tunnel devices.
11394
*/
11395
dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
11396
11397
/* Make NETIF_F_SG inheritable to MPLS.
11398
*/
11399
dev->mpls_features |= NETIF_F_SG;
11400
11401
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
11402
ret = notifier_to_errno(ret);
11403
if (ret)
11404
goto err_ifindex_release;
11405
11406
ret = netdev_register_kobject(dev);
11407
11408
netdev_lock(dev);
11409
WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
11410
netdev_unlock(dev);
11411
11412
if (ret)
11413
goto err_uninit_notify;
11414
11415
netdev_lock_ops(dev);
11416
__netdev_update_features(dev);
11417
netdev_unlock_ops(dev);
11418
11419
/*
11420
* Default initial state at registry is that the
11421
* device is present.
11422
*/
11423
11424
set_bit(__LINK_STATE_PRESENT, &dev->state);
11425
11426
linkwatch_init_dev(dev);
11427
11428
dev_init_scheduler(dev);
11429
11430
netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
11431
list_netdevice(dev);
11432
11433
add_device_randomness(dev->dev_addr, dev->addr_len);
11434
11435
/* If the device has a permanent device address, the driver should
11436
* set dev_addr and also addr_assign_type should be set to
11437
* NET_ADDR_PERM (default value).
11438
*/
11439
if (dev->addr_assign_type == NET_ADDR_PERM)
11440
memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
11441
11442
/* Notify protocols that a new device appeared. */
11443
netdev_lock_ops(dev);
11444
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
11445
netdev_unlock_ops(dev);
11446
ret = notifier_to_errno(ret);
11447
if (ret) {
11448
/* Expect explicit free_netdev() on failure */
11449
dev->needs_free_netdev = false;
11450
unregister_netdevice_queue(dev, NULL);
11451
goto out;
11452
}
11453
/*
11454
* Prevent userspace races by waiting until the network
11455
* device is fully setup before sending notifications.
11456
*/
11457
if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
11458
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
11459
11460
out:
11461
return ret;
11462
11463
err_uninit_notify:
11464
call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
11465
err_ifindex_release:
11466
dev_index_release(net, dev->ifindex);
11467
err_free_pcpu:
11468
netdev_do_free_pcpu_stats(dev);
11469
err_uninit:
11470
if (dev->netdev_ops->ndo_uninit)
11471
dev->netdev_ops->ndo_uninit(dev);
11472
if (dev->priv_destructor)
11473
dev->priv_destructor(dev);
11474
err_free_name:
11475
netdev_name_node_free(dev->name_node);
11476
goto out;
11477
}
11478
EXPORT_SYMBOL(register_netdevice);
11479
11480
/* Initialize the core of a dummy net device.
11481
* The setup steps that dummy netdevs need, which normal netdevs get by going
11482
* through register_netdevice().
11483
*/
11484
static void init_dummy_netdev(struct net_device *dev)
11485
{
11486
/* make sure we BUG if trying to hit standard
11487
* register/unregister code path
11488
*/
11489
dev->reg_state = NETREG_DUMMY;
11490
11491
/* a dummy interface is started by default */
11492
set_bit(__LINK_STATE_PRESENT, &dev->state);
11493
set_bit(__LINK_STATE_START, &dev->state);
11494
11495
/* Note: We don't allocate pcpu_refcnt for dummy devices,
11496
* because users of this 'device' don't need to change
11497
* its refcount.
11498
*/
11499
}
11500
11501
/**
11502
* register_netdev - register a network device
11503
* @dev: device to register
11504
*
11505
* Take a completed network device structure and add it to the kernel
11506
* interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
11507
* chain. 0 is returned on success. A negative errno code is returned
11508
* on a failure to set up the device, or if the name is a duplicate.
11509
*
11510
* This is a wrapper around register_netdevice that takes the rtnl semaphore
11511
* and expands the device name if you passed a format string to
11512
* alloc_netdev.
11513
*/
11514
int register_netdev(struct net_device *dev)
11515
{
11516
struct net *net = dev_net(dev);
11517
int err;
11518
11519
if (rtnl_net_lock_killable(net))
11520
return -EINTR;
11521
11522
err = register_netdevice(dev);
11523
11524
rtnl_net_unlock(net);
11525
11526
return err;
11527
}
11528
EXPORT_SYMBOL(register_netdev);
11529
11530
int netdev_refcnt_read(const struct net_device *dev)
11531
{
11532
#ifdef CONFIG_PCPU_DEV_REFCNT
11533
int i, refcnt = 0;
11534
11535
for_each_possible_cpu(i)
11536
refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
11537
return refcnt;
11538
#else
11539
return refcount_read(&dev->dev_refcnt);
11540
#endif
11541
}
11542
EXPORT_SYMBOL(netdev_refcnt_read);
11543
11544
int netdev_unregister_timeout_secs __read_mostly = 10;
11545
11546
#define WAIT_REFS_MIN_MSECS 1
11547
#define WAIT_REFS_MAX_MSECS 250
11548
/**
11549
* netdev_wait_allrefs_any - wait until all references are gone.
11550
* @list: list of net_devices to wait on
11551
*
11552
* This is called when unregistering network devices.
11553
*
11554
* Any protocol or device that holds a reference should register
11555
* for netdevice notification, and cleanup and put back the
11556
* reference if they receive an UNREGISTER event.
11557
* We can get stuck here if buggy protocols don't correctly
11558
* call dev_put.
11559
*/
11560
static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
11561
{
11562
unsigned long rebroadcast_time, warning_time;
11563
struct net_device *dev;
11564
int wait = 0;
11565
11566
rebroadcast_time = warning_time = jiffies;
11567
11568
list_for_each_entry(dev, list, todo_list)
11569
if (netdev_refcnt_read(dev) == 1)
11570
return dev;
11571
11572
while (true) {
11573
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
11574
rtnl_lock();
11575
11576
/* Rebroadcast unregister notification */
11577
list_for_each_entry(dev, list, todo_list)
11578
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11579
11580
__rtnl_unlock();
11581
rcu_barrier();
11582
rtnl_lock();
11583
11584
list_for_each_entry(dev, list, todo_list)
11585
if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
11586
&dev->state)) {
11587
/* We must not have linkwatch events
11588
* pending on unregister. If this
11589
* happens, we simply run the queue
11590
* unscheduled, resulting in a noop
11591
* for this device.
11592
*/
11593
linkwatch_run_queue();
11594
break;
11595
}
11596
11597
__rtnl_unlock();
11598
11599
rebroadcast_time = jiffies;
11600
}
11601
11602
rcu_barrier();
11603
11604
if (!wait) {
11605
wait = WAIT_REFS_MIN_MSECS;
11606
} else {
11607
msleep(wait);
11608
wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
11609
}
11610
11611
list_for_each_entry(dev, list, todo_list)
11612
if (netdev_refcnt_read(dev) == 1)
11613
return dev;
11614
11615
if (time_after(jiffies, warning_time +
11616
READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
11617
list_for_each_entry(dev, list, todo_list) {
11618
pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
11619
dev->name, netdev_refcnt_read(dev));
11620
ref_tracker_dir_print(&dev->refcnt_tracker, 10);
11621
}
11622
11623
warning_time = jiffies;
11624
}
11625
}
11626
}
11627
11628
/* The sequence is:
11629
*
11630
* rtnl_lock();
11631
* ...
11632
* register_netdevice(x1);
11633
* register_netdevice(x2);
11634
* ...
11635
* unregister_netdevice(y1);
11636
* unregister_netdevice(y2);
11637
* ...
11638
* rtnl_unlock();
11639
* free_netdev(y1);
11640
* free_netdev(y2);
11641
*
11642
* We are invoked by rtnl_unlock().
11643
* This allows us to deal with problems:
11644
* 1) We can delete sysfs objects which invoke hotplug
11645
* without deadlocking with linkwatch via keventd.
11646
* 2) Since we run with the RTNL semaphore not held, we can sleep
11647
* safely in order to wait for the netdev refcnt to drop to zero.
11648
*
11649
* We must not return until all unregister events added during
11650
* the interval the lock was held have been completed.
11651
*/
11652
void netdev_run_todo(void)
11653
{
11654
struct net_device *dev, *tmp;
11655
struct list_head list;
11656
int cnt;
11657
#ifdef CONFIG_LOCKDEP
11658
struct list_head unlink_list;
11659
11660
list_replace_init(&net_unlink_list, &unlink_list);
11661
11662
while (!list_empty(&unlink_list)) {
11663
dev = list_first_entry(&unlink_list, struct net_device,
11664
unlink_list);
11665
list_del_init(&dev->unlink_list);
11666
dev->nested_level = dev->lower_level - 1;
11667
}
11668
#endif
11669
11670
/* Snapshot list, allow later requests */
11671
list_replace_init(&net_todo_list, &list);
11672
11673
__rtnl_unlock();
11674
11675
/* Wait for rcu callbacks to finish before next phase */
11676
if (!list_empty(&list))
11677
rcu_barrier();
11678
11679
list_for_each_entry_safe(dev, tmp, &list, todo_list) {
11680
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
11681
netdev_WARN(dev, "run_todo but not unregistering\n");
11682
list_del(&dev->todo_list);
11683
continue;
11684
}
11685
11686
netdev_lock(dev);
11687
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
11688
netdev_unlock(dev);
11689
linkwatch_sync_dev(dev);
11690
}
11691
11692
cnt = 0;
11693
while (!list_empty(&list)) {
11694
dev = netdev_wait_allrefs_any(&list);
11695
list_del(&dev->todo_list);
11696
11697
/* paranoia */
11698
BUG_ON(netdev_refcnt_read(dev) != 1);
11699
BUG_ON(!list_empty(&dev->ptype_all));
11700
BUG_ON(!list_empty(&dev->ptype_specific));
11701
WARN_ON(rcu_access_pointer(dev->ip_ptr));
11702
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
11703
11704
netdev_do_free_pcpu_stats(dev);
11705
if (dev->priv_destructor)
11706
dev->priv_destructor(dev);
11707
if (dev->needs_free_netdev)
11708
free_netdev(dev);
11709
11710
cnt++;
11711
11712
/* Free network device */
11713
kobject_put(&dev->dev.kobj);
11714
}
11715
if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
11716
wake_up(&netdev_unregistering_wq);
11717
}
11718
11719
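/*
 * A minimal sketch of the caller-side sequence documented above
 * netdev_run_todo(), with hypothetical devices x and y (not taken from
 * this file):
 *
 *	rtnl_lock();
 *	err = register_netdevice(x);
 *	unregister_netdevice(y);
 *	rtnl_unlock();
 *	free_netdev(y);
 *
 * rtnl_unlock() ends up running netdev_run_todo(), which waits for y's
 * refcount to drop and moves it to NETREG_UNREGISTERED, after which
 * free_netdev(y) is safe.
 */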
/* Collate per-cpu network dstats statistics
11720
*
11721
* Read per-cpu network statistics from dev->dstats and populate the related
11722
* fields in @s.
11723
*/
11724
static void dev_fetch_dstats(struct rtnl_link_stats64 *s,
11725
const struct pcpu_dstats __percpu *dstats)
11726
{
11727
int cpu;
11728
11729
for_each_possible_cpu(cpu) {
11730
u64 rx_packets, rx_bytes, rx_drops;
11731
u64 tx_packets, tx_bytes, tx_drops;
11732
const struct pcpu_dstats *stats;
11733
unsigned int start;
11734
11735
stats = per_cpu_ptr(dstats, cpu);
11736
do {
11737
start = u64_stats_fetch_begin(&stats->syncp);
11738
rx_packets = u64_stats_read(&stats->rx_packets);
11739
rx_bytes = u64_stats_read(&stats->rx_bytes);
11740
rx_drops = u64_stats_read(&stats->rx_drops);
11741
tx_packets = u64_stats_read(&stats->tx_packets);
11742
tx_bytes = u64_stats_read(&stats->tx_bytes);
11743
tx_drops = u64_stats_read(&stats->tx_drops);
11744
} while (u64_stats_fetch_retry(&stats->syncp, start));
11745
11746
s->rx_packets += rx_packets;
11747
s->rx_bytes += rx_bytes;
11748
s->rx_dropped += rx_drops;
11749
s->tx_packets += tx_packets;
11750
s->tx_bytes += tx_bytes;
11751
s->tx_dropped += tx_drops;
11752
}
11753
}
11754
11755
/* ndo_get_stats64 implementation for dstats-based accounting.
11756
*
11757
* Populate @s from dev->stats and dev->dstats. This is used internally by the
11758
* core for NETDEV_PCPU_STAT_DSTAT-type stats collection.
11759
*/
11760
static void dev_get_dstats64(const struct net_device *dev,
11761
struct rtnl_link_stats64 *s)
11762
{
11763
netdev_stats_to_stats64(s, &dev->stats);
11764
dev_fetch_dstats(s, dev->dstats);
11765
}
11766
11767
/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
11768
* all the same fields in the same order as net_device_stats, with only
11769
* the type differing, but rtnl_link_stats64 may have additional fields
11770
* at the end for newer counters.
11771
*/
11772
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
11773
const struct net_device_stats *netdev_stats)
11774
{
11775
size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
11776
const atomic_long_t *src = (atomic_long_t *)netdev_stats;
11777
u64 *dst = (u64 *)stats64;
11778
11779
BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
11780
for (i = 0; i < n; i++)
11781
dst[i] = (unsigned long)atomic_long_read(&src[i]);
11782
/* zero out counters that only exist in rtnl_link_stats64 */
11783
memset((char *)stats64 + n * sizeof(u64), 0,
11784
sizeof(*stats64) - n * sizeof(u64));
11785
}
11786
EXPORT_SYMBOL(netdev_stats_to_stats64);
11787
11788
static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
11789
struct net_device *dev)
11790
{
11791
struct net_device_core_stats __percpu *p;
11792
11793
p = alloc_percpu_gfp(struct net_device_core_stats,
11794
GFP_ATOMIC | __GFP_NOWARN);
11795
11796
if (p && cmpxchg(&dev->core_stats, NULL, p))
11797
free_percpu(p);
11798
11799
/* This READ_ONCE() pairs with the cmpxchg() above */
11800
return READ_ONCE(dev->core_stats);
11801
}
11802
11803
noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
11804
{
11805
/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
11806
struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
11807
unsigned long __percpu *field;
11808
11809
if (unlikely(!p)) {
11810
p = netdev_core_stats_alloc(dev);
11811
if (!p)
11812
return;
11813
}
11814
11815
field = (unsigned long __percpu *)((void __percpu *)p + offset);
11816
this_cpu_inc(*field);
11817
}
11818
EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
11819
11820
/**
11821
* dev_get_stats - get network device statistics
11822
* @dev: device to get statistics from
11823
* @storage: place to store stats
11824
*
11825
* Get network statistics from device. Return @storage.
11826
* The device driver may provide its own method by setting
11827
* dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
11828
* otherwise the internal statistics structure is used.
11829
*/
11830
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
11831
struct rtnl_link_stats64 *storage)
11832
{
11833
const struct net_device_ops *ops = dev->netdev_ops;
11834
const struct net_device_core_stats __percpu *p;
11835
11836
/*
11837
* IPv{4,6} and UDP tunnels share common stat helpers and use
11838
* different stat types (NETDEV_PCPU_STAT_TSTATS vs
11839
* NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent.
11840
*/
11841
BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
11842
offsetof(struct pcpu_dstats, rx_bytes));
11843
BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
11844
offsetof(struct pcpu_dstats, rx_packets));
11845
BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
11846
offsetof(struct pcpu_dstats, tx_bytes));
11847
BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
11848
offsetof(struct pcpu_dstats, tx_packets));
11849
11850
if (ops->ndo_get_stats64) {
11851
memset(storage, 0, sizeof(*storage));
11852
ops->ndo_get_stats64(dev, storage);
11853
} else if (ops->ndo_get_stats) {
11854
netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
11855
} else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
11856
dev_get_tstats64(dev, storage);
11857
} else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
11858
dev_get_dstats64(dev, storage);
11859
} else {
11860
netdev_stats_to_stats64(storage, &dev->stats);
11861
}
11862
11863
/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
11864
p = READ_ONCE(dev->core_stats);
11865
if (p) {
11866
const struct net_device_core_stats *core_stats;
11867
int i;
11868
11869
for_each_possible_cpu(i) {
11870
core_stats = per_cpu_ptr(p, i);
11871
storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
11872
storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
11873
storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
11874
storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
11875
}
11876
}
11877
return storage;
11878
}
11879
EXPORT_SYMBOL(dev_get_stats);
11880
11881
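/*
 * A minimal usage sketch for dev_get_stats(), assuming a hypothetical
 * caller that holds a reference on the device (e.g. obtained via
 * dev_get_by_name()):
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: rx_packets=%llu tx_packets=%llu\n",
 *		dev->name, stats.rx_packets, stats.tx_packets);
 */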
/**
11882
* dev_fetch_sw_netstats - get per-cpu network device statistics
11883
* @s: place to store stats
11884
* @netstats: per-cpu network stats to read from
11885
*
11886
* Read per-cpu network statistics and populate the related fields in @s.
11887
*/
11888
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
11889
const struct pcpu_sw_netstats __percpu *netstats)
11890
{
11891
int cpu;
11892
11893
for_each_possible_cpu(cpu) {
11894
u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
11895
const struct pcpu_sw_netstats *stats;
11896
unsigned int start;
11897
11898
stats = per_cpu_ptr(netstats, cpu);
11899
do {
11900
start = u64_stats_fetch_begin(&stats->syncp);
11901
rx_packets = u64_stats_read(&stats->rx_packets);
11902
rx_bytes = u64_stats_read(&stats->rx_bytes);
11903
tx_packets = u64_stats_read(&stats->tx_packets);
11904
tx_bytes = u64_stats_read(&stats->tx_bytes);
11905
} while (u64_stats_fetch_retry(&stats->syncp, start));
11906
11907
s->rx_packets += rx_packets;
11908
s->rx_bytes += rx_bytes;
11909
s->tx_packets += tx_packets;
11910
s->tx_bytes += tx_bytes;
11911
}
11912
}
11913
EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
11914
11915
/**
11916
* dev_get_tstats64 - ndo_get_stats64 implementation
11917
* @dev: device to get statistics from
11918
* @s: place to store stats
11919
*
11920
* Populate @s from dev->stats and dev->tstats. Can be used as
11921
* ndo_get_stats64() callback.
11922
*/
11923
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
11924
{
11925
netdev_stats_to_stats64(s, &dev->stats);
11926
dev_fetch_sw_netstats(s, dev->tstats);
11927
}
11928
EXPORT_SYMBOL_GPL(dev_get_tstats64);
11929
11930
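/*
 * A minimal sketch of a hypothetical driver relying on the core TSTATS
 * accounting: request per-cpu dev->tstats allocation, bump the counters
 * on the datapath, and reuse dev_get_tstats64() as ndo_get_stats64.
 * foo_setup and foo_netdev_ops are illustrative names only.
 *
 *	static const struct net_device_ops foo_netdev_ops = {
 *		.ndo_get_stats64 = dev_get_tstats64,
 *	};
 *
 *	static void foo_setup(struct net_device *dev)
 *	{
 *		dev->netdev_ops = &foo_netdev_ops;
 *		dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
 *	}
 *
 * and on the receive path, after accepting an skb:
 *
 *	dev_sw_netstats_rx_add(dev, skb->len);
 */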
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
11931
{
11932
struct netdev_queue *queue = dev_ingress_queue(dev);
11933
11934
#ifdef CONFIG_NET_CLS_ACT
11935
if (queue)
11936
return queue;
11937
queue = kzalloc(sizeof(*queue), GFP_KERNEL);
11938
if (!queue)
11939
return NULL;
11940
netdev_init_one_queue(dev, queue, NULL);
11941
RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
11942
RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
11943
rcu_assign_pointer(dev->ingress_queue, queue);
11944
#endif
11945
return queue;
11946
}
11947
11948
static const struct ethtool_ops default_ethtool_ops;
11949
11950
void netdev_set_default_ethtool_ops(struct net_device *dev,
11951
const struct ethtool_ops *ops)
11952
{
11953
if (dev->ethtool_ops == &default_ethtool_ops)
11954
dev->ethtool_ops = ops;
11955
}
11956
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
11957
11958
/**
11959
* netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
11960
* @dev: netdev to enable the IRQ coalescing on
11961
*
11962
* Sets a conservative default for SW IRQ coalescing. Users can use
11963
* sysfs attributes to override the default values.
11964
*/
11965
void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
11966
{
11967
WARN_ON(dev->reg_state == NETREG_REGISTERED);
11968
11969
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
11970
netdev_set_gro_flush_timeout(dev, 20000);
11971
netdev_set_defer_hard_irqs(dev, 1);
11972
}
11973
}
11974
EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
11975
11976
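/*
 * A minimal sketch of a hypothetical driver probe path: per the WARN_ON()
 * above, the helper must run after allocation but before registration.
 * struct foo_priv is illustrative only.
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	netdev_sw_irq_coalesce_default_on(dev);
 *	err = register_netdev(dev);
 */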
/**
11977
* alloc_netdev_mqs - allocate network device
11978
* @sizeof_priv: size of private data to allocate space for
11979
* @name: device name format string
11980
* @name_assign_type: origin of device name
11981
* @setup: callback to initialize device
11982
* @txqs: the number of TX subqueues to allocate
11983
* @rxqs: the number of RX subqueues to allocate
11984
*
11985
* Allocates a struct net_device with private data area for driver use
11986
* and performs basic initialization. Also allocates subqueue structs
11987
* for each queue on the device.
11988
*/
11989
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
11990
unsigned char name_assign_type,
11991
void (*setup)(struct net_device *),
11992
unsigned int txqs, unsigned int rxqs)
11993
{
11994
struct net_device *dev;
11995
size_t napi_config_sz;
11996
unsigned int maxqs;
11997
11998
BUG_ON(strlen(name) >= sizeof(dev->name));
11999
12000
if (txqs < 1) {
12001
pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
12002
return NULL;
12003
}
12004
12005
if (rxqs < 1) {
12006
pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
12007
return NULL;
12008
}
12009
12010
maxqs = max(txqs, rxqs);
12011
12012
dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
12013
GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
12014
if (!dev)
12015
return NULL;
12016
12017
dev->priv_len = sizeof_priv;
12018
12019
ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev");
12020
#ifdef CONFIG_PCPU_DEV_REFCNT
12021
dev->pcpu_refcnt = alloc_percpu(int);
12022
if (!dev->pcpu_refcnt)
12023
goto free_dev;
12024
__dev_hold(dev);
12025
#else
12026
refcount_set(&dev->dev_refcnt, 1);
12027
#endif
12028
12029
if (dev_addr_init(dev))
12030
goto free_pcpu;
12031
12032
dev_mc_init(dev);
12033
dev_uc_init(dev);
12034
12035
dev_net_set(dev, &init_net);
12036
12037
dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
12038
dev->xdp_zc_max_segs = 1;
12039
dev->gso_max_segs = GSO_MAX_SEGS;
12040
dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
12041
dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
12042
dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
12043
dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
12044
dev->tso_max_segs = TSO_MAX_SEGS;
12045
dev->upper_level = 1;
12046
dev->lower_level = 1;
12047
#ifdef CONFIG_LOCKDEP
12048
dev->nested_level = 0;
12049
INIT_LIST_HEAD(&dev->unlink_list);
12050
#endif
12051
12052
INIT_LIST_HEAD(&dev->napi_list);
12053
INIT_LIST_HEAD(&dev->unreg_list);
12054
INIT_LIST_HEAD(&dev->close_list);
12055
INIT_LIST_HEAD(&dev->link_watch_list);
12056
INIT_LIST_HEAD(&dev->adj_list.upper);
12057
INIT_LIST_HEAD(&dev->adj_list.lower);
12058
INIT_LIST_HEAD(&dev->ptype_all);
12059
INIT_LIST_HEAD(&dev->ptype_specific);
12060
INIT_LIST_HEAD(&dev->net_notifier_list);
12061
#ifdef CONFIG_NET_SCHED
12062
hash_init(dev->qdisc_hash);
12063
#endif
12064
12065
mutex_init(&dev->lock);
12066
12067
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
12068
setup(dev);
12069
12070
if (!dev->tx_queue_len) {
12071
dev->priv_flags |= IFF_NO_QUEUE;
12072
dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
12073
}
12074
12075
dev->num_tx_queues = txqs;
12076
dev->real_num_tx_queues = txqs;
12077
if (netif_alloc_netdev_queues(dev))
12078
goto free_all;
12079
12080
dev->num_rx_queues = rxqs;
12081
dev->real_num_rx_queues = rxqs;
12082
if (netif_alloc_rx_queues(dev))
12083
goto free_all;
12084
dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT);
12085
if (!dev->ethtool)
12086
goto free_all;
12087
12088
dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT);
12089
if (!dev->cfg)
12090
goto free_all;
12091
dev->cfg_pending = dev->cfg;
12092
12093
dev->num_napi_configs = maxqs;
12094
napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
12095
dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
12096
if (!dev->napi_config)
12097
goto free_all;
12098
12099
strscpy(dev->name, name);
12100
dev->name_assign_type = name_assign_type;
12101
dev->group = INIT_NETDEV_GROUP;
12102
if (!dev->ethtool_ops)
12103
dev->ethtool_ops = &default_ethtool_ops;
12104
12105
nf_hook_netdev_init(dev);
12106
12107
return dev;
12108
12109
free_all:
12110
free_netdev(dev);
12111
return NULL;
12112
12113
free_pcpu:
12114
#ifdef CONFIG_PCPU_DEV_REFCNT
12115
free_percpu(dev->pcpu_refcnt);
12116
free_dev:
12117
#endif
12118
kvfree(dev);
12119
return NULL;
12120
}
12121
EXPORT_SYMBOL(alloc_netdev_mqs);
12122
12123
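/*
 * A minimal allocation sketch for a hypothetical multi-queue Ethernet-like
 * device; struct foo_priv, the "foo%d" name template and the queue counts
 * are illustrative only.
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       NET_NAME_ENUM, ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */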
static void netdev_napi_exit(struct net_device *dev)
12124
{
12125
if (!list_empty(&dev->napi_list)) {
12126
struct napi_struct *p, *n;
12127
12128
netdev_lock(dev);
12129
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
12130
__netif_napi_del_locked(p);
12131
netdev_unlock(dev);
12132
12133
synchronize_net();
12134
}
12135
12136
kvfree(dev->napi_config);
12137
}
12138
12139
/**
12140
* free_netdev - free network device
12141
* @dev: device
12142
*
12143
* This function does the last stage of destroying an allocated device
12144
* interface. The reference to the device object is released. If this
12145
* is the last reference then it will be freed. Must be called in process
12146
* context.
12147
*/
12148
void free_netdev(struct net_device *dev)
12149
{
12150
might_sleep();
12151
12152
/* When called immediately after register_netdevice() failed, the unwind
12153
* handling may still be dismantling the device. Handle that case by
12154
* deferring the free.
12155
*/
12156
if (dev->reg_state == NETREG_UNREGISTERING) {
12157
ASSERT_RTNL();
12158
dev->needs_free_netdev = true;
12159
return;
12160
}
12161
12162
WARN_ON(dev->cfg != dev->cfg_pending);
12163
kfree(dev->cfg);
12164
kfree(dev->ethtool);
12165
netif_free_tx_queues(dev);
12166
netif_free_rx_queues(dev);
12167
12168
kfree(rcu_dereference_protected(dev->ingress_queue, 1));
12169
12170
/* Flush device addresses */
12171
dev_addr_flush(dev);
12172
12173
netdev_napi_exit(dev);
12174
12175
netif_del_cpu_rmap(dev);
12176
12177
ref_tracker_dir_exit(&dev->refcnt_tracker);
12178
#ifdef CONFIG_PCPU_DEV_REFCNT
12179
free_percpu(dev->pcpu_refcnt);
12180
dev->pcpu_refcnt = NULL;
12181
#endif
12182
free_percpu(dev->core_stats);
12183
dev->core_stats = NULL;
12184
free_percpu(dev->xdp_bulkq);
12185
dev->xdp_bulkq = NULL;
12186
12187
netdev_free_phy_link_topology(dev);
12188
12189
mutex_destroy(&dev->lock);
12190
12191
/* Compatibility with error handling in drivers */
12192
if (dev->reg_state == NETREG_UNINITIALIZED ||
12193
dev->reg_state == NETREG_DUMMY) {
12194
kvfree(dev);
12195
return;
12196
}
12197
12198
BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
12199
WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
12200
12201
/* will free via device release */
12202
put_device(&dev->dev);
12203
}
12204
EXPORT_SYMBOL(free_netdev);
12205
12206
/**
12207
* alloc_netdev_dummy - Allocate and initialize a dummy net device.
12208
* @sizeof_priv: size of private data to allocate space for
12209
*
12210
* Return: the allocated net_device on success, NULL otherwise
12211
*/
12212
struct net_device *alloc_netdev_dummy(int sizeof_priv)
12213
{
12214
return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
12215
init_dummy_netdev);
12216
}
12217
EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
12218
12219
/**
12220
* synchronize_net - Synchronize with packet receive processing
12221
*
12222
* Wait for packets currently being received to be done.
12223
* Does not block later packets from starting.
12224
*/
12225
void synchronize_net(void)
12226
{
12227
might_sleep();
12228
if (from_cleanup_net() || rtnl_is_locked())
12229
synchronize_rcu_expedited();
12230
else
12231
synchronize_rcu();
12232
}
12233
EXPORT_SYMBOL(synchronize_net);
12234
12235
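/*
 * A minimal sketch of the usual writer-side pattern, assuming a
 * hypothetical RCU-protected pointer dev->foo that receive-path readers
 * dereference under rcu_read_lock():
 *
 *	old = rtnl_dereference(dev->foo);
 *	RCU_INIT_POINTER(dev->foo, NULL);
 *	synchronize_net();
 *	kfree(old);
 *
 * After synchronize_net() returns, no packet still being received can
 * observe the old pointer, so freeing it is safe.
 */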
static void netdev_rss_contexts_free(struct net_device *dev)
12236
{
12237
struct ethtool_rxfh_context *ctx;
12238
unsigned long context;
12239
12240
mutex_lock(&dev->ethtool->rss_lock);
12241
xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
12242
xa_erase(&dev->ethtool->rss_ctx, context);
12243
dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL);
12244
kfree(ctx);
12245
}
12246
xa_destroy(&dev->ethtool->rss_ctx);
12247
mutex_unlock(&dev->ethtool->rss_lock);
12248
}
12249
12250
/**
12251
* unregister_netdevice_queue - remove device from the kernel
12252
* @dev: device
12253
* @head: list
12254
*
12255
* This function shuts down a device interface and removes it
12256
* from the kernel tables.
12257
* If @head is not NULL, the device is queued to be unregistered later.
12258
*
12259
* Callers must hold the rtnl semaphore. You may want
12260
* unregister_netdev() instead of this.
12261
*/
12262
12263
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
12264
{
12265
ASSERT_RTNL();
12266
12267
if (head) {
12268
list_move_tail(&dev->unreg_list, head);
12269
} else {
12270
LIST_HEAD(single);
12271
12272
list_add(&dev->unreg_list, &single);
12273
unregister_netdevice_many(&single);
12274
}
12275
}
12276
EXPORT_SYMBOL(unregister_netdevice_queue);
12277
12278
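/*
 * A minimal batching sketch: a hypothetical caller tearing down several
 * devices under one RTNL critical section (foo_dev_list and foo_list are
 * illustrative only).
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	list_for_each_entry(dev, &foo_dev_list, foo_list)
 *		unregister_netdevice_queue(dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */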
static void dev_memory_provider_uninstall(struct net_device *dev)
12279
{
12280
unsigned int i;
12281
12282
for (i = 0; i < dev->real_num_rx_queues; i++) {
12283
struct netdev_rx_queue *rxq = &dev->_rx[i];
12284
struct pp_memory_provider_params *p = &rxq->mp_params;
12285
12286
if (p->mp_ops && p->mp_ops->uninstall)
12287
p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq);
12288
}
12289
}
12290
12291
/* devices must be UP and netdev_lock()'d */
12292
static void netif_close_many_and_unlock(struct list_head *close_head)
12293
{
12294
struct net_device *dev, *tmp;
12295
12296
netif_close_many(close_head, false);
12297
12298
/* ... now unlock them */
12299
list_for_each_entry_safe(dev, tmp, close_head, close_list) {
12300
netdev_unlock(dev);
12301
list_del_init(&dev->close_list);
12302
}
12303
}
12304
12305
static void netif_close_many_and_unlock_cond(struct list_head *close_head)
12306
{
12307
#ifdef CONFIG_LOCKDEP
12308
/* We can only track up to MAX_LOCK_DEPTH locks per task.
12309
*
12310
* Reserve half the available slots for additional locks possibly
12311
* taken by notifiers and (soft)irqs.
12312
*/
12313
unsigned int limit = MAX_LOCK_DEPTH / 2;
12314
12315
if (lockdep_depth(current) > limit)
12316
netif_close_many_and_unlock(close_head);
12317
#endif
12318
}
12319
12320
void unregister_netdevice_many_notify(struct list_head *head,
12321
u32 portid, const struct nlmsghdr *nlh)
12322
{
12323
struct net_device *dev, *tmp;
12324
LIST_HEAD(close_head);
12325
int cnt = 0;
12326
12327
BUG_ON(dev_boot_phase);
12328
ASSERT_RTNL();
12329
12330
if (list_empty(head))
12331
return;
12332
12333
list_for_each_entry_safe(dev, tmp, head, unreg_list) {
12334
/* Some devices call us without having registered,
12335
* as part of initialization unwind. Remove those
12336
* devices and proceed with the remaining.
12337
*/
12338
if (dev->reg_state == NETREG_UNINITIALIZED) {
12339
pr_debug("unregister_netdevice: device %s/%p never was registered\n",
12340
dev->name, dev);
12341
12342
WARN_ON(1);
12343
list_del(&dev->unreg_list);
12344
continue;
12345
}
12346
dev->dismantle = true;
12347
BUG_ON(dev->reg_state != NETREG_REGISTERED);
12348
}
12349
12350
/* If device is running, close it first. Start with ops locked... */
12351
list_for_each_entry(dev, head, unreg_list) {
12352
if (!(dev->flags & IFF_UP))
12353
continue;
12354
if (netdev_need_ops_lock(dev)) {
12355
list_add_tail(&dev->close_list, &close_head);
12356
netdev_lock(dev);
12357
}
12358
netif_close_many_and_unlock_cond(&close_head);
12359
}
12360
netif_close_many_and_unlock(&close_head);
12361
/* ... now go over the rest. */
12362
list_for_each_entry(dev, head, unreg_list) {
12363
if (!netdev_need_ops_lock(dev))
12364
list_add_tail(&dev->close_list, &close_head);
12365
}
12366
netif_close_many(&close_head, true);
12367
12368
list_for_each_entry(dev, head, unreg_list) {
12369
/* And unlink it from device chain. */
12370
unlist_netdevice(dev);
12371
netdev_lock(dev);
12372
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
12373
netdev_unlock(dev);
12374
}
12375
flush_all_backlogs();
12376
12377
synchronize_net();
12378
12379
list_for_each_entry(dev, head, unreg_list) {
12380
struct sk_buff *skb = NULL;
12381
12382
/* Shutdown queueing discipline. */
12383
netdev_lock_ops(dev);
12384
dev_shutdown(dev);
12385
dev_tcx_uninstall(dev);
12386
dev_xdp_uninstall(dev);
12387
dev_memory_provider_uninstall(dev);
12388
netdev_unlock_ops(dev);
12389
bpf_dev_bound_netdev_unregister(dev);
12390
12391
netdev_offload_xstats_disable_all(dev);
12392
12393
/* Notify protocols that we are about to destroy
12394
* this device. They should clean all the things.
12395
*/
12396
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
12397
12398
if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
12399
skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
12400
GFP_KERNEL, NULL, 0,
12401
portid, nlh);
12402
12403
/*
12404
* Flush the unicast and multicast chains
12405
*/
12406
dev_uc_flush(dev);
12407
dev_mc_flush(dev);
12408
12409
netdev_name_node_alt_flush(dev);
12410
netdev_name_node_free(dev->name_node);
12411
12412
netdev_rss_contexts_free(dev);
12413
12414
call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
12415
12416
if (dev->netdev_ops->ndo_uninit)
12417
dev->netdev_ops->ndo_uninit(dev);
12418
12419
mutex_destroy(&dev->ethtool->rss_lock);
12420
12421
net_shaper_flush_netdev(dev);
12422
12423
if (skb)
12424
rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
12425
12426
/* The notifier chain MUST have detached all upper and lower devices by now. */
12427
WARN_ON(netdev_has_any_upper_dev(dev));
12428
WARN_ON(netdev_has_any_lower_dev(dev));
12429
12430
/* Remove entries from kobject tree */
12431
netdev_unregister_kobject(dev);
12432
#ifdef CONFIG_XPS
12433
/* Remove XPS queueing entries */
12434
netif_reset_xps_queues_gt(dev, 0);
12435
#endif
12436
}
12437
12438
synchronize_net();
12439
12440
list_for_each_entry(dev, head, unreg_list) {
12441
netdev_put(dev, &dev->dev_registered_tracker);
12442
net_set_todo(dev);
12443
cnt++;
12444
}
12445
atomic_add(cnt, &dev_unreg_count);
12446
12447
list_del(head);
12448
}
12449
12450
/**
12451
* unregister_netdevice_many - unregister many devices
12452
* @head: list of devices
12453
*
12454
* Note: As most callers use a stack-allocated list_head,
12455
* we force a list_del() to make sure the stack won't be corrupted later.
12456
*/
12457
void unregister_netdevice_many(struct list_head *head)
12458
{
12459
unregister_netdevice_many_notify(head, 0, NULL);
12460
}
12461
EXPORT_SYMBOL(unregister_netdevice_many);
12462
12463
/**
12464
* unregister_netdev - remove device from the kernel
12465
* @dev: device
12466
*
12467
* This function shuts down a device interface and removes it
12468
* from the kernel tables.
12469
*
12470
* This is just a wrapper for unregister_netdevice that takes
12471
* the rtnl semaphore. In general you want to use this and not
12472
* unregister_netdevice.
12473
*/
12474
void unregister_netdev(struct net_device *dev)
12475
{
12476
rtnl_net_dev_lock(dev);
12477
unregister_netdevice(dev);
12478
rtnl_net_dev_unlock(dev);
12479
}
12480
EXPORT_SYMBOL(unregister_netdev);
12481
12482
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
12483
const char *pat, int new_ifindex,
12484
struct netlink_ext_ack *extack)
12485
{
12486
struct netdev_name_node *name_node;
12487
struct net *net_old = dev_net(dev);
12488
char new_name[IFNAMSIZ] = {};
12489
int err, new_nsid;
12490
12491
ASSERT_RTNL();
12492
12493
/* Don't allow namespace local devices to be moved. */
12494
err = -EINVAL;
12495
if (dev->netns_immutable) {
12496
NL_SET_ERR_MSG(extack, "The interface netns is immutable");
12497
goto out;
12498
}
12499
12500
/* Ensure the device has been registered */
12501
if (dev->reg_state != NETREG_REGISTERED) {
12502
NL_SET_ERR_MSG(extack, "The interface isn't registered");
12503
goto out;
12504
}
12505
12506
/* Get out if there is nothing to do */
12507
err = 0;
12508
if (net_eq(net_old, net))
12509
goto out;
12510
12511
/* Pick the destination device name, and ensure
12512
* we can use it in the destination network namespace.
12513
*/
12514
err = -EEXIST;
12515
if (netdev_name_in_use(net, dev->name)) {
12516
/* We get here if we can't use the current device name */
12517
if (!pat) {
12518
NL_SET_ERR_MSG(extack,
12519
"An interface with the same name exists in the target netns");
12520
goto out;
12521
}
12522
err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
12523
if (err < 0) {
12524
NL_SET_ERR_MSG_FMT(extack,
12525
"Unable to use '%s' for the new interface name in the target netns",
12526
pat);
12527
goto out;
12528
}
12529
}
12530
/* Check that none of the altnames conflicts. */
12531
err = -EEXIST;
12532
netdev_for_each_altname(dev, name_node) {
12533
if (netdev_name_in_use(net, name_node->name)) {
12534
NL_SET_ERR_MSG_FMT(extack,
12535
"An interface with the altname %s exists in the target netns",
12536
name_node->name);
12537
goto out;
12538
}
12539
}
12540
12541
/* Check that new_ifindex isn't used yet. */
12542
if (new_ifindex) {
12543
err = dev_index_reserve(net, new_ifindex);
12544
if (err < 0) {
12545
NL_SET_ERR_MSG_FMT(extack,
12546
"The ifindex %d is not available in the target netns",
12547
new_ifindex);
12548
goto out;
12549
}
12550
} else {
12551
/* If there is an ifindex conflict, assign a new one */
12552
err = dev_index_reserve(net, dev->ifindex);
12553
if (err == -EBUSY)
12554
err = dev_index_reserve(net, 0);
12555
if (err < 0) {
12556
NL_SET_ERR_MSG(extack,
12557
"Unable to allocate a new ifindex in the target netns");
12558
goto out;
12559
}
12560
new_ifindex = err;
12561
}
12562
12563
/*
12564
* And now a mini version of register_netdevice and unregister_netdevice.
12565
*/
12566
12567
netdev_lock_ops(dev);
12568
/* If device is running close it first. */
12569
netif_close(dev);
12570
/* And unlink it from device chain */
12571
unlist_netdevice(dev);
12572
12573
if (!netdev_need_ops_lock(dev))
12574
netdev_lock(dev);
12575
dev->moving_ns = true;
12576
netdev_unlock(dev);
12577
12578
synchronize_net();
12579
12580
/* Shutdown queueing discipline. */
12581
netdev_lock_ops(dev);
12582
dev_shutdown(dev);
12583
netdev_unlock_ops(dev);
12584
12585
/* Notify protocols that we are about to destroy
12586
* this device. They should clean all the things.
12587
*
12588
* Note that dev->reg_state stays at NETREG_REGISTERED.
12589
* This is wanted because this way 8021q and macvlan know
12590
* the device is just moving and can keep their slaves up.
12591
*/
12592
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
12593
rcu_barrier();
12594
12595
new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
12596
12597
rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
12598
new_ifindex);
12599
12600
/*
12601
* Flush the unicast and multicast chains
12602
*/
12603
dev_uc_flush(dev);
12604
dev_mc_flush(dev);
12605
12606
/* Send a netdev-removed uevent to the old namespace */
12607
kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
12608
netdev_adjacent_del_links(dev);
12609
12610
/* Move per-net netdevice notifiers that are following the netdevice */
12611
move_netdevice_notifiers_dev_net(dev, net);
12612
12613
/* Actually switch the network namespace */
12614
netdev_lock(dev);
12615
dev_net_set(dev, net);
12616
netdev_unlock(dev);
12617
dev->ifindex = new_ifindex;
12618
12619
if (new_name[0]) {
12620
/* Rename the netdev to prepared name */
12621
write_seqlock_bh(&netdev_rename_lock);
12622
strscpy(dev->name, new_name, IFNAMSIZ);
12623
write_sequnlock_bh(&netdev_rename_lock);
12624
}
12625
12626
/* Fixup kobjects */
12627
dev_set_uevent_suppress(&dev->dev, 1);
12628
err = device_rename(&dev->dev, dev->name);
12629
dev_set_uevent_suppress(&dev->dev, 0);
12630
WARN_ON(err);
12631
12632
/* Send a netdev-add uevent to the new namespace */
12633
kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
12634
netdev_adjacent_add_links(dev);
12635
12636
/* Adapt owner in case the owning user namespace of the target network
12637
* namespace is different from the original one.
12638
*/
12639
err = netdev_change_owner(dev, net_old, net);
12640
WARN_ON(err);
12641
12642
netdev_lock(dev);
12643
dev->moving_ns = false;
12644
if (!netdev_need_ops_lock(dev))
12645
netdev_unlock(dev);
12646
12647
/* Add the device back in the hashes */
12648
list_netdevice(dev);
12649
/* Notify protocols that a new device appeared. */
12650
call_netdevice_notifiers(NETDEV_REGISTER, dev);
12651
netdev_unlock_ops(dev);
12652
12653
/*
12654
* Prevent userspace races by waiting until the network
12655
* device is fully set up before sending notifications.
12656
*/
12657
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
12658
12659
synchronize_net();
12660
err = 0;
12661
out:
12662
return err;
12663
}
12664
12665
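/*
 * A minimal sketch of moving a device into another namespace via the
 * dev_change_net_namespace() wrapper (as used further below), assuming
 * target_net is a valid struct net reference obtained elsewhere:
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "eth%d");
 *	rtnl_unlock();
 */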
static int dev_cpu_dead(unsigned int oldcpu)
12666
{
12667
struct sk_buff **list_skb;
12668
struct sk_buff *skb;
12669
unsigned int cpu;
12670
struct softnet_data *sd, *oldsd, *remsd = NULL;
12671
12672
local_irq_disable();
12673
cpu = smp_processor_id();
12674
sd = &per_cpu(softnet_data, cpu);
12675
oldsd = &per_cpu(softnet_data, oldcpu);
12676
12677
/* Find end of our completion_queue. */
12678
list_skb = &sd->completion_queue;
12679
while (*list_skb)
12680
list_skb = &(*list_skb)->next;
12681
/* Append completion queue from offline CPU. */
12682
*list_skb = oldsd->completion_queue;
12683
oldsd->completion_queue = NULL;
12684
12685
/* Append output queue from offline CPU. */
12686
if (oldsd->output_queue) {
12687
*sd->output_queue_tailp = oldsd->output_queue;
12688
sd->output_queue_tailp = oldsd->output_queue_tailp;
12689
oldsd->output_queue = NULL;
12690
oldsd->output_queue_tailp = &oldsd->output_queue;
12691
}
12692
/* Append NAPI poll list from offline CPU, with one exception:
12693
* process_backlog() must be called by the CPU owning the per-CPU backlog.
12694
* We properly handle process_queue & input_pkt_queue later.
12695
*/
12696
while (!list_empty(&oldsd->poll_list)) {
12697
struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
12698
struct napi_struct,
12699
poll_list);
12700
12701
list_del_init(&napi->poll_list);
12702
if (napi->poll == process_backlog)
12703
napi->state &= NAPIF_STATE_THREADED;
12704
else
12705
____napi_schedule(sd, napi);
12706
}
12707
12708
raise_softirq_irqoff(NET_TX_SOFTIRQ);
12709
local_irq_enable();
12710
12711
if (!use_backlog_threads()) {
12712
#ifdef CONFIG_RPS
12713
remsd = oldsd->rps_ipi_list;
12714
oldsd->rps_ipi_list = NULL;
12715
#endif
12716
/* send out pending IPIs on the offline CPU */
12717
net_rps_send_ipi(remsd);
12718
}
12719
12720
/* Process offline CPU's input_pkt_queue */
12721
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
12722
netif_rx(skb);
12723
rps_input_queue_head_incr(oldsd);
12724
}
12725
while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
12726
netif_rx(skb);
12727
rps_input_queue_head_incr(oldsd);
12728
}
12729
12730
return 0;
12731
}
12732
12733
/**
12734
* netdev_increment_features - increment feature set by one
12735
* @all: current feature set
12736
* @one: new feature set
12737
* @mask: mask feature set
12738
*
12739
* Computes a new feature set after adding a device with feature set
12740
* @one to the master device with current feature set @all. Will not
12741
* enable anything that is off in @mask. Returns the new feature set.
12742
*/
12743
netdev_features_t netdev_increment_features(netdev_features_t all,
12744
netdev_features_t one, netdev_features_t mask)
12745
{
12746
if (mask & NETIF_F_HW_CSUM)
12747
mask |= NETIF_F_CSUM_MASK;
12748
mask |= NETIF_F_VLAN_CHALLENGED;
12749
12750
all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
12751
all &= one | ~NETIF_F_ALL_FOR_ALL;
12752
12753
/* If one device supports hw checksumming, set for all. */
12754
if (all & NETIF_F_HW_CSUM)
12755
all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
12756
12757
return all;
12758
}
12759
EXPORT_SYMBOL(netdev_increment_features);
12760
12761
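/*
 * A minimal sketch of how an upper/master driver could fold slave feature
 * sets together with this helper (FOO_VLAN_FEATURES, master_priv and the
 * slave list are illustrative only; see also
 * netdev_compute_master_upper_features() below):
 *
 *	netdev_features_t features = FOO_VLAN_FEATURES;
 *
 *	list_for_each_entry(slave, &master_priv->slaves, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->vlan_features,
 *						     FOO_VLAN_FEATURES);
 *	master->vlan_features = features;
 */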
/**
12762
* netdev_compute_master_upper_features - compute features from lower devices
12763
* @dev: the upper device
12764
* @update_header: whether to update upper device's header_len/headroom/tailroom
12765
*
12766
* Recompute the upper device's features based on all lower devices.
12767
*/
12768
void netdev_compute_master_upper_features(struct net_device *dev, bool update_header)
12769
{
12770
unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
12771
netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES;
12772
netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES;
12773
netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES;
12774
netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES;
12775
netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES;
12776
unsigned short max_header_len = ETH_HLEN;
12777
unsigned int tso_max_size = TSO_MAX_SIZE;
12778
unsigned short max_headroom = 0;
12779
unsigned short max_tailroom = 0;
12780
u16 tso_max_segs = TSO_MAX_SEGS;
12781
struct net_device *lower_dev;
12782
struct list_head *iter;
12783
12784
mpls_features = netdev_base_features(mpls_features);
12785
vlan_features = netdev_base_features(vlan_features);
12786
enc_features = netdev_base_features(enc_features);
12787
12788
netdev_for_each_lower_dev(dev, lower_dev, iter) {
12789
gso_partial_features = netdev_increment_features(gso_partial_features,
12790
lower_dev->gso_partial_features,
12791
MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES);
12792
12793
vlan_features = netdev_increment_features(vlan_features,
12794
lower_dev->vlan_features,
12795
MASTER_UPPER_DEV_VLAN_FEATURES);
12796
12797
enc_features = netdev_increment_features(enc_features,
12798
lower_dev->hw_enc_features,
12799
MASTER_UPPER_DEV_ENC_FEATURES);
12800
12801
if (IS_ENABLED(CONFIG_XFRM_OFFLOAD))
12802
xfrm_features = netdev_increment_features(xfrm_features,
12803
lower_dev->hw_enc_features,
12804
MASTER_UPPER_DEV_XFRM_FEATURES);
12805
12806
mpls_features = netdev_increment_features(mpls_features,
12807
lower_dev->mpls_features,
12808
MASTER_UPPER_DEV_MPLS_FEATURES);
12809
12810
dst_release_flag &= lower_dev->priv_flags;
12811
12812
if (update_header) {
12813
max_header_len = max(max_header_len, lower_dev->hard_header_len);
12814
max_headroom = max(max_headroom, lower_dev->needed_headroom);
12815
max_tailroom = max(max_tailroom, lower_dev->needed_tailroom);
12816
}
12817
12818
tso_max_size = min(tso_max_size, lower_dev->tso_max_size);
12819
tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs);
12820
}
12821
12822
dev->gso_partial_features = gso_partial_features;
12823
dev->vlan_features = vlan_features;
12824
dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
12825
NETIF_F_HW_VLAN_CTAG_TX |
12826
NETIF_F_HW_VLAN_STAG_TX;
12827
if (IS_ENABLED(CONFIG_XFRM_OFFLOAD))
12828
dev->hw_enc_features |= xfrm_features;
12829
dev->mpls_features = mpls_features;
12830
12831
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
12832
if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
12833
dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
12834
dev->priv_flags |= IFF_XMIT_DST_RELEASE;
12835
12836
if (update_header) {
12837
dev->hard_header_len = max_header_len;
12838
dev->needed_headroom = max_headroom;
12839
dev->needed_tailroom = max_tailroom;
12840
}
12841
12842
netif_set_tso_max_segs(dev, tso_max_segs);
12843
netif_set_tso_max_size(dev, tso_max_size);
12844
12845
netdev_change_features(dev);
12846
}
12847
EXPORT_SYMBOL(netdev_compute_master_upper_features);
12848
12849
static struct hlist_head * __net_init netdev_create_hash(void)
12850
{
12851
int i;
12852
struct hlist_head *hash;
12853
12854
hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
12855
if (hash != NULL)
12856
for (i = 0; i < NETDEV_HASHENTRIES; i++)
12857
INIT_HLIST_HEAD(&hash[i]);
12858
12859
return hash;
12860
}
12861
12862
/* Initialize per network namespace state */
12863
static int __net_init netdev_init(struct net *net)
12864
{
12865
BUILD_BUG_ON(GRO_HASH_BUCKETS >
12866
BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask));
12867
12868
INIT_LIST_HEAD(&net->dev_base_head);
12869
12870
net->dev_name_head = netdev_create_hash();
12871
if (net->dev_name_head == NULL)
12872
goto err_name;
12873
12874
net->dev_index_head = netdev_create_hash();
12875
if (net->dev_index_head == NULL)
12876
goto err_idx;
12877
12878
xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
12879
12880
RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
12881
12882
return 0;
12883
12884
err_idx:
12885
kfree(net->dev_name_head);
12886
err_name:
12887
return -ENOMEM;
12888
}
12889
12890
/**
12891
* netdev_drivername - network driver for the device
12892
* @dev: network device
12893
*
12894
* Determine network driver for device.
12895
*/
12896
const char *netdev_drivername(const struct net_device *dev)
12897
{
12898
const struct device_driver *driver;
12899
const struct device *parent;
12900
const char *empty = "";
12901
12902
parent = dev->dev.parent;
12903
if (!parent)
12904
return empty;
12905
12906
driver = parent->driver;
12907
if (driver && driver->name)
12908
return driver->name;
12909
return empty;
12910
}
12911
12912
static void __netdev_printk(const char *level, const struct net_device *dev,
12913
struct va_format *vaf)
12914
{
12915
if (dev && dev->dev.parent) {
12916
dev_printk_emit(level[1] - '0',
12917
dev->dev.parent,
12918
"%s %s %s%s: %pV",
12919
dev_driver_string(dev->dev.parent),
12920
dev_name(dev->dev.parent),
12921
netdev_name(dev), netdev_reg_state(dev),
12922
vaf);
12923
} else if (dev) {
12924
printk("%s%s%s: %pV",
12925
level, netdev_name(dev), netdev_reg_state(dev), vaf);
12926
} else {
12927
printk("%s(NULL net_device): %pV", level, vaf);
12928
}
12929
}
12930
12931
void netdev_printk(const char *level, const struct net_device *dev,
12932
const char *format, ...)
12933
{
12934
struct va_format vaf;
12935
va_list args;
12936
12937
va_start(args, format);
12938
12939
vaf.fmt = format;
12940
vaf.va = &args;
12941
12942
__netdev_printk(level, dev, &vaf);
12943
12944
va_end(args);
12945
}
12946
EXPORT_SYMBOL(netdev_printk);
12947
12948
#define define_netdev_printk_level(func, level) \
12949
void func(const struct net_device *dev, const char *fmt, ...) \
12950
{ \
12951
struct va_format vaf; \
12952
va_list args; \
12953
\
12954
va_start(args, fmt); \
12955
\
12956
vaf.fmt = fmt; \
12957
vaf.va = &args; \
12958
\
12959
__netdev_printk(level, dev, &vaf); \
12960
\
12961
va_end(args); \
12962
} \
12963
EXPORT_SYMBOL(func);
12964
12965
define_netdev_printk_level(netdev_emerg, KERN_EMERG);
12966
define_netdev_printk_level(netdev_alert, KERN_ALERT);
12967
define_netdev_printk_level(netdev_crit, KERN_CRIT);
12968
define_netdev_printk_level(netdev_err, KERN_ERR);
12969
define_netdev_printk_level(netdev_warn, KERN_WARNING);
12970
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
12971
define_netdev_printk_level(netdev_info, KERN_INFO);
12972
12973
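/*
 * A minimal usage sketch for the level helpers defined above, from a
 * hypothetical driver (qid and new_mtu are illustrative only):
 *
 *	netdev_warn(dev, "TX timeout on queue %d\n", qid);
 *	netdev_info(dev, "MTU changed to %d\n", new_mtu);
 */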
static void __net_exit netdev_exit(struct net *net)
12974
{
12975
kfree(net->dev_name_head);
12976
kfree(net->dev_index_head);
12977
xa_destroy(&net->dev_by_index);
12978
if (net != &init_net)
12979
WARN_ON_ONCE(!list_empty(&net->dev_base_head));
12980
}
12981
12982
static struct pernet_operations __net_initdata netdev_net_ops = {
12983
.init = netdev_init,
12984
.exit = netdev_exit,
12985
};
12986
12987
static void __net_exit default_device_exit_net(struct net *net)
12988
{
12989
struct netdev_name_node *name_node, *tmp;
12990
struct net_device *dev, *aux;
12991
/*
12992
* Push all migratable network devices back to the
12993
* initial network namespace
12994
*/
12995
ASSERT_RTNL();
12996
for_each_netdev_safe(net, dev, aux) {
12997
int err;
12998
char fb_name[IFNAMSIZ];
12999
13000
/* Ignore unmovable devices (e.g. loopback) */
13001
if (dev->netns_immutable)
13002
continue;
13003
13004
/* Leave virtual devices for the generic cleanup */
13005
if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
13006
continue;
13007
13008
/* Push remaining network devices to init_net */
13009
snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
13010
if (netdev_name_in_use(&init_net, fb_name))
13011
snprintf(fb_name, IFNAMSIZ, "dev%%d");
13012
13013
netdev_for_each_altname_safe(dev, name_node, tmp)
13014
if (netdev_name_in_use(&init_net, name_node->name))
13015
__netdev_name_node_alt_destroy(name_node);
13016
13017
err = dev_change_net_namespace(dev, &init_net, fb_name);
13018
if (err) {
13019
pr_emerg("%s: failed to move %s to init_net: %d\n",
13020
__func__, dev->name, err);
13021
BUG();
13022
}
13023
}
13024
}
13025
13026
static void __net_exit default_device_exit_batch(struct list_head *net_list)
13027
{
13028
/* At exit, all network devices must be removed from a network
13029
* namespace. Do this in the reverse order of registration.
13030
* Do this across as many network namespaces as possible to
13031
* improve batching efficiency.
13032
*/
13033
struct net_device *dev;
13034
struct net *net;
13035
LIST_HEAD(dev_kill_list);
13036
13037
rtnl_lock();
13038
list_for_each_entry(net, net_list, exit_list) {
13039
default_device_exit_net(net);
13040
cond_resched();
13041
}
13042
13043
list_for_each_entry(net, net_list, exit_list) {
13044
for_each_netdev_reverse(net, dev) {
13045
if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
13046
dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
13047
else
13048
unregister_netdevice_queue(dev, &dev_kill_list);
13049
}
13050
}
13051
unregister_netdevice_many(&dev_kill_list);
13052
rtnl_unlock();
13053
}
13054
13055
static struct pernet_operations __net_initdata default_device_ops = {
13056
.exit_batch = default_device_exit_batch,
13057
};
13058
13059
static void __init net_dev_struct_check(void)
13060
{
13061
/* TX read-mostly hotpath */
13062
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast);
13063
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
13064
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
13065
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
13066
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
13067
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
13068
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
13069
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
13070
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
13071
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
13072
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
13073
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
13074
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
13075
#ifdef CONFIG_XPS
13076
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
13077
#endif
13078
#ifdef CONFIG_NETFILTER_EGRESS
13079
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
13080
#endif
13081
#ifdef CONFIG_NET_XGRESS
13082
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
13083
#endif
13084
CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
13085
13086
/* TXRX read-mostly hotpath */
13087
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
13088
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
13089
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
13090
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
13091
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
13092
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
13093
CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
13094
13095
/* RX read-mostly hotpath */
13096
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
13097
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
13098
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
13099
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
13100
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
13101
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
13102
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
13103
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
13104
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
13105
#ifdef CONFIG_NETPOLL
13106
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
13107
#endif
13108
#ifdef CONFIG_NET_XGRESS
13109
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
13110
#endif
13111
CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
13112
}
13113
13114
/*
13115
* Initialize the DEV module. At boot time this walks the device list and
13116
* unhooks any devices that fail to initialise (normally hardware not
13117
* present) and leaves us with a valid list of present and active devices.
13118
*
13119
*/
13120
13121
/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
13122
#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE)
13123
13124
static int net_page_pool_create(int cpuid)
13125
{
13126
#if IS_ENABLED(CONFIG_PAGE_POOL)
13127
struct page_pool_params page_pool_params = {
13128
.pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
13129
.flags = PP_FLAG_SYSTEM_POOL,
13130
.nid = cpu_to_mem(cpuid),
13131
};
13132
struct page_pool *pp_ptr;
13133
int err;
13134
13135
pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
13136
if (IS_ERR(pp_ptr))
13137
return -ENOMEM;
13138
13139
err = xdp_reg_page_pool(pp_ptr);
13140
if (err) {
13141
page_pool_destroy(pp_ptr);
13142
return err;
13143
}
13144
13145
per_cpu(system_page_pool.pool, cpuid) = pp_ptr;
13146
#endif
13147
return 0;
13148
}
13149
13150
static int backlog_napi_should_run(unsigned int cpu)
13151
{
13152
struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
13153
struct napi_struct *napi = &sd->backlog;
13154
13155
return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
13156
}
13157
13158
static void run_backlog_napi(unsigned int cpu)
13159
{
13160
struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
13161
13162
napi_threaded_poll_loop(&sd->backlog, false);
13163
}
13164
13165
static void backlog_napi_setup(unsigned int cpu)
13166
{
13167
struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
13168
struct napi_struct *napi = &sd->backlog;
13169
13170
napi->thread = this_cpu_read(backlog_napi);
13171
set_bit(NAPI_STATE_THREADED, &napi->state);
13172
}
13173
13174
static struct smp_hotplug_thread backlog_threads = {
13175
.store = &backlog_napi,
13176
.thread_should_run = backlog_napi_should_run,
13177
.thread_fn = run_backlog_napi,
13178
.thread_comm = "backlog_napi/%u",
13179
.setup = backlog_napi_setup,
13180
};
13181
13182
/*
13183
* This is called single-threaded during boot, so there is no need
13184
* to take the rtnl semaphore.
13185
*/
13186
static int __init net_dev_init(void)
13187
{
13188
int i, rc = -ENOMEM;
13189
13190
BUG_ON(!dev_boot_phase);
13191
13192
net_dev_struct_check();
13193
13194
if (dev_proc_init())
13195
goto out;
13196
13197
if (netdev_kobject_init())
13198
goto out;
13199
13200
for (i = 0; i < PTYPE_HASH_SIZE; i++)
13201
INIT_LIST_HEAD(&ptype_base[i]);
13202
13203
if (register_pernet_subsys(&netdev_net_ops))
13204
goto out;
13205
13206
/*
13207
* Initialise the packet receive queues.
13208
*/
13209
13210
flush_backlogs_fallback = flush_backlogs_alloc();
13211
if (!flush_backlogs_fallback)
13212
goto out;
13213
13214
for_each_possible_cpu(i) {
13215
struct softnet_data *sd = &per_cpu(softnet_data, i);
13216
13217
skb_queue_head_init(&sd->input_pkt_queue);
13218
skb_queue_head_init(&sd->process_queue);
13219
#ifdef CONFIG_XFRM_OFFLOAD
13220
skb_queue_head_init(&sd->xfrm_backlog);
13221
#endif
13222
INIT_LIST_HEAD(&sd->poll_list);
13223
sd->output_queue_tailp = &sd->output_queue;
13224
#ifdef CONFIG_RPS
13225
INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
13226
sd->cpu = i;
13227
#endif
13228
INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
13229
13230
gro_init(&sd->backlog.gro);
13231
sd->backlog.poll = process_backlog;
13232
sd->backlog.weight = weight_p;
13233
INIT_LIST_HEAD(&sd->backlog.poll_list);
13234
13235
if (net_page_pool_create(i))
13236
goto out;
13237
}
13238
net_hotdata.skb_defer_nodes =
13239
__alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids,
13240
__alignof__(struct skb_defer_node));
13241
if (!net_hotdata.skb_defer_nodes)
13242
goto out;
13243
if (use_backlog_threads())
13244
smpboot_register_percpu_thread(&backlog_threads);
13245
13246
dev_boot_phase = 0;
13247
13248
/* The loopback device is special: if any other network device
13249
* is present in a network namespace, the loopback device must
13250
* be present too. Since we now dynamically allocate and free the
13251
* loopback device, ensure this invariant is maintained by
13252
* keeping the loopback device as the first device on the
13253
* list of network devices, so that the loopback device
13254
* is the first device that appears and the last network device
13255
* that disappears.
13256
*/
13257
if (register_pernet_device(&loopback_net_ops))
13258
goto out;
13259
13260
if (register_pernet_device(&default_device_ops))
13261
goto out;
13262
13263
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
13264
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
13265
13266
rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
13267
NULL, dev_cpu_dead);
13268
WARN_ON(rc < 0);
13269
rc = 0;
13270
13271
/* avoid static key IPIs to isolated CPUs */
13272
if (housekeeping_enabled(HK_TYPE_MISC))
13273
net_enable_timestamp();
13274
out:
13275
if (rc < 0) {
13276
for_each_possible_cpu(i) {
13277
struct page_pool *pp_ptr;
13278
13279
pp_ptr = per_cpu(system_page_pool.pool, i);
13280
if (!pp_ptr)
13281
continue;
13282
13283
xdp_unreg_page_pool(pp_ptr);
13284
page_pool_destroy(pp_ptr);
13285
per_cpu(system_page_pool.pool, i) = NULL;
13286
}
13287
}
13288
13289
return rc;
13290
}
13291
13292
subsys_initcall(net_dev_init);
13293
13294