Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
awilliam
GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/core/dev.c
15111 views
1
/*
2
* NET3 Protocol independent device support routines.
3
*
4
* This program is free software; you can redistribute it and/or
5
* modify it under the terms of the GNU General Public License
6
* as published by the Free Software Foundation; either version
7
* 2 of the License, or (at your option) any later version.
8
*
9
* Derived from the non IP parts of dev.c 1.0.19
10
* Authors: Ross Biro
11
* Fred N. van Kempen, <[email protected]>
12
* Mark Evans, <[email protected]>
13
*
14
* Additional Authors:
15
* Florian la Roche <[email protected]>
16
* Alan Cox <[email protected]>
17
* David Hinds <[email protected]>
18
* Alexey Kuznetsov <[email protected]>
19
* Adam Sulmicki <[email protected]>
20
* Pekka Riikonen <[email protected]>
21
*
22
* Changes:
23
* D.J. Barrow : Fixed bug where dev->refcnt gets set
24
* to 2 if register_netdev gets called
25
* before net_dev_init & also removed a
26
* few lines of code in the process.
27
* Alan Cox : device private ioctl copies fields back.
28
* Alan Cox : Transmit queue code does relevant
29
* stunts to keep the queue safe.
30
* Alan Cox : Fixed double lock.
31
* Alan Cox : Fixed promisc NULL pointer trap
32
* ???????? : Support the full private ioctl range
33
* Alan Cox : Moved ioctl permission check into
34
* drivers
35
* Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36
* Alan Cox : 100 backlog just doesn't cut it when
37
* you start doing multicast video 8)
38
* Alan Cox : Rewrote net_bh and list manager.
39
* Alan Cox : Fix ETH_P_ALL echoback lengths.
40
* Alan Cox : Took out transmit every packet pass
41
* Saved a few bytes in the ioctl handler
42
* Alan Cox : Network driver sets packet type before
43
* calling netif_rx. Saves a function
44
* call a packet.
45
* Alan Cox : Hashed net_bh()
46
* Richard Kooijman: Timestamp fixes.
47
* Alan Cox : Wrong field in SIOCGIFDSTADDR
48
* Alan Cox : Device lock protection.
49
* Alan Cox : Fixed nasty side effect of device close
50
* changes.
51
* Rudi Cilibrasi : Pass the right thing to
52
* set_mac_address()
53
* Dave Miller : 32bit quantity for the device lock to
54
* make it work out on a Sparc.
55
* Bjorn Ekwall : Added KERNELD hack.
56
* Alan Cox : Cleaned up the backlog initialise.
57
* Craig Metz : SIOCGIFCONF fix if space for under
58
* 1 device.
59
* Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60
* is no device open function.
61
* Andi Kleen : Fix error reporting for SIOCGIFCONF
62
* Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63
* Cyrus Durgin : Cleaned for KMOD
64
* Adam Sulmicki : Bug Fix : Network Device Unload
65
* A network device unload needs to purge
66
* the backlog queue.
67
* Paul Rusty Russell : SIOCSIFNAME
68
* Pekka Riikonen : Netdev boot-time settings code
69
* Andrew Morton : Make unregister_netdevice wait
70
* indefinitely on dev->refcnt
71
* J Hadi Salim : - Backlog queue sampling
72
* - netif_rx() feedback
73
*/
74
75
#include <asm/uaccess.h>
76
#include <asm/system.h>
77
#include <linux/bitops.h>
78
#include <linux/capability.h>
79
#include <linux/cpu.h>
80
#include <linux/types.h>
81
#include <linux/kernel.h>
82
#include <linux/hash.h>
83
#include <linux/slab.h>
84
#include <linux/sched.h>
85
#include <linux/mutex.h>
86
#include <linux/string.h>
87
#include <linux/mm.h>
88
#include <linux/socket.h>
89
#include <linux/sockios.h>
90
#include <linux/errno.h>
91
#include <linux/interrupt.h>
92
#include <linux/if_ether.h>
93
#include <linux/netdevice.h>
94
#include <linux/etherdevice.h>
95
#include <linux/ethtool.h>
96
#include <linux/notifier.h>
97
#include <linux/skbuff.h>
98
#include <net/net_namespace.h>
99
#include <net/sock.h>
100
#include <linux/rtnetlink.h>
101
#include <linux/proc_fs.h>
102
#include <linux/seq_file.h>
103
#include <linux/stat.h>
104
#include <net/dst.h>
105
#include <net/pkt_sched.h>
106
#include <net/checksum.h>
107
#include <net/xfrm.h>
108
#include <linux/highmem.h>
109
#include <linux/init.h>
110
#include <linux/kmod.h>
111
#include <linux/module.h>
112
#include <linux/netpoll.h>
113
#include <linux/rcupdate.h>
114
#include <linux/delay.h>
115
#include <net/wext.h>
116
#include <net/iw_handler.h>
117
#include <asm/current.h>
118
#include <linux/audit.h>
119
#include <linux/dmaengine.h>
120
#include <linux/err.h>
121
#include <linux/ctype.h>
122
#include <linux/if_arp.h>
123
#include <linux/if_vlan.h>
124
#include <linux/ip.h>
125
#include <net/ip.h>
126
#include <linux/ipv6.h>
127
#include <linux/in.h>
128
#include <linux/jhash.h>
129
#include <linux/random.h>
130
#include <trace/events/napi.h>
131
#include <trace/events/net.h>
132
#include <trace/events/skb.h>
133
#include <linux/pci.h>
134
#include <linux/inetdevice.h>
135
#include <linux/cpu_rmap.h>
136
137
#include "net-sysfs.h"
138
139
/* Instead of increasing this, you should create a hash table. */
140
#define MAX_GRO_SKBS 8
141
142
/* This should be increased if a protocol with a bigger head is added. */
143
#define GRO_MAX_HEAD (MAX_HEADER + 128)
144
145
/*
146
* The list of packet types we will receive (as opposed to discard)
147
* and the routines to invoke.
148
*
149
* Why 16. Because with 16 the only overlap we get on a hash of the
150
* low nibble of the protocol value is RARP/SNAP/X.25.
151
*
152
* NOTE: That is no longer true with the addition of VLAN tags. Not
153
* sure which should go first, but I bet it won't make much
154
* difference if we are running VLANs. The good news is that
155
* this protocol won't be in the list unless compiled in, so
156
* the average user (w/out VLANs) will not be adversely affected.
157
* --BLG
158
*
159
* 0800 IP
160
* 8100 802.1Q VLAN
161
* 0001 802.3
162
* 0002 AX.25
163
* 0004 802.2
164
* 8035 RARP
165
* 0005 SNAP
166
* 0805 X.25
167
* 0806 ARP
168
* 8137 IPX
169
* 0009 Localtalk
170
* 86DD IPv6
171
*/
172
173
#define PTYPE_HASH_SIZE (16)
174
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
175
176
static DEFINE_SPINLOCK(ptype_lock);
177
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178
static struct list_head ptype_all __read_mostly; /* Taps */
179
180
/*
181
* The @dev_base_head list is protected by @dev_base_lock and the rtnl
182
* semaphore.
183
*
184
* Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185
*
186
* Writers must hold the rtnl semaphore while they loop through the
187
* dev_base_head list, and hold dev_base_lock for writing when they do the
188
* actual updates. This allows pure readers to access the list even
189
* while a writer is preparing to update it.
190
*
191
* To put it another way, dev_base_lock is held for writing only to
192
* protect against pure readers; the rtnl semaphore provides the
193
* protection against other writers.
194
*
195
* See, for example usages, register_netdevice() and
196
* unregister_netdevice(), which must be called with the rtnl
197
* semaphore held.
198
*/
199
DEFINE_RWLOCK(dev_base_lock);
200
EXPORT_SYMBOL(dev_base_lock);
201
202
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
203
{
204
unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
205
return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
206
}
207
208
static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
209
{
210
return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
211
}
212
213
static inline void rps_lock(struct softnet_data *sd)
214
{
215
#ifdef CONFIG_RPS
216
spin_lock(&sd->input_pkt_queue.lock);
217
#endif
218
}
219
220
static inline void rps_unlock(struct softnet_data *sd)
221
{
222
#ifdef CONFIG_RPS
223
spin_unlock(&sd->input_pkt_queue.lock);
224
#endif
225
}
226
227
/* Device list insertion */
228
static int list_netdevice(struct net_device *dev)
229
{
230
struct net *net = dev_net(dev);
231
232
ASSERT_RTNL();
233
234
write_lock_bh(&dev_base_lock);
235
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
236
hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
237
hlist_add_head_rcu(&dev->index_hlist,
238
dev_index_hash(net, dev->ifindex));
239
write_unlock_bh(&dev_base_lock);
240
return 0;
241
}
242
243
/* Device list removal
244
* caller must respect a RCU grace period before freeing/reusing dev
245
*/
246
static void unlist_netdevice(struct net_device *dev)
247
{
248
ASSERT_RTNL();
249
250
/* Unlink dev from the device chain */
251
write_lock_bh(&dev_base_lock);
252
list_del_rcu(&dev->dev_list);
253
hlist_del_rcu(&dev->name_hlist);
254
hlist_del_rcu(&dev->index_hlist);
255
write_unlock_bh(&dev_base_lock);
256
}
257
258
/*
259
* Our notifier list
260
*/
261
262
static RAW_NOTIFIER_HEAD(netdev_chain);
263
264
/*
265
* Device drivers call our routines to queue packets here. We empty the
266
* queue in the local softnet handler.
267
*/
268
269
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270
EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272
#ifdef CONFIG_LOCKDEP
273
/*
274
* register_netdevice() inits txq->_xmit_lock and sets lockdep class
275
* according to dev->type
276
*/
277
static const unsigned short netdev_lock_type[] =
278
{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279
ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280
ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281
ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282
ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283
ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284
ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285
ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286
ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287
ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288
ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289
ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290
ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
291
ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
292
ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
293
ARPHRD_VOID, ARPHRD_NONE};
294
295
static const char *const netdev_lock_name[] =
296
{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297
"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298
"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299
"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300
"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301
"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302
"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303
"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304
"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305
"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306
"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307
"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308
"_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309
"_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310
"_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311
"_xmit_VOID", "_xmit_NONE"};
312
313
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
314
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
315
316
static inline unsigned short netdev_lock_pos(unsigned short dev_type)
317
{
318
int i;
319
320
for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
321
if (netdev_lock_type[i] == dev_type)
322
return i;
323
/* the last key is used by default */
324
return ARRAY_SIZE(netdev_lock_type) - 1;
325
}
326
327
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328
unsigned short dev_type)
329
{
330
int i;
331
332
i = netdev_lock_pos(dev_type);
333
lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
334
netdev_lock_name[i]);
335
}
336
337
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
338
{
339
int i;
340
341
i = netdev_lock_pos(dev->type);
342
lockdep_set_class_and_name(&dev->addr_list_lock,
343
&netdev_addr_lock_key[i],
344
netdev_lock_name[i]);
345
}
346
#else
347
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
348
unsigned short dev_type)
349
{
350
}
351
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352
{
353
}
354
#endif
355
356
/*******************************************************************************
357
358
Protocol management and registration routines
359
360
*******************************************************************************/
361
362
/*
363
* Add a protocol ID to the list. Now that the input handler is
364
* smarter we can dispense with all the messy stuff that used to be
365
* here.
366
*
367
* BEWARE!!! Protocol handlers, mangling input packets,
368
* MUST BE last in hash buckets and checking protocol handlers
369
* MUST start from promiscuous ptype_all chain in net_bh.
370
* It is true now, do not change it.
371
* Explanation follows: if protocol handler, mangling packet, will
372
* be the first on list, it is not able to sense, that packet
373
* is cloned and should be copied-on-write, so that it will
374
* change it and subsequent readers will get broken packet.
375
* --ANK (980803)
376
*/
377
378
static inline struct list_head *ptype_head(const struct packet_type *pt)
379
{
380
if (pt->type == htons(ETH_P_ALL))
381
return &ptype_all;
382
else
383
return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384
}
385
386
/**
387
* dev_add_pack - add packet handler
388
* @pt: packet type declaration
389
*
390
* Add a protocol handler to the networking stack. The passed &packet_type
391
* is linked into kernel lists and may not be freed until it has been
392
* removed from the kernel lists.
393
*
394
* This call does not sleep therefore it can not
395
* guarantee all CPU's that are in middle of receiving packets
396
* will see the new packet type (until the next received packet).
397
*/
398
399
void dev_add_pack(struct packet_type *pt)
400
{
401
struct list_head *head = ptype_head(pt);
402
403
spin_lock(&ptype_lock);
404
list_add_rcu(&pt->list, head);
405
spin_unlock(&ptype_lock);
406
}
407
EXPORT_SYMBOL(dev_add_pack);
408
409
/**
410
* __dev_remove_pack - remove packet handler
411
* @pt: packet type declaration
412
*
413
* Remove a protocol handler that was previously added to the kernel
414
* protocol handlers by dev_add_pack(). The passed &packet_type is removed
415
* from the kernel lists and can be freed or reused once this function
416
* returns.
417
*
418
* The packet type might still be in use by receivers
419
* and must not be freed until after all the CPU's have gone
420
* through a quiescent state.
421
*/
422
void __dev_remove_pack(struct packet_type *pt)
423
{
424
struct list_head *head = ptype_head(pt);
425
struct packet_type *pt1;
426
427
spin_lock(&ptype_lock);
428
429
list_for_each_entry(pt1, head, list) {
430
if (pt == pt1) {
431
list_del_rcu(&pt->list);
432
goto out;
433
}
434
}
435
436
printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
437
out:
438
spin_unlock(&ptype_lock);
439
}
440
EXPORT_SYMBOL(__dev_remove_pack);
441
442
/**
443
* dev_remove_pack - remove packet handler
444
* @pt: packet type declaration
445
*
446
* Remove a protocol handler that was previously added to the kernel
447
* protocol handlers by dev_add_pack(). The passed &packet_type is removed
448
* from the kernel lists and can be freed or reused once this function
449
* returns.
450
*
451
* This call sleeps to guarantee that no CPU is looking at the packet
452
* type after return.
453
*/
454
void dev_remove_pack(struct packet_type *pt)
455
{
456
__dev_remove_pack(pt);
457
458
synchronize_net();
459
}
460
EXPORT_SYMBOL(dev_remove_pack);
461
462
/******************************************************************************
463
464
Device Boot-time Settings Routines
465
466
*******************************************************************************/
467
468
/* Boot time configuration table */
469
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
470
471
/**
472
* netdev_boot_setup_add - add new setup entry
473
* @name: name of the device
474
* @map: configured settings for the device
475
*
476
* Adds new setup entry to the dev_boot_setup list. The function
477
* returns 0 on error and 1 on success. This is a generic routine to
478
* all netdevices.
479
*/
480
static int netdev_boot_setup_add(char *name, struct ifmap *map)
481
{
482
struct netdev_boot_setup *s;
483
int i;
484
485
s = dev_boot_setup;
486
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487
if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488
memset(s[i].name, 0, sizeof(s[i].name));
489
strlcpy(s[i].name, name, IFNAMSIZ);
490
memcpy(&s[i].map, map, sizeof(s[i].map));
491
break;
492
}
493
}
494
495
return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
496
}
497
498
/**
499
* netdev_boot_setup_check - check boot time settings
500
* @dev: the netdevice
501
*
502
* Check boot time settings for the device.
503
* The found settings are set for the device to be used
504
* later in the device probing.
505
* Returns 0 if no settings found, 1 if they are.
506
*/
507
int netdev_boot_setup_check(struct net_device *dev)
508
{
509
struct netdev_boot_setup *s = dev_boot_setup;
510
int i;
511
512
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513
if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
514
!strcmp(dev->name, s[i].name)) {
515
dev->irq = s[i].map.irq;
516
dev->base_addr = s[i].map.base_addr;
517
dev->mem_start = s[i].map.mem_start;
518
dev->mem_end = s[i].map.mem_end;
519
return 1;
520
}
521
}
522
return 0;
523
}
524
EXPORT_SYMBOL(netdev_boot_setup_check);
525
526
527
/**
528
* netdev_boot_base - get address from boot time settings
529
* @prefix: prefix for network device
530
* @unit: id for network device
531
*
532
* Check boot time settings for the base address of device.
533
* The found settings are set for the device to be used
534
* later in the device probing.
535
* Returns 0 if no settings found.
536
*/
537
unsigned long netdev_boot_base(const char *prefix, int unit)
538
{
539
const struct netdev_boot_setup *s = dev_boot_setup;
540
char name[IFNAMSIZ];
541
int i;
542
543
sprintf(name, "%s%d", prefix, unit);
544
545
/*
546
* If device already registered then return base of 1
547
* to indicate not to probe for this interface
548
*/
549
if (__dev_get_by_name(&init_net, name))
550
return 1;
551
552
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553
if (!strcmp(name, s[i].name))
554
return s[i].map.base_addr;
555
return 0;
556
}
557
558
/*
559
* Saves at boot time configured settings for any netdevice.
560
*/
561
int __init netdev_boot_setup(char *str)
562
{
563
int ints[5];
564
struct ifmap map;
565
566
str = get_options(str, ARRAY_SIZE(ints), ints);
567
if (!str || !*str)
568
return 0;
569
570
/* Save settings */
571
memset(&map, 0, sizeof(map));
572
if (ints[0] > 0)
573
map.irq = ints[1];
574
if (ints[0] > 1)
575
map.base_addr = ints[2];
576
if (ints[0] > 2)
577
map.mem_start = ints[3];
578
if (ints[0] > 3)
579
map.mem_end = ints[4];
580
581
/* Add new entry to the list */
582
return netdev_boot_setup_add(str, &map);
583
}
584
585
__setup("netdev=", netdev_boot_setup);
586
587
/*******************************************************************************
588
589
Device Interface Subroutines
590
591
*******************************************************************************/
592
593
/**
594
* __dev_get_by_name - find a device by its name
595
* @net: the applicable net namespace
596
* @name: name to find
597
*
598
* Find an interface by name. Must be called under RTNL semaphore
599
* or @dev_base_lock. If the name is found a pointer to the device
600
* is returned. If the name is not found then %NULL is returned. The
601
* reference counters are not incremented so the caller must be
602
* careful with locks.
603
*/
604
605
struct net_device *__dev_get_by_name(struct net *net, const char *name)
606
{
607
struct hlist_node *p;
608
struct net_device *dev;
609
struct hlist_head *head = dev_name_hash(net, name);
610
611
hlist_for_each_entry(dev, p, head, name_hlist)
612
if (!strncmp(dev->name, name, IFNAMSIZ))
613
return dev;
614
615
return NULL;
616
}
617
EXPORT_SYMBOL(__dev_get_by_name);
618
619
/**
620
* dev_get_by_name_rcu - find a device by its name
621
* @net: the applicable net namespace
622
* @name: name to find
623
*
624
* Find an interface by name.
625
* If the name is found a pointer to the device is returned.
626
* If the name is not found then %NULL is returned.
627
* The reference counters are not incremented so the caller must be
628
* careful with locks. The caller must hold RCU lock.
629
*/
630
631
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
632
{
633
struct hlist_node *p;
634
struct net_device *dev;
635
struct hlist_head *head = dev_name_hash(net, name);
636
637
hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638
if (!strncmp(dev->name, name, IFNAMSIZ))
639
return dev;
640
641
return NULL;
642
}
643
EXPORT_SYMBOL(dev_get_by_name_rcu);
644
645
/**
646
* dev_get_by_name - find a device by its name
647
* @net: the applicable net namespace
648
* @name: name to find
649
*
650
* Find an interface by name. This can be called from any
651
* context and does its own locking. The returned handle has
652
* the usage count incremented and the caller must use dev_put() to
653
* release it when it is no longer needed. %NULL is returned if no
654
* matching device is found.
655
*/
656
657
struct net_device *dev_get_by_name(struct net *net, const char *name)
658
{
659
struct net_device *dev;
660
661
rcu_read_lock();
662
dev = dev_get_by_name_rcu(net, name);
663
if (dev)
664
dev_hold(dev);
665
rcu_read_unlock();
666
return dev;
667
}
668
EXPORT_SYMBOL(dev_get_by_name);
669
670
/**
671
* __dev_get_by_index - find a device by its ifindex
672
* @net: the applicable net namespace
673
* @ifindex: index of device
674
*
675
* Search for an interface by index. Returns %NULL if the device
676
* is not found or a pointer to the device. The device has not
677
* had its reference counter increased so the caller must be careful
678
* about locking. The caller must hold either the RTNL semaphore
679
* or @dev_base_lock.
680
*/
681
682
struct net_device *__dev_get_by_index(struct net *net, int ifindex)
683
{
684
struct hlist_node *p;
685
struct net_device *dev;
686
struct hlist_head *head = dev_index_hash(net, ifindex);
687
688
hlist_for_each_entry(dev, p, head, index_hlist)
689
if (dev->ifindex == ifindex)
690
return dev;
691
692
return NULL;
693
}
694
EXPORT_SYMBOL(__dev_get_by_index);
695
696
/**
697
* dev_get_by_index_rcu - find a device by its ifindex
698
* @net: the applicable net namespace
699
* @ifindex: index of device
700
*
701
* Search for an interface by index. Returns %NULL if the device
702
* is not found or a pointer to the device. The device has not
703
* had its reference counter increased so the caller must be careful
704
* about locking. The caller must hold RCU lock.
705
*/
706
707
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
708
{
709
struct hlist_node *p;
710
struct net_device *dev;
711
struct hlist_head *head = dev_index_hash(net, ifindex);
712
713
hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714
if (dev->ifindex == ifindex)
715
return dev;
716
717
return NULL;
718
}
719
EXPORT_SYMBOL(dev_get_by_index_rcu);
720
721
722
/**
723
* dev_get_by_index - find a device by its ifindex
724
* @net: the applicable net namespace
725
* @ifindex: index of device
726
*
727
* Search for an interface by index. Returns NULL if the device
728
* is not found or a pointer to the device. The device returned has
729
* had a reference added and the pointer is safe until the user calls
730
* dev_put to indicate they have finished with it.
731
*/
732
733
struct net_device *dev_get_by_index(struct net *net, int ifindex)
734
{
735
struct net_device *dev;
736
737
rcu_read_lock();
738
dev = dev_get_by_index_rcu(net, ifindex);
739
if (dev)
740
dev_hold(dev);
741
rcu_read_unlock();
742
return dev;
743
}
744
EXPORT_SYMBOL(dev_get_by_index);
745
746
/**
747
* dev_getbyhwaddr_rcu - find a device by its hardware address
748
* @net: the applicable net namespace
749
* @type: media type of device
750
* @ha: hardware address
751
*
752
* Search for an interface by MAC address. Returns NULL if the device
753
* is not found or a pointer to the device.
754
* The caller must hold RCU or RTNL.
755
* The returned device has not had its ref count increased
756
* and the caller must therefore be careful about locking
757
*
758
*/
759
760
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
761
const char *ha)
762
{
763
struct net_device *dev;
764
765
for_each_netdev_rcu(net, dev)
766
if (dev->type == type &&
767
!memcmp(dev->dev_addr, ha, dev->addr_len))
768
return dev;
769
770
return NULL;
771
}
772
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
773
774
struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
775
{
776
struct net_device *dev;
777
778
ASSERT_RTNL();
779
for_each_netdev(net, dev)
780
if (dev->type == type)
781
return dev;
782
783
return NULL;
784
}
785
EXPORT_SYMBOL(__dev_getfirstbyhwtype);
786
787
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
788
{
789
struct net_device *dev, *ret = NULL;
790
791
rcu_read_lock();
792
for_each_netdev_rcu(net, dev)
793
if (dev->type == type) {
794
dev_hold(dev);
795
ret = dev;
796
break;
797
}
798
rcu_read_unlock();
799
return ret;
800
}
801
EXPORT_SYMBOL(dev_getfirstbyhwtype);
802
803
/**
804
* dev_get_by_flags_rcu - find any device with given flags
805
* @net: the applicable net namespace
806
* @if_flags: IFF_* values
807
* @mask: bitmask of bits in if_flags to check
808
*
809
* Search for any interface with the given flags. Returns NULL if a device
810
* is not found or a pointer to the device. Must be called inside
811
* rcu_read_lock(), and result refcount is unchanged.
812
*/
813
814
struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
815
unsigned short mask)
816
{
817
struct net_device *dev, *ret;
818
819
ret = NULL;
820
for_each_netdev_rcu(net, dev) {
821
if (((dev->flags ^ if_flags) & mask) == 0) {
822
ret = dev;
823
break;
824
}
825
}
826
return ret;
827
}
828
EXPORT_SYMBOL(dev_get_by_flags_rcu);
829
830
/**
831
* dev_valid_name - check if name is okay for network device
832
* @name: name string
833
*
834
* Network device names need to be valid file names to
835
* to allow sysfs to work. We also disallow any kind of
836
* whitespace.
837
*/
838
int dev_valid_name(const char *name)
839
{
840
if (*name == '\0')
841
return 0;
842
if (strlen(name) >= IFNAMSIZ)
843
return 0;
844
if (!strcmp(name, ".") || !strcmp(name, ".."))
845
return 0;
846
847
while (*name) {
848
if (*name == '/' || isspace(*name))
849
return 0;
850
name++;
851
}
852
return 1;
853
}
854
EXPORT_SYMBOL(dev_valid_name);
855
856
/**
857
* __dev_alloc_name - allocate a name for a device
858
* @net: network namespace to allocate the device name in
859
* @name: name format string
860
* @buf: scratch buffer and result name string
861
*
862
* Passed a format string - eg "lt%d" it will try and find a suitable
863
* id. It scans list of devices to build up a free map, then chooses
864
* the first empty slot. The caller must hold the dev_base or rtnl lock
865
* while allocating the name and adding the device in order to avoid
866
* duplicates.
867
* Limited to bits_per_byte * page size devices (ie 32K on most platforms).
868
* Returns the number of the unit assigned or a negative errno code.
869
*/
870
871
static int __dev_alloc_name(struct net *net, const char *name, char *buf)
872
{
873
int i = 0;
874
const char *p;
875
const int max_netdevices = 8*PAGE_SIZE;
876
unsigned long *inuse;
877
struct net_device *d;
878
879
p = strnchr(name, IFNAMSIZ-1, '%');
880
if (p) {
881
/*
882
* Verify the string as this thing may have come from
883
* the user. There must be either one "%d" and no other "%"
884
* characters.
885
*/
886
if (p[1] != 'd' || strchr(p + 2, '%'))
887
return -EINVAL;
888
889
/* Use one page as a bit array of possible slots */
890
inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
891
if (!inuse)
892
return -ENOMEM;
893
894
for_each_netdev(net, d) {
895
if (!sscanf(d->name, name, &i))
896
continue;
897
if (i < 0 || i >= max_netdevices)
898
continue;
899
900
/* avoid cases where sscanf is not exact inverse of printf */
901
snprintf(buf, IFNAMSIZ, name, i);
902
if (!strncmp(buf, d->name, IFNAMSIZ))
903
set_bit(i, inuse);
904
}
905
906
i = find_first_zero_bit(inuse, max_netdevices);
907
free_page((unsigned long) inuse);
908
}
909
910
if (buf != name)
911
snprintf(buf, IFNAMSIZ, name, i);
912
if (!__dev_get_by_name(net, buf))
913
return i;
914
915
/* It is possible to run out of possible slots
916
* when the name is long and there isn't enough space left
917
* for the digits, or if all bits are used.
918
*/
919
return -ENFILE;
920
}
921
922
/**
923
* dev_alloc_name - allocate a name for a device
924
* @dev: device
925
* @name: name format string
926
*
927
* Passed a format string - eg "lt%d" it will try and find a suitable
928
* id. It scans list of devices to build up a free map, then chooses
929
* the first empty slot. The caller must hold the dev_base or rtnl lock
930
* while allocating the name and adding the device in order to avoid
931
* duplicates.
932
* Limited to bits_per_byte * page size devices (ie 32K on most platforms).
933
* Returns the number of the unit assigned or a negative errno code.
934
*/
935
936
int dev_alloc_name(struct net_device *dev, const char *name)
937
{
938
char buf[IFNAMSIZ];
939
struct net *net;
940
int ret;
941
942
BUG_ON(!dev_net(dev));
943
net = dev_net(dev);
944
ret = __dev_alloc_name(net, name, buf);
945
if (ret >= 0)
946
strlcpy(dev->name, buf, IFNAMSIZ);
947
return ret;
948
}
949
EXPORT_SYMBOL(dev_alloc_name);
950
951
static int dev_get_valid_name(struct net_device *dev, const char *name)
952
{
953
struct net *net;
954
955
BUG_ON(!dev_net(dev));
956
net = dev_net(dev);
957
958
if (!dev_valid_name(name))
959
return -EINVAL;
960
961
if (strchr(name, '%'))
962
return dev_alloc_name(dev, name);
963
else if (__dev_get_by_name(net, name))
964
return -EEXIST;
965
else if (dev->name != name)
966
strlcpy(dev->name, name, IFNAMSIZ);
967
968
return 0;
969
}
970
971
/**
972
* dev_change_name - change name of a device
973
* @dev: device
974
* @newname: name (or format string) must be at least IFNAMSIZ
975
*
976
* Change name of a device, can pass format strings "eth%d".
977
* for wildcarding.
978
*/
979
int dev_change_name(struct net_device *dev, const char *newname)
980
{
981
char oldname[IFNAMSIZ];
982
int err = 0;
983
int ret;
984
struct net *net;
985
986
ASSERT_RTNL();
987
BUG_ON(!dev_net(dev));
988
989
net = dev_net(dev);
990
if (dev->flags & IFF_UP)
991
return -EBUSY;
992
993
if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
994
return 0;
995
996
memcpy(oldname, dev->name, IFNAMSIZ);
997
998
err = dev_get_valid_name(dev, newname);
999
if (err < 0)
1000
return err;
1001
1002
rollback:
1003
ret = device_rename(&dev->dev, dev->name);
1004
if (ret) {
1005
memcpy(dev->name, oldname, IFNAMSIZ);
1006
return ret;
1007
}
1008
1009
write_lock_bh(&dev_base_lock);
1010
hlist_del_rcu(&dev->name_hlist);
1011
write_unlock_bh(&dev_base_lock);
1012
1013
synchronize_rcu();
1014
1015
write_lock_bh(&dev_base_lock);
1016
hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017
write_unlock_bh(&dev_base_lock);
1018
1019
ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020
ret = notifier_to_errno(ret);
1021
1022
if (ret) {
1023
/* err >= 0 after dev_alloc_name() or stores the first errno */
1024
if (err >= 0) {
1025
err = ret;
1026
memcpy(dev->name, oldname, IFNAMSIZ);
1027
goto rollback;
1028
} else {
1029
printk(KERN_ERR
1030
"%s: name change rollback failed: %d.\n",
1031
dev->name, ret);
1032
}
1033
}
1034
1035
return err;
1036
}
1037
1038
/**
1039
* dev_set_alias - change ifalias of a device
1040
* @dev: device
1041
* @alias: name up to IFALIASZ
1042
* @len: limit of bytes to copy from info
1043
*
1044
* Set ifalias for a device,
1045
*/
1046
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047
{
1048
ASSERT_RTNL();
1049
1050
if (len >= IFALIASZ)
1051
return -EINVAL;
1052
1053
if (!len) {
1054
if (dev->ifalias) {
1055
kfree(dev->ifalias);
1056
dev->ifalias = NULL;
1057
}
1058
return 0;
1059
}
1060
1061
dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062
if (!dev->ifalias)
1063
return -ENOMEM;
1064
1065
strlcpy(dev->ifalias, alias, len+1);
1066
return len;
1067
}
1068
1069
1070
/**
1071
* netdev_features_change - device changes features
1072
* @dev: device to cause notification
1073
*
1074
* Called to indicate a device has changed features.
1075
*/
1076
void netdev_features_change(struct net_device *dev)
1077
{
1078
call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079
}
1080
EXPORT_SYMBOL(netdev_features_change);
1081
1082
/**
1083
* netdev_state_change - device changes state
1084
* @dev: device to cause notification
1085
*
1086
* Called to indicate a device has changed state. This function calls
1087
* the notifier chains for netdev_chain and sends a NEWLINK message
1088
* to the routing socket.
1089
*/
1090
void netdev_state_change(struct net_device *dev)
1091
{
1092
if (dev->flags & IFF_UP) {
1093
call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094
rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095
}
1096
}
1097
EXPORT_SYMBOL(netdev_state_change);
1098
1099
int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100
{
1101
return call_netdevice_notifiers(event, dev);
1102
}
1103
EXPORT_SYMBOL(netdev_bonding_change);
1104
1105
/**
1106
* dev_load - load a network module
1107
* @net: the applicable net namespace
1108
* @name: name of interface
1109
*
1110
* If a network interface is not present and the process has suitable
1111
* privileges this function loads the module. If module loading is not
1112
* available in this kernel then it becomes a nop.
1113
*/
1114
1115
void dev_load(struct net *net, const char *name)
1116
{
1117
struct net_device *dev;
1118
int no_module;
1119
1120
rcu_read_lock();
1121
dev = dev_get_by_name_rcu(net, name);
1122
rcu_read_unlock();
1123
1124
no_module = !dev;
1125
if (no_module && capable(CAP_NET_ADMIN))
1126
no_module = request_module("netdev-%s", name);
1127
if (no_module && capable(CAP_SYS_MODULE)) {
1128
if (!request_module("%s", name))
1129
pr_err("Loading kernel module for a network device "
1130
"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s "
1131
"instead\n", name);
1132
}
1133
}
1134
EXPORT_SYMBOL(dev_load);
1135
1136
static int __dev_open(struct net_device *dev)
1137
{
1138
const struct net_device_ops *ops = dev->netdev_ops;
1139
int ret;
1140
1141
ASSERT_RTNL();
1142
1143
if (!netif_device_present(dev))
1144
return -ENODEV;
1145
1146
ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1147
ret = notifier_to_errno(ret);
1148
if (ret)
1149
return ret;
1150
1151
set_bit(__LINK_STATE_START, &dev->state);
1152
1153
if (ops->ndo_validate_addr)
1154
ret = ops->ndo_validate_addr(dev);
1155
1156
if (!ret && ops->ndo_open)
1157
ret = ops->ndo_open(dev);
1158
1159
if (ret)
1160
clear_bit(__LINK_STATE_START, &dev->state);
1161
else {
1162
dev->flags |= IFF_UP;
1163
net_dmaengine_get();
1164
dev_set_rx_mode(dev);
1165
dev_activate(dev);
1166
}
1167
1168
return ret;
1169
}
1170
1171
/**
1172
* dev_open - prepare an interface for use.
1173
* @dev: device to open
1174
*
1175
* Takes a device from down to up state. The device's private open
1176
* function is invoked and then the multicast lists are loaded. Finally
1177
* the device is moved into the up state and a %NETDEV_UP message is
1178
* sent to the netdev notifier chain.
1179
*
1180
* Calling this function on an active interface is a nop. On a failure
1181
* a negative errno code is returned.
1182
*/
1183
int dev_open(struct net_device *dev)
1184
{
1185
int ret;
1186
1187
if (dev->flags & IFF_UP)
1188
return 0;
1189
1190
ret = __dev_open(dev);
1191
if (ret < 0)
1192
return ret;
1193
1194
rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1195
call_netdevice_notifiers(NETDEV_UP, dev);
1196
1197
return ret;
1198
}
1199
EXPORT_SYMBOL(dev_open);
1200
1201
static int __dev_close_many(struct list_head *head)
1202
{
1203
struct net_device *dev;
1204
1205
ASSERT_RTNL();
1206
might_sleep();
1207
1208
list_for_each_entry(dev, head, unreg_list) {
1209
call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1210
1211
clear_bit(__LINK_STATE_START, &dev->state);
1212
1213
/* Synchronize to scheduled poll. We cannot touch poll list, it
1214
* can be even on different cpu. So just clear netif_running().
1215
*
1216
* dev->stop() will invoke napi_disable() on all of it's
1217
* napi_struct instances on this device.
1218
*/
1219
smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220
}
1221
1222
dev_deactivate_many(head);
1223
1224
list_for_each_entry(dev, head, unreg_list) {
1225
const struct net_device_ops *ops = dev->netdev_ops;
1226
1227
/*
1228
* Call the device specific close. This cannot fail.
1229
* Only if device is UP
1230
*
1231
* We allow it to be called even after a DETACH hot-plug
1232
* event.
1233
*/
1234
if (ops->ndo_stop)
1235
ops->ndo_stop(dev);
1236
1237
dev->flags &= ~IFF_UP;
1238
net_dmaengine_put();
1239
}
1240
1241
return 0;
1242
}
1243
1244
static int __dev_close(struct net_device *dev)
1245
{
1246
int retval;
1247
LIST_HEAD(single);
1248
1249
list_add(&dev->unreg_list, &single);
1250
retval = __dev_close_many(&single);
1251
list_del(&single);
1252
return retval;
1253
}
1254
1255
static int dev_close_many(struct list_head *head)
1256
{
1257
struct net_device *dev, *tmp;
1258
LIST_HEAD(tmp_list);
1259
1260
list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261
if (!(dev->flags & IFF_UP))
1262
list_move(&dev->unreg_list, &tmp_list);
1263
1264
__dev_close_many(head);
1265
1266
list_for_each_entry(dev, head, unreg_list) {
1267
rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268
call_netdevice_notifiers(NETDEV_DOWN, dev);
1269
}
1270
1271
/* rollback_registered_many needs the complete original list */
1272
list_splice(&tmp_list, head);
1273
return 0;
1274
}
1275
1276
/**
1277
* dev_close - shutdown an interface.
1278
* @dev: device to shutdown
1279
*
1280
* This function moves an active device into down state. A
1281
* %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282
* is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1283
* chain.
1284
*/
1285
int dev_close(struct net_device *dev)
1286
{
1287
if (dev->flags & IFF_UP) {
1288
LIST_HEAD(single);
1289
1290
list_add(&dev->unreg_list, &single);
1291
dev_close_many(&single);
1292
list_del(&single);
1293
}
1294
return 0;
1295
}
1296
EXPORT_SYMBOL(dev_close);
1297
1298
1299
/**
1300
* dev_disable_lro - disable Large Receive Offload on a device
1301
* @dev: device
1302
*
1303
* Disable Large Receive Offload (LRO) on a net device. Must be
1304
* called under RTNL. This is needed if received packets may be
1305
* forwarded to another interface.
1306
*/
1307
void dev_disable_lro(struct net_device *dev)
1308
{
1309
u32 flags;
1310
1311
/*
1312
* If we're trying to disable lro on a vlan device
1313
* use the underlying physical device instead
1314
*/
1315
if (is_vlan_dev(dev))
1316
dev = vlan_dev_real_dev(dev);
1317
1318
if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1319
flags = dev->ethtool_ops->get_flags(dev);
1320
else
1321
flags = ethtool_op_get_flags(dev);
1322
1323
if (!(flags & ETH_FLAG_LRO))
1324
return;
1325
1326
__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1327
if (unlikely(dev->features & NETIF_F_LRO))
1328
netdev_WARN(dev, "failed to disable LRO!\n");
1329
}
1330
EXPORT_SYMBOL(dev_disable_lro);
1331
1332
1333
static int dev_boot_phase = 1;
1334
1335
/**
1336
* register_netdevice_notifier - register a network notifier block
1337
* @nb: notifier
1338
*
1339
* Register a notifier to be called when network device events occur.
1340
* The notifier passed is linked into the kernel structures and must
1341
* not be reused until it has been unregistered. A negative errno code
1342
* is returned on a failure.
1343
*
1344
* When registered all registration and up events are replayed
1345
* to the new notifier to allow device to have a race free
1346
* view of the network device list.
1347
*/
1348
1349
int register_netdevice_notifier(struct notifier_block *nb)
1350
{
1351
struct net_device *dev;
1352
struct net_device *last;
1353
struct net *net;
1354
int err;
1355
1356
rtnl_lock();
1357
err = raw_notifier_chain_register(&netdev_chain, nb);
1358
if (err)
1359
goto unlock;
1360
if (dev_boot_phase)
1361
goto unlock;
1362
for_each_net(net) {
1363
for_each_netdev(net, dev) {
1364
err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1365
err = notifier_to_errno(err);
1366
if (err)
1367
goto rollback;
1368
1369
if (!(dev->flags & IFF_UP))
1370
continue;
1371
1372
nb->notifier_call(nb, NETDEV_UP, dev);
1373
}
1374
}
1375
1376
unlock:
1377
rtnl_unlock();
1378
return err;
1379
1380
rollback:
1381
last = dev;
1382
for_each_net(net) {
1383
for_each_netdev(net, dev) {
1384
if (dev == last)
1385
break;
1386
1387
if (dev->flags & IFF_UP) {
1388
nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1389
nb->notifier_call(nb, NETDEV_DOWN, dev);
1390
}
1391
nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1392
nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1393
}
1394
}
1395
1396
raw_notifier_chain_unregister(&netdev_chain, nb);
1397
goto unlock;
1398
}
1399
EXPORT_SYMBOL(register_netdevice_notifier);
1400
1401
/**
1402
* unregister_netdevice_notifier - unregister a network notifier block
1403
* @nb: notifier
1404
*
1405
* Unregister a notifier previously registered by
1406
* register_netdevice_notifier(). The notifier is unlinked into the
1407
* kernel structures and may then be reused. A negative errno code
1408
* is returned on a failure.
1409
*/
1410
1411
int unregister_netdevice_notifier(struct notifier_block *nb)
1412
{
1413
int err;
1414
1415
rtnl_lock();
1416
err = raw_notifier_chain_unregister(&netdev_chain, nb);
1417
rtnl_unlock();
1418
return err;
1419
}
1420
EXPORT_SYMBOL(unregister_netdevice_notifier);
1421
1422
/**
1423
* call_netdevice_notifiers - call all network notifier blocks
1424
* @val: value passed unmodified to notifier function
1425
* @dev: net_device pointer passed unmodified to notifier function
1426
*
1427
* Call all network notifier blocks. Parameters and return value
1428
* are as for raw_notifier_call_chain().
1429
*/
1430
1431
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1432
{
1433
ASSERT_RTNL();
1434
return raw_notifier_call_chain(&netdev_chain, val, dev);
1435
}
1436
EXPORT_SYMBOL(call_netdevice_notifiers);
1437
1438
/* When > 0 there are consumers of rx skb time stamps */
1439
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1440
1441
void net_enable_timestamp(void)
1442
{
1443
atomic_inc(&netstamp_needed);
1444
}
1445
EXPORT_SYMBOL(net_enable_timestamp);
1446
1447
void net_disable_timestamp(void)
1448
{
1449
atomic_dec(&netstamp_needed);
1450
}
1451
EXPORT_SYMBOL(net_disable_timestamp);
1452
1453
static inline void net_timestamp_set(struct sk_buff *skb)
1454
{
1455
if (atomic_read(&netstamp_needed))
1456
__net_timestamp(skb);
1457
else
1458
skb->tstamp.tv64 = 0;
1459
}
1460
1461
static inline void net_timestamp_check(struct sk_buff *skb)
1462
{
1463
if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1464
__net_timestamp(skb);
1465
}
1466
1467
static inline bool is_skb_forwardable(struct net_device *dev,
1468
struct sk_buff *skb)
1469
{
1470
unsigned int len;
1471
1472
if (!(dev->flags & IFF_UP))
1473
return false;
1474
1475
len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1476
if (skb->len <= len)
1477
return true;
1478
1479
/* if TSO is enabled, we don't care about the length as the packet
1480
* could be forwarded without being segmented before
1481
*/
1482
if (skb_is_gso(skb))
1483
return true;
1484
1485
return false;
1486
}
1487
1488
/**
1489
* dev_forward_skb - loopback an skb to another netif
1490
*
1491
* @dev: destination network device
1492
* @skb: buffer to forward
1493
*
1494
* return values:
1495
* NET_RX_SUCCESS (no congestion)
1496
* NET_RX_DROP (packet was dropped, but freed)
1497
*
1498
* dev_forward_skb can be used for injecting an skb from the
1499
* start_xmit function of one device into the receive queue
1500
* of another device.
1501
*
1502
* The receiving device may be in another namespace, so
1503
* we have to clear all information in the skb that could
1504
* impact namespace isolation.
1505
*/
1506
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1507
{
1508
skb_orphan(skb);
1509
nf_reset(skb);
1510
1511
if (unlikely(!is_skb_forwardable(dev, skb))) {
1512
atomic_long_inc(&dev->rx_dropped);
1513
kfree_skb(skb);
1514
return NET_RX_DROP;
1515
}
1516
skb_set_dev(skb, dev);
1517
skb->tstamp.tv64 = 0;
1518
skb->pkt_type = PACKET_HOST;
1519
skb->protocol = eth_type_trans(skb, dev);
1520
return netif_rx(skb);
1521
}
1522
EXPORT_SYMBOL_GPL(dev_forward_skb);
1523
1524
static inline int deliver_skb(struct sk_buff *skb,
1525
struct packet_type *pt_prev,
1526
struct net_device *orig_dev)
1527
{
1528
atomic_inc(&skb->users);
1529
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1530
}
1531
1532
/*
1533
* Support routine. Sends outgoing frames to any network
1534
* taps currently in use.
1535
*/
1536
1537
static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1538
{
1539
struct packet_type *ptype;
1540
struct sk_buff *skb2 = NULL;
1541
struct packet_type *pt_prev = NULL;
1542
1543
rcu_read_lock();
1544
list_for_each_entry_rcu(ptype, &ptype_all, list) {
1545
/* Never send packets back to the socket
1546
* they originated from - MvS ([email protected])
1547
*/
1548
if ((ptype->dev == dev || !ptype->dev) &&
1549
(ptype->af_packet_priv == NULL ||
1550
(struct sock *)ptype->af_packet_priv != skb->sk)) {
1551
if (pt_prev) {
1552
deliver_skb(skb2, pt_prev, skb->dev);
1553
pt_prev = ptype;
1554
continue;
1555
}
1556
1557
skb2 = skb_clone(skb, GFP_ATOMIC);
1558
if (!skb2)
1559
break;
1560
1561
net_timestamp_set(skb2);
1562
1563
/* skb->nh should be correctly
1564
set by sender, so that the second statement is
1565
just protection against buggy protocols.
1566
*/
1567
skb_reset_mac_header(skb2);
1568
1569
if (skb_network_header(skb2) < skb2->data ||
1570
skb2->network_header > skb2->tail) {
1571
if (net_ratelimit())
1572
printk(KERN_CRIT "protocol %04x is "
1573
"buggy, dev %s\n",
1574
ntohs(skb2->protocol),
1575
dev->name);
1576
skb_reset_network_header(skb2);
1577
}
1578
1579
skb2->transport_header = skb2->network_header;
1580
skb2->pkt_type = PACKET_OUTGOING;
1581
pt_prev = ptype;
1582
}
1583
}
1584
if (pt_prev)
1585
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1586
rcu_read_unlock();
1587
}
1588
1589
/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1590
* @dev: Network device
1591
* @txq: number of queues available
1592
*
1593
* If real_num_tx_queues is changed the tc mappings may no longer be
1594
* valid. To resolve this verify the tc mapping remains valid and if
1595
* not NULL the mapping. With no priorities mapping to this
1596
* offset/count pair it will no longer be used. In the worst case TC0
1597
* is invalid nothing can be done so disable priority mappings. If is
1598
* expected that drivers will fix this mapping if they can before
1599
* calling netif_set_real_num_tx_queues.
1600
*/
1601
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1602
{
1603
int i;
1604
struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1605
1606
/* If TC0 is invalidated disable TC mapping */
1607
if (tc->offset + tc->count > txq) {
1608
pr_warning("Number of in use tx queues changed "
1609
"invalidating tc mappings. Priority "
1610
"traffic classification disabled!\n");
1611
dev->num_tc = 0;
1612
return;
1613
}
1614
1615
/* Invalidated prio to tc mappings set to TC0 */
1616
for (i = 1; i < TC_BITMASK + 1; i++) {
1617
int q = netdev_get_prio_tc_map(dev, i);
1618
1619
tc = &dev->tc_to_txq[q];
1620
if (tc->offset + tc->count > txq) {
1621
pr_warning("Number of in use tx queues "
1622
"changed. Priority %i to tc "
1623
"mapping %i is no longer valid "
1624
"setting map to 0\n",
1625
i, q);
1626
netdev_set_prio_tc_map(dev, i, 0);
1627
}
1628
}
1629
}
1630
1631
/*
1632
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1633
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1634
*/
1635
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1636
{
1637
int rc;
1638
1639
if (txq < 1 || txq > dev->num_tx_queues)
1640
return -EINVAL;
1641
1642
if (dev->reg_state == NETREG_REGISTERED ||
1643
dev->reg_state == NETREG_UNREGISTERING) {
1644
ASSERT_RTNL();
1645
1646
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1647
txq);
1648
if (rc)
1649
return rc;
1650
1651
if (dev->num_tc)
1652
netif_setup_tc(dev, txq);
1653
1654
if (txq < dev->real_num_tx_queues)
1655
qdisc_reset_all_tx_gt(dev, txq);
1656
}
1657
1658
dev->real_num_tx_queues = txq;
1659
return 0;
1660
}
1661
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1662
1663
#ifdef CONFIG_RPS
1664
/**
1665
* netif_set_real_num_rx_queues - set actual number of RX queues used
1666
* @dev: Network device
1667
* @rxq: Actual number of RX queues
1668
*
1669
* This must be called either with the rtnl_lock held or before
1670
* registration of the net device. Returns 0 on success, or a
1671
* negative error code. If called before registration, it always
1672
* succeeds.
1673
*/
1674
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1675
{
1676
int rc;
1677
1678
if (rxq < 1 || rxq > dev->num_rx_queues)
1679
return -EINVAL;
1680
1681
if (dev->reg_state == NETREG_REGISTERED) {
1682
ASSERT_RTNL();
1683
1684
rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1685
rxq);
1686
if (rc)
1687
return rc;
1688
}
1689
1690
dev->real_num_rx_queues = rxq;
1691
return 0;
1692
}
1693
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1694
#endif
1695
1696
static inline void __netif_reschedule(struct Qdisc *q)
1697
{
1698
struct softnet_data *sd;
1699
unsigned long flags;
1700
1701
local_irq_save(flags);
1702
sd = &__get_cpu_var(softnet_data);
1703
q->next_sched = NULL;
1704
*sd->output_queue_tailp = q;
1705
sd->output_queue_tailp = &q->next_sched;
1706
raise_softirq_irqoff(NET_TX_SOFTIRQ);
1707
local_irq_restore(flags);
1708
}
1709
1710
void __netif_schedule(struct Qdisc *q)
1711
{
1712
if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1713
__netif_reschedule(q);
1714
}
1715
EXPORT_SYMBOL(__netif_schedule);
1716
1717
void dev_kfree_skb_irq(struct sk_buff *skb)
1718
{
1719
if (atomic_dec_and_test(&skb->users)) {
1720
struct softnet_data *sd;
1721
unsigned long flags;
1722
1723
local_irq_save(flags);
1724
sd = &__get_cpu_var(softnet_data);
1725
skb->next = sd->completion_queue;
1726
sd->completion_queue = skb;
1727
raise_softirq_irqoff(NET_TX_SOFTIRQ);
1728
local_irq_restore(flags);
1729
}
1730
}
1731
EXPORT_SYMBOL(dev_kfree_skb_irq);
1732
1733
void dev_kfree_skb_any(struct sk_buff *skb)
1734
{
1735
if (in_irq() || irqs_disabled())
1736
dev_kfree_skb_irq(skb);
1737
else
1738
dev_kfree_skb(skb);
1739
}
1740
EXPORT_SYMBOL(dev_kfree_skb_any);
1741
1742
1743
/**
1744
* netif_device_detach - mark device as removed
1745
* @dev: network device
1746
*
1747
* Mark device as removed from system and therefore no longer available.
1748
*/
1749
void netif_device_detach(struct net_device *dev)
1750
{
1751
if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1752
netif_running(dev)) {
1753
netif_tx_stop_all_queues(dev);
1754
}
1755
}
1756
EXPORT_SYMBOL(netif_device_detach);
1757
1758
/**
1759
* netif_device_attach - mark device as attached
1760
* @dev: network device
1761
*
1762
* Mark device as attached from system and restart if needed.
1763
*/
1764
void netif_device_attach(struct net_device *dev)
1765
{
1766
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1767
netif_running(dev)) {
1768
netif_tx_wake_all_queues(dev);
1769
__netdev_watchdog_up(dev);
1770
}
1771
}
1772
EXPORT_SYMBOL(netif_device_attach);
1773
1774
/**
1775
* skb_dev_set -- assign a new device to a buffer
1776
* @skb: buffer for the new device
1777
* @dev: network device
1778
*
1779
* If an skb is owned by a device already, we have to reset
1780
* all data private to the namespace a device belongs to
1781
* before assigning it a new device.
1782
*/
1783
#ifdef CONFIG_NET_NS
1784
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1785
{
1786
skb_dst_drop(skb);
1787
if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1788
secpath_reset(skb);
1789
nf_reset(skb);
1790
skb_init_secmark(skb);
1791
skb->mark = 0;
1792
skb->priority = 0;
1793
skb->nf_trace = 0;
1794
skb->ipvs_property = 0;
1795
#ifdef CONFIG_NET_SCHED
1796
skb->tc_index = 0;
1797
#endif
1798
}
1799
skb->dev = dev;
1800
}
1801
EXPORT_SYMBOL(skb_set_dev);
1802
#endif /* CONFIG_NET_NS */
1803
1804
/*
1805
* Invalidate hardware checksum when packet is to be mangled, and
1806
* complete checksum manually on outgoing path.
1807
*/
1808
int skb_checksum_help(struct sk_buff *skb)
1809
{
1810
__wsum csum;
1811
int ret = 0, offset;
1812
1813
if (skb->ip_summed == CHECKSUM_COMPLETE)
1814
goto out_set_summed;
1815
1816
if (unlikely(skb_shinfo(skb)->gso_size)) {
1817
/* Let GSO fix up the checksum. */
1818
goto out_set_summed;
1819
}
1820
1821
offset = skb_checksum_start_offset(skb);
1822
BUG_ON(offset >= skb_headlen(skb));
1823
csum = skb_checksum(skb, offset, skb->len - offset, 0);
1824
1825
offset += skb->csum_offset;
1826
BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1827
1828
if (skb_cloned(skb) &&
1829
!skb_clone_writable(skb, offset + sizeof(__sum16))) {
1830
ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1831
if (ret)
1832
goto out;
1833
}
1834
1835
*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1836
out_set_summed:
1837
skb->ip_summed = CHECKSUM_NONE;
1838
out:
1839
return ret;
1840
}
1841
EXPORT_SYMBOL(skb_checksum_help);
1842
1843
/**
1844
* skb_gso_segment - Perform segmentation on skb.
1845
* @skb: buffer to segment
1846
* @features: features for the output path (see dev->features)
1847
*
1848
* This function segments the given skb and returns a list of segments.
1849
*
1850
* It may return NULL if the skb requires no segmentation. This is
1851
* only possible when GSO is used for verifying header integrity.
1852
*/
1853
struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1854
{
1855
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1856
struct packet_type *ptype;
1857
__be16 type = skb->protocol;
1858
int vlan_depth = ETH_HLEN;
1859
int err;
1860
1861
while (type == htons(ETH_P_8021Q)) {
1862
struct vlan_hdr *vh;
1863
1864
if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1865
return ERR_PTR(-EINVAL);
1866
1867
vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1868
type = vh->h_vlan_encapsulated_proto;
1869
vlan_depth += VLAN_HLEN;
1870
}
1871
1872
skb_reset_mac_header(skb);
1873
skb->mac_len = skb->network_header - skb->mac_header;
1874
__skb_pull(skb, skb->mac_len);
1875
1876
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1877
struct net_device *dev = skb->dev;
1878
struct ethtool_drvinfo info = {};
1879
1880
if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1881
dev->ethtool_ops->get_drvinfo(dev, &info);
1882
1883
WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1884
info.driver, dev ? dev->features : 0L,
1885
skb->sk ? skb->sk->sk_route_caps : 0L,
1886
skb->len, skb->data_len, skb->ip_summed);
1887
1888
if (skb_header_cloned(skb) &&
1889
(err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1890
return ERR_PTR(err);
1891
}
1892
1893
rcu_read_lock();
1894
list_for_each_entry_rcu(ptype,
1895
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1896
if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1897
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1898
err = ptype->gso_send_check(skb);
1899
segs = ERR_PTR(err);
1900
if (err || skb_gso_ok(skb, features))
1901
break;
1902
__skb_push(skb, (skb->data -
1903
skb_network_header(skb)));
1904
}
1905
segs = ptype->gso_segment(skb, features);
1906
break;
1907
}
1908
}
1909
rcu_read_unlock();
1910
1911
__skb_push(skb, skb->data - skb_mac_header(skb));
1912
1913
return segs;
1914
}
1915
EXPORT_SYMBOL(skb_gso_segment);
1916
1917
/* Take action when hardware reception checksum errors are detected. */
1918
#ifdef CONFIG_BUG
1919
void netdev_rx_csum_fault(struct net_device *dev)
1920
{
1921
if (net_ratelimit()) {
1922
printk(KERN_ERR "%s: hw csum failure.\n",
1923
dev ? dev->name : "<unknown>");
1924
dump_stack();
1925
}
1926
}
1927
EXPORT_SYMBOL(netdev_rx_csum_fault);
1928
#endif
1929
1930
/* Actually, we should eliminate this check as soon as we know that:
 * 1. The IOMMU is present and can map all of the memory.
 * 2. No high memory really exists on this machine.
 */
1934
1935
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1936
{
1937
#ifdef CONFIG_HIGHMEM
1938
int i;
1939
if (!(dev->features & NETIF_F_HIGHDMA)) {
1940
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1941
if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1942
return 1;
1943
}
1944
1945
if (PCI_DMA_BUS_IS_PHYS) {
1946
struct device *pdev = dev->dev.parent;
1947
1948
if (!pdev)
1949
return 0;
1950
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1951
dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1952
if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1953
return 1;
1954
}
1955
}
1956
#endif
1957
return 0;
1958
}
1959
1960
struct dev_gso_cb {
1961
void (*destructor)(struct sk_buff *skb);
1962
};
1963
1964
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1965
1966
static void dev_gso_skb_destructor(struct sk_buff *skb)
1967
{
1968
struct dev_gso_cb *cb;
1969
1970
do {
1971
struct sk_buff *nskb = skb->next;
1972
1973
skb->next = nskb->next;
1974
nskb->next = NULL;
1975
kfree_skb(nskb);
1976
} while (skb->next);
1977
1978
cb = DEV_GSO_CB(skb);
1979
if (cb->destructor)
1980
cb->destructor(skb);
1981
}
1982
1983
/**
1984
* dev_gso_segment - Perform emulated hardware segmentation on skb.
1985
* @skb: buffer to segment
1986
* @features: device features as applicable to this skb
1987
*
1988
* This function segments the given skb and stores the list of segments
1989
* in skb->next.
1990
*/
1991
static int dev_gso_segment(struct sk_buff *skb, int features)
1992
{
1993
struct sk_buff *segs;
1994
1995
segs = skb_gso_segment(skb, features);
1996
1997
/* Verifying header integrity only. */
1998
if (!segs)
1999
return 0;
2000
2001
if (IS_ERR(segs))
2002
return PTR_ERR(segs);
2003
2004
skb->next = segs;
2005
DEV_GSO_CB(skb)->destructor = skb->destructor;
2006
skb->destructor = dev_gso_skb_destructor;
2007
2008
return 0;
2009
}
2010
2011
/*
 * Try to orphan skb early, right before transmission by the device.
 * We cannot orphan skb if tx timestamp is requested or the sk-reference
 * is needed at the driver level for other reasons, e.g. see net/can/raw.c
 */
static inline void skb_orphan_try(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (sk && !skb_shinfo(skb)->tx_flags) {
		/* skb_tx_hash() won't be able to get sk.
		 * We copy sk_hash into skb->rxhash
		 */
		if (!skb->rxhash)
			skb->rxhash = sk->sk_hash;
		skb_orphan(skb);
	}
}
2029
2030
static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2031
{
2032
return ((features & NETIF_F_GEN_CSUM) ||
2033
((features & NETIF_F_V4_CSUM) &&
2034
protocol == htons(ETH_P_IP)) ||
2035
((features & NETIF_F_V6_CSUM) &&
2036
protocol == htons(ETH_P_IPV6)) ||
2037
((features & NETIF_F_FCOE_CRC) &&
2038
protocol == htons(ETH_P_FCOE)));
2039
}
2040
2041
static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2042
{
2043
if (!can_checksum_protocol(features, protocol)) {
2044
features &= ~NETIF_F_ALL_CSUM;
2045
features &= ~NETIF_F_SG;
2046
} else if (illegal_highdma(skb->dev, skb)) {
2047
features &= ~NETIF_F_SG;
2048
}
2049
2050
return features;
2051
}
2052
2053
u32 netif_skb_features(struct sk_buff *skb)
2054
{
2055
__be16 protocol = skb->protocol;
2056
u32 features = skb->dev->features;
2057
2058
if (protocol == htons(ETH_P_8021Q)) {
2059
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2060
protocol = veh->h_vlan_encapsulated_proto;
2061
} else if (!vlan_tx_tag_present(skb)) {
2062
return harmonize_features(skb, protocol, features);
2063
}
2064
2065
features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2066
2067
if (protocol != htons(ETH_P_8021Q)) {
2068
return harmonize_features(skb, protocol, features);
2069
} else {
2070
features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2071
NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2072
return harmonize_features(skb, protocol, features);
2073
}
2074
}
2075
EXPORT_SYMBOL(netif_skb_features);
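/*
 * Illustrative sketch (not part of the original file): how the transmit path
 * uses the per-skb feature mask.  netif_skb_features() narrows dev->features
 * for this particular packet, and the result decides whether GSO must be
 * emulated and whether the checksum has to be completed in software, which is
 * exactly what dev_hard_start_xmit() checks below.  The helper name is
 * hypothetical.
 */
static inline bool example_needs_sw_help(struct sk_buff *skb)
{
	u32 features = netif_skb_features(skb);

	return netif_needs_gso(skb, features) ||
	       (skb->ip_summed == CHECKSUM_PARTIAL &&
		!(features & NETIF_F_ALL_CSUM));
}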
2076
2077
/*
 * Returns true if either:
 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
 * 2. skb is fragmented and the device does not support SG, or if
 *    at least one of the fragments is in highmem and the device does
 *    not support DMA from it.
 */
2084
static inline int skb_needs_linearize(struct sk_buff *skb,
2085
int features)
2086
{
2087
return skb_is_nonlinear(skb) &&
2088
((skb_has_frag_list(skb) &&
2089
!(features & NETIF_F_FRAGLIST)) ||
2090
(skb_shinfo(skb)->nr_frags &&
2091
!(features & NETIF_F_SG)));
2092
}
2093
2094
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2095
struct netdev_queue *txq)
2096
{
2097
const struct net_device_ops *ops = dev->netdev_ops;
2098
int rc = NETDEV_TX_OK;
2099
unsigned int skb_len;
2100
2101
if (likely(!skb->next)) {
2102
u32 features;
2103
2104
		/*
		 * If the device doesn't need skb->dst, release it right now
		 * while it's hot in this CPU's cache.
		 */
2108
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2109
skb_dst_drop(skb);
2110
2111
if (!list_empty(&ptype_all))
2112
dev_queue_xmit_nit(skb, dev);
2113
2114
skb_orphan_try(skb);
2115
2116
features = netif_skb_features(skb);
2117
2118
if (vlan_tx_tag_present(skb) &&
2119
!(features & NETIF_F_HW_VLAN_TX)) {
2120
skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2121
if (unlikely(!skb))
2122
goto out;
2123
2124
skb->vlan_tci = 0;
2125
}
2126
2127
if (netif_needs_gso(skb, features)) {
2128
if (unlikely(dev_gso_segment(skb, features)))
2129
goto out_kfree_skb;
2130
if (skb->next)
2131
goto gso;
2132
} else {
2133
if (skb_needs_linearize(skb, features) &&
2134
__skb_linearize(skb))
2135
goto out_kfree_skb;
2136
2137
/* If packet is not checksummed and device does not
2138
* support checksumming for this protocol, complete
2139
* checksumming here.
2140
*/
2141
if (skb->ip_summed == CHECKSUM_PARTIAL) {
2142
skb_set_transport_header(skb,
2143
skb_checksum_start_offset(skb));
2144
if (!(features & NETIF_F_ALL_CSUM) &&
2145
skb_checksum_help(skb))
2146
goto out_kfree_skb;
2147
}
2148
}
2149
2150
skb_len = skb->len;
2151
rc = ops->ndo_start_xmit(skb, dev);
2152
trace_net_dev_xmit(skb, rc, dev, skb_len);
2153
if (rc == NETDEV_TX_OK)
2154
txq_trans_update(txq);
2155
return rc;
2156
}
2157
2158
gso:
2159
do {
2160
struct sk_buff *nskb = skb->next;
2161
2162
skb->next = nskb->next;
2163
nskb->next = NULL;
2164
2165
		/*
		 * If the device doesn't need nskb->dst, release it right now
		 * while it's hot in this CPU's cache.
		 */
2169
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2170
skb_dst_drop(nskb);
2171
2172
skb_len = nskb->len;
2173
rc = ops->ndo_start_xmit(nskb, dev);
2174
trace_net_dev_xmit(nskb, rc, dev, skb_len);
2175
if (unlikely(rc != NETDEV_TX_OK)) {
2176
if (rc & ~NETDEV_TX_MASK)
2177
goto out_kfree_gso_skb;
2178
nskb->next = skb->next;
2179
skb->next = nskb;
2180
return rc;
2181
}
2182
txq_trans_update(txq);
2183
if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2184
return NETDEV_TX_BUSY;
2185
} while (skb->next);
2186
2187
out_kfree_gso_skb:
2188
if (likely(skb->next == NULL))
2189
skb->destructor = DEV_GSO_CB(skb)->destructor;
2190
out_kfree_skb:
2191
kfree_skb(skb);
2192
out:
2193
return rc;
2194
}
2195
2196
static u32 hashrnd __read_mostly;
2197
2198
/*
 * Returns a Tx hash based on the given packet descriptor and the number of
 * Tx queues to be used as a distribution range.
 */
2202
u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2203
unsigned int num_tx_queues)
2204
{
2205
u32 hash;
2206
u16 qoffset = 0;
2207
u16 qcount = num_tx_queues;
2208
2209
if (skb_rx_queue_recorded(skb)) {
2210
hash = skb_get_rx_queue(skb);
2211
while (unlikely(hash >= num_tx_queues))
2212
hash -= num_tx_queues;
2213
return hash;
2214
}
2215
2216
if (dev->num_tc) {
2217
u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2218
qoffset = dev->tc_to_txq[tc].offset;
2219
qcount = dev->tc_to_txq[tc].count;
2220
}
2221
2222
if (skb->sk && skb->sk->sk_hash)
2223
hash = skb->sk->sk_hash;
2224
else
2225
hash = (__force u16) skb->protocol ^ skb->rxhash;
2226
hash = jhash_1word(hash, hashrnd);
2227
2228
return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2229
}
2230
EXPORT_SYMBOL(__skb_tx_hash);
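/*
 * Illustrative sketch (not part of the original file): a driver that wants
 * hash-based queue selection in its ndo_select_queue() method can simply
 * delegate to skb_tx_hash(), the wrapper around __skb_tx_hash() that uses
 * dev->real_num_tx_queues as the distribution range.  The method name is
 * hypothetical.
 */
static inline u16 example_select_queue(struct net_device *dev,
				       struct sk_buff *skb)
{
	/* Special traffic could be steered to dedicated queues here;
	 * everything else falls back to the flow hash. */
	return skb_tx_hash(dev, skb);
}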
2231
2232
static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2233
{
2234
if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2235
if (net_ratelimit()) {
2236
pr_warning("%s selects TX queue %d, but "
2237
"real number of TX queues is %d\n",
2238
dev->name, queue_index, dev->real_num_tx_queues);
2239
}
2240
return 0;
2241
}
2242
return queue_index;
2243
}
2244
2245
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2246
{
2247
#ifdef CONFIG_XPS
2248
struct xps_dev_maps *dev_maps;
2249
struct xps_map *map;
2250
int queue_index = -1;
2251
2252
rcu_read_lock();
2253
dev_maps = rcu_dereference(dev->xps_maps);
2254
if (dev_maps) {
2255
map = rcu_dereference(
2256
dev_maps->cpu_map[raw_smp_processor_id()]);
2257
if (map) {
2258
if (map->len == 1)
2259
queue_index = map->queues[0];
2260
else {
2261
u32 hash;
2262
if (skb->sk && skb->sk->sk_hash)
2263
hash = skb->sk->sk_hash;
2264
else
2265
hash = (__force u16) skb->protocol ^
2266
skb->rxhash;
2267
hash = jhash_1word(hash, hashrnd);
2268
queue_index = map->queues[
2269
((u64)hash * map->len) >> 32];
2270
}
2271
if (unlikely(queue_index >= dev->real_num_tx_queues))
2272
queue_index = -1;
2273
}
2274
}
2275
rcu_read_unlock();
2276
2277
return queue_index;
2278
#else
2279
return -1;
2280
#endif
2281
}
2282
2283
static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2284
struct sk_buff *skb)
2285
{
2286
int queue_index;
2287
const struct net_device_ops *ops = dev->netdev_ops;
2288
2289
if (dev->real_num_tx_queues == 1)
2290
queue_index = 0;
2291
else if (ops->ndo_select_queue) {
2292
queue_index = ops->ndo_select_queue(dev, skb);
2293
queue_index = dev_cap_txqueue(dev, queue_index);
2294
} else {
2295
struct sock *sk = skb->sk;
2296
queue_index = sk_tx_queue_get(sk);
2297
2298
if (queue_index < 0 || skb->ooo_okay ||
2299
queue_index >= dev->real_num_tx_queues) {
2300
int old_index = queue_index;
2301
2302
queue_index = get_xps_queue(dev, skb);
2303
if (queue_index < 0)
2304
queue_index = skb_tx_hash(dev, skb);
2305
2306
if (queue_index != old_index && sk) {
2307
struct dst_entry *dst =
2308
rcu_dereference_check(sk->sk_dst_cache, 1);
2309
2310
if (dst && skb_dst(skb) == dst)
2311
sk_tx_queue_set(sk, queue_index);
2312
}
2313
}
2314
}
2315
2316
skb_set_queue_mapping(skb, queue_index);
2317
return netdev_get_tx_queue(dev, queue_index);
2318
}
2319
2320
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2321
struct net_device *dev,
2322
struct netdev_queue *txq)
2323
{
2324
spinlock_t *root_lock = qdisc_lock(q);
2325
bool contended;
2326
int rc;
2327
2328
qdisc_skb_cb(skb)->pkt_len = skb->len;
2329
qdisc_calculate_pkt_len(skb, q);
2330
/*
2331
* Heuristic to force contended enqueues to serialize on a
2332
* separate lock before trying to get qdisc main lock.
2333
* This permits __QDISC_STATE_RUNNING owner to get the lock more often
2334
* and dequeue packets faster.
2335
*/
2336
contended = qdisc_is_running(q);
2337
if (unlikely(contended))
2338
spin_lock(&q->busylock);
2339
2340
spin_lock(root_lock);
2341
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2342
kfree_skb(skb);
2343
rc = NET_XMIT_DROP;
2344
} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2345
qdisc_run_begin(q)) {
2346
/*
2347
* This is a work-conserving queue; there are no old skbs
2348
* waiting to be sent out; and the qdisc is not running -
2349
* xmit the skb directly.
2350
*/
2351
if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2352
skb_dst_force(skb);
2353
2354
qdisc_bstats_update(q, skb);
2355
2356
if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2357
if (unlikely(contended)) {
2358
spin_unlock(&q->busylock);
2359
contended = false;
2360
}
2361
__qdisc_run(q);
2362
} else
2363
qdisc_run_end(q);
2364
2365
rc = NET_XMIT_SUCCESS;
2366
} else {
2367
skb_dst_force(skb);
2368
rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2369
if (qdisc_run_begin(q)) {
2370
if (unlikely(contended)) {
2371
spin_unlock(&q->busylock);
2372
contended = false;
2373
}
2374
__qdisc_run(q);
2375
}
2376
}
2377
spin_unlock(root_lock);
2378
if (unlikely(contended))
2379
spin_unlock(&q->busylock);
2380
return rc;
2381
}
2382
2383
static DEFINE_PER_CPU(int, xmit_recursion);
2384
#define RECURSION_LIMIT 10
2385
2386
/**
2387
* dev_queue_xmit - transmit a buffer
2388
* @skb: buffer to transmit
2389
*
2390
* Queue a buffer for transmission to a network device. The caller must
2391
* have set the device and priority and built the buffer before calling
2392
* this function. The function can be called from an interrupt.
2393
*
2394
* A negative errno code is returned on a failure. A success does not
2395
* guarantee the frame will be transmitted as it may be dropped due
2396
* to congestion or traffic shaping.
2397
*
2398
* -----------------------------------------------------------------------------------
2399
* I notice this method can also return errors from the queue disciplines,
2400
* including NET_XMIT_DROP, which is a positive value. So, errors can also
2401
* be positive.
2402
*
2403
* Regardless of the return value, the skb is consumed, so it is currently
2404
* difficult to retry a send to this method. (You can bump the ref count
2405
* before sending to hold a reference for retry if you are careful.)
2406
*
2407
* When calling this method, interrupts MUST be enabled. This is because
2408
* the BH enable code must have IRQs enabled so that it will not deadlock.
2409
* --BLG
2410
*/
2411
int dev_queue_xmit(struct sk_buff *skb)
2412
{
2413
struct net_device *dev = skb->dev;
2414
struct netdev_queue *txq;
2415
struct Qdisc *q;
2416
int rc = -ENOMEM;
2417
2418
/* Disable soft irqs for various locks below. Also
2419
* stops preemption for RCU.
2420
*/
2421
rcu_read_lock_bh();
2422
2423
txq = dev_pick_tx(dev, skb);
2424
q = rcu_dereference_bh(txq->qdisc);
2425
2426
#ifdef CONFIG_NET_CLS_ACT
2427
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2428
#endif
2429
trace_net_dev_queue(skb);
2430
if (q->enqueue) {
2431
rc = __dev_xmit_skb(skb, q, dev, txq);
2432
goto out;
2433
}
2434
2435
	/* The device has no queue. Common case for software devices:
	   loopback, all sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here. (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible that they rely on the protection
	   made by us here.

	   Check this and take the lock anyway; it is not prone to deadlocks.
	   Or shortcut the noqueue qdisc entirely, which is even simpler 8)
	 */
2447
if (dev->flags & IFF_UP) {
2448
int cpu = smp_processor_id(); /* ok because BHs are off */
2449
2450
if (txq->xmit_lock_owner != cpu) {
2451
2452
if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2453
goto recursion_alert;
2454
2455
HARD_TX_LOCK(dev, txq, cpu);
2456
2457
if (!netif_tx_queue_stopped(txq)) {
2458
__this_cpu_inc(xmit_recursion);
2459
rc = dev_hard_start_xmit(skb, dev, txq);
2460
__this_cpu_dec(xmit_recursion);
2461
if (dev_xmit_complete(rc)) {
2462
HARD_TX_UNLOCK(dev, txq);
2463
goto out;
2464
}
2465
}
2466
HARD_TX_UNLOCK(dev, txq);
2467
if (net_ratelimit())
2468
printk(KERN_CRIT "Virtual device %s asks to "
2469
"queue packet!\n", dev->name);
2470
} else {
2471
/* Recursion is detected! It is possible,
2472
* unfortunately
2473
*/
2474
recursion_alert:
2475
if (net_ratelimit())
2476
printk(KERN_CRIT "Dead loop on virtual device "
2477
"%s, fix it urgently!\n", dev->name);
2478
}
2479
}
2480
2481
rc = -ENETDOWN;
2482
rcu_read_unlock_bh();
2483
2484
kfree_skb(skb);
2485
return rc;
2486
out:
2487
rcu_read_unlock_bh();
2488
return rc;
2489
}
2490
EXPORT_SYMBOL(dev_queue_xmit);
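/*
 * Illustrative sketch (not part of the original file): the minimal steps a
 * kernel sender takes before calling dev_queue_xmit().  The skb needs
 * skb->dev and skb->protocol set and a link-layer header built (here via
 * dev_hard_header() with a broadcast destination); dev_queue_xmit() always
 * consumes the skb, so the caller must not touch it afterwards.  The helper
 * name and the ETH_P_802_2 protocol value are placeholders for this sketch.
 */
static inline int example_send_frame(struct net_device *dev,
				     const void *payload, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), payload, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_2);

	if (dev_hard_header(skb, dev, ETH_P_802_2, dev->broadcast,
			    dev->dev_addr, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return dev_queue_xmit(skb);
}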
2491
2492
2493
/*=======================================================================
2494
Receiver routines
2495
=======================================================================*/
2496
2497
int netdev_max_backlog __read_mostly = 1000;
2498
int netdev_tstamp_prequeue __read_mostly = 1;
2499
int netdev_budget __read_mostly = 300;
2500
int weight_p __read_mostly = 64; /* old backlog weight */
2501
2502
/* Called with irq disabled */
2503
static inline void ____napi_schedule(struct softnet_data *sd,
2504
struct napi_struct *napi)
2505
{
2506
list_add_tail(&napi->poll_list, &sd->poll_list);
2507
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2508
}
2509
2510
/*
2511
* __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2512
* and src/dst port numbers. Returns a non-zero hash number on success
2513
* and 0 on failure.
2514
*/
2515
__u32 __skb_get_rxhash(struct sk_buff *skb)
2516
{
2517
int nhoff, hash = 0, poff;
2518
const struct ipv6hdr *ip6;
2519
const struct iphdr *ip;
2520
u8 ip_proto;
2521
u32 addr1, addr2, ihl;
2522
union {
2523
u32 v32;
2524
u16 v16[2];
2525
} ports;
2526
2527
nhoff = skb_network_offset(skb);
2528
2529
switch (skb->protocol) {
2530
case __constant_htons(ETH_P_IP):
2531
if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2532
goto done;
2533
2534
ip = (const struct iphdr *) (skb->data + nhoff);
2535
if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2536
ip_proto = 0;
2537
else
2538
ip_proto = ip->protocol;
2539
addr1 = (__force u32) ip->saddr;
2540
addr2 = (__force u32) ip->daddr;
2541
ihl = ip->ihl;
2542
break;
2543
case __constant_htons(ETH_P_IPV6):
2544
if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2545
goto done;
2546
2547
ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2548
ip_proto = ip6->nexthdr;
2549
addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2550
addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2551
ihl = (40 >> 2);
2552
break;
2553
default:
2554
goto done;
2555
}
2556
2557
ports.v32 = 0;
2558
poff = proto_ports_offset(ip_proto);
2559
if (poff >= 0) {
2560
nhoff += ihl * 4 + poff;
2561
if (pskb_may_pull(skb, nhoff + 4)) {
2562
ports.v32 = * (__force u32 *) (skb->data + nhoff);
2563
if (ports.v16[1] < ports.v16[0])
2564
swap(ports.v16[0], ports.v16[1]);
2565
}
2566
}
2567
2568
/* get a consistent hash (same value on both flow directions) */
2569
if (addr2 < addr1)
2570
swap(addr1, addr2);
2571
2572
hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2573
if (!hash)
2574
hash = 1;
2575
2576
done:
2577
return hash;
2578
}
2579
EXPORT_SYMBOL(__skb_get_rxhash);
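/*
 * Illustrative sketch (not part of the original file): callers normally go
 * through the skb_get_rxhash() wrapper, which returns the cached skb->rxhash
 * when a driver or the hardware already supplied one and only falls back to
 * the software computation in __skb_get_rxhash() above.  The bucket helper is
 * hypothetical.
 */
static inline u16 example_pick_bucket(struct sk_buff *skb, u16 nr_buckets)
{
	u32 hash = skb_get_rxhash(skb);

	/* Scale the 32-bit flow hash down to a bucket index. */
	return (u16)(((u64)hash * nr_buckets) >> 32);
}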
2580
2581
#ifdef CONFIG_RPS
2582
2583
/* One global table that all flow-based protocols share. */
2584
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2585
EXPORT_SYMBOL(rps_sock_flow_table);
2586
2587
static struct rps_dev_flow *
2588
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2589
struct rps_dev_flow *rflow, u16 next_cpu)
2590
{
2591
u16 tcpu;
2592
2593
tcpu = rflow->cpu = next_cpu;
2594
if (tcpu != RPS_NO_CPU) {
2595
#ifdef CONFIG_RFS_ACCEL
2596
struct netdev_rx_queue *rxqueue;
2597
struct rps_dev_flow_table *flow_table;
2598
struct rps_dev_flow *old_rflow;
2599
u32 flow_id;
2600
u16 rxq_index;
2601
int rc;
2602
2603
/* Should we steer this flow to a different hardware queue? */
2604
if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2605
!(dev->features & NETIF_F_NTUPLE))
2606
goto out;
2607
rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2608
if (rxq_index == skb_get_rx_queue(skb))
2609
goto out;
2610
2611
rxqueue = dev->_rx + rxq_index;
2612
flow_table = rcu_dereference(rxqueue->rps_flow_table);
2613
if (!flow_table)
2614
goto out;
2615
flow_id = skb->rxhash & flow_table->mask;
2616
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2617
rxq_index, flow_id);
2618
if (rc < 0)
2619
goto out;
2620
old_rflow = rflow;
2621
rflow = &flow_table->flows[flow_id];
2622
rflow->cpu = next_cpu;
2623
rflow->filter = rc;
2624
if (old_rflow->filter == rflow->filter)
2625
old_rflow->filter = RPS_NO_FILTER;
2626
out:
2627
#endif
2628
rflow->last_qtail =
2629
per_cpu(softnet_data, tcpu).input_queue_head;
2630
}
2631
2632
return rflow;
2633
}
2634
2635
/*
2636
* get_rps_cpu is called from netif_receive_skb and returns the target
2637
* CPU from the RPS map of the receiving queue for a given skb.
2638
* rcu_read_lock must be held on entry.
2639
*/
2640
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2641
struct rps_dev_flow **rflowp)
2642
{
2643
struct netdev_rx_queue *rxqueue;
2644
struct rps_map *map;
2645
struct rps_dev_flow_table *flow_table;
2646
struct rps_sock_flow_table *sock_flow_table;
2647
int cpu = -1;
2648
u16 tcpu;
2649
2650
if (skb_rx_queue_recorded(skb)) {
2651
u16 index = skb_get_rx_queue(skb);
2652
if (unlikely(index >= dev->real_num_rx_queues)) {
2653
WARN_ONCE(dev->real_num_rx_queues > 1,
2654
"%s received packet on queue %u, but number "
2655
"of RX queues is %u\n",
2656
dev->name, index, dev->real_num_rx_queues);
2657
goto done;
2658
}
2659
rxqueue = dev->_rx + index;
2660
} else
2661
rxqueue = dev->_rx;
2662
2663
map = rcu_dereference(rxqueue->rps_map);
2664
if (map) {
2665
if (map->len == 1 &&
2666
!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2667
tcpu = map->cpus[0];
2668
if (cpu_online(tcpu))
2669
cpu = tcpu;
2670
goto done;
2671
}
2672
} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2673
goto done;
2674
}
2675
2676
skb_reset_network_header(skb);
2677
if (!skb_get_rxhash(skb))
2678
goto done;
2679
2680
flow_table = rcu_dereference(rxqueue->rps_flow_table);
2681
sock_flow_table = rcu_dereference(rps_sock_flow_table);
2682
if (flow_table && sock_flow_table) {
2683
u16 next_cpu;
2684
struct rps_dev_flow *rflow;
2685
2686
rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2687
tcpu = rflow->cpu;
2688
2689
next_cpu = sock_flow_table->ents[skb->rxhash &
2690
sock_flow_table->mask];
2691
2692
/*
2693
* If the desired CPU (where last recvmsg was done) is
2694
* different from current CPU (one in the rx-queue flow
2695
* table entry), switch if one of the following holds:
2696
* - Current CPU is unset (equal to RPS_NO_CPU).
2697
* - Current CPU is offline.
2698
* - The current CPU's queue tail has advanced beyond the
2699
* last packet that was enqueued using this table entry.
2700
* This guarantees that all previous packets for the flow
2701
* have been dequeued, thus preserving in order delivery.
2702
*/
2703
if (unlikely(tcpu != next_cpu) &&
2704
(tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2705
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2706
rflow->last_qtail)) >= 0))
2707
rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2708
2709
if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2710
*rflowp = rflow;
2711
cpu = tcpu;
2712
goto done;
2713
}
2714
}
2715
2716
if (map) {
2717
tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2718
2719
if (cpu_online(tcpu)) {
2720
cpu = tcpu;
2721
goto done;
2722
}
2723
}
2724
2725
done:
2726
return cpu;
2727
}
2728
2729
#ifdef CONFIG_RFS_ACCEL
2730
2731
/**
2732
* rps_may_expire_flow - check whether an RFS hardware filter may be removed
2733
* @dev: Device on which the filter was set
2734
* @rxq_index: RX queue index
2735
* @flow_id: Flow ID passed to ndo_rx_flow_steer()
2736
* @filter_id: Filter ID returned by ndo_rx_flow_steer()
2737
*
2738
* Drivers that implement ndo_rx_flow_steer() should periodically call
2739
* this function for each installed filter and remove the filters for
2740
* which it returns %true.
2741
*/
2742
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2743
u32 flow_id, u16 filter_id)
2744
{
2745
struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2746
struct rps_dev_flow_table *flow_table;
2747
struct rps_dev_flow *rflow;
2748
bool expire = true;
2749
int cpu;
2750
2751
rcu_read_lock();
2752
flow_table = rcu_dereference(rxqueue->rps_flow_table);
2753
if (flow_table && flow_id <= flow_table->mask) {
2754
rflow = &flow_table->flows[flow_id];
2755
cpu = ACCESS_ONCE(rflow->cpu);
2756
if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2757
((int)(per_cpu(softnet_data, cpu).input_queue_head -
2758
rflow->last_qtail) <
2759
(int)(10 * flow_table->mask)))
2760
expire = false;
2761
}
2762
rcu_read_unlock();
2763
return expire;
2764
}
2765
EXPORT_SYMBOL(rps_may_expire_flow);
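/*
 * Illustrative sketch (not part of the original file): the periodic clean-up
 * an RFS-accelerated driver is expected to run.  For every filter it
 * programmed from ndo_rx_flow_steer() it asks rps_may_expire_flow() whether
 * the flow is still of interest and, if not, removes the hardware filter
 * again.  The filter table and the remove callback are hypothetical driver
 * state, not kernel API.
 */
struct example_hw_filter {
	u16 rxq_index;
	u32 flow_id;
	u16 filter_id;
	bool in_use;
};

static inline void example_expire_filters(struct net_device *dev,
					  struct example_hw_filter *tbl,
					  unsigned int n,
					  void (*remove)(struct net_device *,
							 u16 filter_id))
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			remove(dev, tbl[i].filter_id);
			tbl[i].in_use = false;
		}
	}
}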
2766
2767
#endif /* CONFIG_RFS_ACCEL */
2768
2769
/* Called from hardirq (IPI) context */
2770
static void rps_trigger_softirq(void *data)
2771
{
2772
struct softnet_data *sd = data;
2773
2774
____napi_schedule(sd, &sd->backlog);
2775
sd->received_rps++;
2776
}
2777
2778
#endif /* CONFIG_RPS */
2779
2780
/*
 * Check if this softnet_data structure belongs to another CPU.
 * If so, queue it on our IPI list and return 1;
 * otherwise return 0.
 */
2785
static int rps_ipi_queued(struct softnet_data *sd)
2786
{
2787
#ifdef CONFIG_RPS
2788
struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2789
2790
if (sd != mysd) {
2791
sd->rps_ipi_next = mysd->rps_ipi_list;
2792
mysd->rps_ipi_list = sd;
2793
2794
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2795
return 1;
2796
}
2797
#endif /* CONFIG_RPS */
2798
return 0;
2799
}
2800
2801
/*
2802
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
2803
* queue (may be a remote CPU queue).
2804
*/
2805
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2806
unsigned int *qtail)
2807
{
2808
struct softnet_data *sd;
2809
unsigned long flags;
2810
2811
sd = &per_cpu(softnet_data, cpu);
2812
2813
local_irq_save(flags);
2814
2815
rps_lock(sd);
2816
if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2817
if (skb_queue_len(&sd->input_pkt_queue)) {
2818
enqueue:
2819
__skb_queue_tail(&sd->input_pkt_queue, skb);
2820
input_queue_tail_incr_save(sd, qtail);
2821
rps_unlock(sd);
2822
local_irq_restore(flags);
2823
return NET_RX_SUCCESS;
2824
}
2825
2826
		/* Schedule NAPI for the backlog device.
		 * We can use a non-atomic operation since we own the queue lock.
		 */
2829
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2830
if (!rps_ipi_queued(sd))
2831
____napi_schedule(sd, &sd->backlog);
2832
}
2833
goto enqueue;
2834
}
2835
2836
sd->dropped++;
2837
rps_unlock(sd);
2838
2839
local_irq_restore(flags);
2840
2841
atomic_long_inc(&skb->dev->rx_dropped);
2842
kfree_skb(skb);
2843
return NET_RX_DROP;
2844
}
2845
2846
/**
 * netif_rx - post buffer to the network code
 * @skb: buffer to post
 *
 * This function receives a packet from a device driver and queues it for
 * the upper (protocol) levels to process. It always succeeds. The buffer
 * may be dropped during processing for congestion control or by the
 * protocol layers.
 *
 * return values:
 * NET_RX_SUCCESS (no congestion)
 * NET_RX_DROP (packet was dropped)
 *
 */
2860
2861
int netif_rx(struct sk_buff *skb)
2862
{
2863
int ret;
2864
2865
/* if netpoll wants it, pretend we never saw it */
2866
if (netpoll_rx(skb))
2867
return NET_RX_DROP;
2868
2869
if (netdev_tstamp_prequeue)
2870
net_timestamp_check(skb);
2871
2872
trace_netif_rx(skb);
2873
#ifdef CONFIG_RPS
2874
{
2875
struct rps_dev_flow voidflow, *rflow = &voidflow;
2876
int cpu;
2877
2878
preempt_disable();
2879
rcu_read_lock();
2880
2881
cpu = get_rps_cpu(skb->dev, skb, &rflow);
2882
if (cpu < 0)
2883
cpu = smp_processor_id();
2884
2885
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2886
2887
rcu_read_unlock();
2888
preempt_enable();
2889
}
2890
#else
2891
{
2892
unsigned int qtail;
2893
ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2894
put_cpu();
2895
}
2896
#endif
2897
return ret;
2898
}
2899
EXPORT_SYMBOL(netif_rx);
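/*
 * Illustrative sketch (not part of the original file): the classic non-NAPI
 * receive path that feeds netif_rx().  A driver's interrupt handler copies
 * the frame out of the hardware, lets eth_type_trans() set skb->protocol and
 * skb->dev, and queues the buffer on the per-CPU backlog via netif_rx()
 * (or netif_rx_ni() from process context).  The buffer/length pair stands in
 * for the real hardware access and the helper name is hypothetical.
 */
static inline void example_legacy_rx(struct net_device *dev,
				     const void *frame, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), frame, len);
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
}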
2900
2901
int netif_rx_ni(struct sk_buff *skb)
2902
{
2903
int err;
2904
2905
preempt_disable();
2906
err = netif_rx(skb);
2907
if (local_softirq_pending())
2908
do_softirq();
2909
preempt_enable();
2910
2911
return err;
2912
}
2913
EXPORT_SYMBOL(netif_rx_ni);
2914
2915
static void net_tx_action(struct softirq_action *h)
2916
{
2917
struct softnet_data *sd = &__get_cpu_var(softnet_data);
2918
2919
if (sd->completion_queue) {
2920
struct sk_buff *clist;
2921
2922
local_irq_disable();
2923
clist = sd->completion_queue;
2924
sd->completion_queue = NULL;
2925
local_irq_enable();
2926
2927
while (clist) {
2928
struct sk_buff *skb = clist;
2929
clist = clist->next;
2930
2931
WARN_ON(atomic_read(&skb->users));
2932
trace_kfree_skb(skb, net_tx_action);
2933
__kfree_skb(skb);
2934
}
2935
}
2936
2937
if (sd->output_queue) {
2938
struct Qdisc *head;
2939
2940
local_irq_disable();
2941
head = sd->output_queue;
2942
sd->output_queue = NULL;
2943
sd->output_queue_tailp = &sd->output_queue;
2944
local_irq_enable();
2945
2946
while (head) {
2947
struct Qdisc *q = head;
2948
spinlock_t *root_lock;
2949
2950
head = head->next_sched;
2951
2952
root_lock = qdisc_lock(q);
2953
if (spin_trylock(root_lock)) {
2954
smp_mb__before_clear_bit();
2955
clear_bit(__QDISC_STATE_SCHED,
2956
&q->state);
2957
qdisc_run(q);
2958
spin_unlock(root_lock);
2959
} else {
2960
if (!test_bit(__QDISC_STATE_DEACTIVATED,
2961
&q->state)) {
2962
__netif_reschedule(q);
2963
} else {
2964
smp_mb__before_clear_bit();
2965
clear_bit(__QDISC_STATE_SCHED,
2966
&q->state);
2967
}
2968
}
2969
}
2970
}
2971
}
2972
2973
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2974
(defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2975
/* This hook is defined here for ATM LANE */
2976
int (*br_fdb_test_addr_hook)(struct net_device *dev,
2977
unsigned char *addr) __read_mostly;
2978
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2979
#endif
2980
2981
#ifdef CONFIG_NET_CLS_ACT
2982
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for some useless
 * instructions (a compare and two extra stores) whenever it is not
 * enabled but CONFIG_NET_CLS_ACT is.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
2990
static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2991
{
2992
struct net_device *dev = skb->dev;
2993
u32 ttl = G_TC_RTTL(skb->tc_verd);
2994
int result = TC_ACT_OK;
2995
struct Qdisc *q;
2996
2997
if (unlikely(MAX_RED_LOOP < ttl++)) {
2998
if (net_ratelimit())
2999
pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3000
skb->skb_iif, dev->ifindex);
3001
return TC_ACT_SHOT;
3002
}
3003
3004
skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3005
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3006
3007
q = rxq->qdisc;
3008
if (q != &noop_qdisc) {
3009
spin_lock(qdisc_lock(q));
3010
if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3011
result = qdisc_enqueue_root(skb, q);
3012
spin_unlock(qdisc_lock(q));
3013
}
3014
3015
return result;
3016
}
3017
3018
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3019
struct packet_type **pt_prev,
3020
int *ret, struct net_device *orig_dev)
3021
{
3022
struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3023
3024
if (!rxq || rxq->qdisc == &noop_qdisc)
3025
goto out;
3026
3027
if (*pt_prev) {
3028
*ret = deliver_skb(skb, *pt_prev, orig_dev);
3029
*pt_prev = NULL;
3030
}
3031
3032
switch (ing_filter(skb, rxq)) {
3033
case TC_ACT_SHOT:
3034
case TC_ACT_STOLEN:
3035
kfree_skb(skb);
3036
return NULL;
3037
}
3038
3039
out:
3040
skb->tc_verd = 0;
3041
return skb;
3042
}
3043
#endif
3044
3045
/**
 * netdev_rx_handler_register - register receive handler
 * @dev: device to register a handler for
 * @rx_handler: receive handler to register
 * @rx_handler_data: data pointer that is used by rx handler
 *
 * Register a receive handler for a device. This handler will then be
 * called from __netif_receive_skb. A negative errno code is returned
 * on a failure.
 *
 * The caller must hold the rtnl_mutex.
 *
 * For a general description of rx_handler, see enum rx_handler_result.
 */
3059
int netdev_rx_handler_register(struct net_device *dev,
3060
rx_handler_func_t *rx_handler,
3061
void *rx_handler_data)
3062
{
3063
ASSERT_RTNL();
3064
3065
if (dev->rx_handler)
3066
return -EBUSY;
3067
3068
rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3069
rcu_assign_pointer(dev->rx_handler, rx_handler);
3070
3071
return 0;
3072
}
3073
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
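/*
 * Illustrative sketch (not part of the original file): the shape of an
 * rx_handler as invoked from __netif_receive_skb() below.  A handler such as
 * the bridge or macvlan hook receives the skb by reference under
 * rcu_read_lock(), may consume or redirect it, and reports what the caller
 * should do next through the rx_handler_result codes.  The "upper device"
 * stored in rx_handler_data is a hypothetical example of handler state.
 */
static inline rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct net_device *upper;

	/* Per-port state passed to netdev_rx_handler_register(). */
	upper = rcu_dereference(skb->dev->rx_handler_data);
	if (!upper)
		return RX_HANDLER_PASS;

	skb->dev = upper;
	*pskb = skb;
	return RX_HANDLER_ANOTHER;	/* re-run the receive loop on upper */
}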
3074
3075
/**
 * netdev_rx_handler_unregister - unregister receive handler
 * @dev: device to unregister a handler from
 *
 * Unregister a receive handler from a device.
 *
 * The caller must hold the rtnl_mutex.
 */
3083
void netdev_rx_handler_unregister(struct net_device *dev)
3084
{
3085
3086
ASSERT_RTNL();
3087
rcu_assign_pointer(dev->rx_handler, NULL);
3088
rcu_assign_pointer(dev->rx_handler_data, NULL);
3089
}
3090
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3091
3092
static int __netif_receive_skb(struct sk_buff *skb)
3093
{
3094
struct packet_type *ptype, *pt_prev;
3095
rx_handler_func_t *rx_handler;
3096
struct net_device *orig_dev;
3097
struct net_device *null_or_dev;
3098
bool deliver_exact = false;
3099
int ret = NET_RX_DROP;
3100
__be16 type;
3101
3102
if (!netdev_tstamp_prequeue)
3103
net_timestamp_check(skb);
3104
3105
trace_netif_receive_skb(skb);
3106
3107
/* if we've gotten here through NAPI, check netpoll */
3108
if (netpoll_receive_skb(skb))
3109
return NET_RX_DROP;
3110
3111
if (!skb->skb_iif)
3112
skb->skb_iif = skb->dev->ifindex;
3113
orig_dev = skb->dev;
3114
3115
skb_reset_network_header(skb);
3116
skb_reset_transport_header(skb);
3117
skb_reset_mac_len(skb);
3118
3119
pt_prev = NULL;
3120
3121
rcu_read_lock();
3122
3123
another_round:
3124
3125
__this_cpu_inc(softnet_data.processed);
3126
3127
if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3128
skb = vlan_untag(skb);
3129
if (unlikely(!skb))
3130
goto out;
3131
}
3132
3133
#ifdef CONFIG_NET_CLS_ACT
3134
if (skb->tc_verd & TC_NCLS) {
3135
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3136
goto ncls;
3137
}
3138
#endif
3139
3140
list_for_each_entry_rcu(ptype, &ptype_all, list) {
3141
if (!ptype->dev || ptype->dev == skb->dev) {
3142
if (pt_prev)
3143
ret = deliver_skb(skb, pt_prev, orig_dev);
3144
pt_prev = ptype;
3145
}
3146
}
3147
3148
#ifdef CONFIG_NET_CLS_ACT
3149
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3150
if (!skb)
3151
goto out;
3152
ncls:
3153
#endif
3154
3155
rx_handler = rcu_dereference(skb->dev->rx_handler);
3156
if (rx_handler) {
3157
if (pt_prev) {
3158
ret = deliver_skb(skb, pt_prev, orig_dev);
3159
pt_prev = NULL;
3160
}
3161
switch (rx_handler(&skb)) {
3162
case RX_HANDLER_CONSUMED:
3163
goto out;
3164
case RX_HANDLER_ANOTHER:
3165
goto another_round;
3166
case RX_HANDLER_EXACT:
3167
deliver_exact = true;
3168
case RX_HANDLER_PASS:
3169
break;
3170
default:
3171
BUG();
3172
}
3173
}
3174
3175
if (vlan_tx_tag_present(skb)) {
3176
if (pt_prev) {
3177
ret = deliver_skb(skb, pt_prev, orig_dev);
3178
pt_prev = NULL;
3179
}
3180
if (vlan_do_receive(&skb)) {
3181
ret = __netif_receive_skb(skb);
3182
goto out;
3183
} else if (unlikely(!skb))
3184
goto out;
3185
}
3186
3187
/* deliver only exact match when indicated */
3188
null_or_dev = deliver_exact ? skb->dev : NULL;
3189
3190
type = skb->protocol;
3191
list_for_each_entry_rcu(ptype,
3192
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3193
if (ptype->type == type &&
3194
(ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3195
ptype->dev == orig_dev)) {
3196
if (pt_prev)
3197
ret = deliver_skb(skb, pt_prev, orig_dev);
3198
pt_prev = ptype;
3199
}
3200
}
3201
3202
if (pt_prev) {
3203
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3204
} else {
3205
atomic_long_inc(&skb->dev->rx_dropped);
3206
kfree_skb(skb);
3207
		/* Jamal, now you will not be able to escape explaining
		 * to me how you were going to use this. :-)
		 */
3210
ret = NET_RX_DROP;
3211
}
3212
3213
out:
3214
rcu_read_unlock();
3215
return ret;
3216
}
3217
3218
/**
3219
* netif_receive_skb - process receive buffer from network
3220
* @skb: buffer to process
3221
*
3222
* netif_receive_skb() is the main receive data processing function.
3223
* It always succeeds. The buffer may be dropped during processing
3224
* for congestion control or by the protocol layers.
3225
*
3226
* This function may only be called from softirq context and interrupts
3227
* should be enabled.
3228
*
3229
* Return values (usually ignored):
3230
* NET_RX_SUCCESS: no congestion
3231
* NET_RX_DROP: packet was dropped
3232
*/
3233
int netif_receive_skb(struct sk_buff *skb)
3234
{
3235
if (netdev_tstamp_prequeue)
3236
net_timestamp_check(skb);
3237
3238
if (skb_defer_rx_timestamp(skb))
3239
return NET_RX_SUCCESS;
3240
3241
#ifdef CONFIG_RPS
3242
{
3243
struct rps_dev_flow voidflow, *rflow = &voidflow;
3244
int cpu, ret;
3245
3246
rcu_read_lock();
3247
3248
cpu = get_rps_cpu(skb->dev, skb, &rflow);
3249
3250
if (cpu >= 0) {
3251
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3252
rcu_read_unlock();
3253
} else {
3254
rcu_read_unlock();
3255
ret = __netif_receive_skb(skb);
3256
}
3257
3258
return ret;
3259
}
3260
#else
3261
return __netif_receive_skb(skb);
3262
#endif
3263
}
3264
EXPORT_SYMBOL(netif_receive_skb);
3265
3266
/* Network device is going away, flush any packets still pending
3267
* Called with irqs disabled.
3268
*/
3269
static void flush_backlog(void *arg)
3270
{
3271
struct net_device *dev = arg;
3272
struct softnet_data *sd = &__get_cpu_var(softnet_data);
3273
struct sk_buff *skb, *tmp;
3274
3275
rps_lock(sd);
3276
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3277
if (skb->dev == dev) {
3278
__skb_unlink(skb, &sd->input_pkt_queue);
3279
kfree_skb(skb);
3280
input_queue_head_incr(sd);
3281
}
3282
}
3283
rps_unlock(sd);
3284
3285
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3286
if (skb->dev == dev) {
3287
__skb_unlink(skb, &sd->process_queue);
3288
kfree_skb(skb);
3289
input_queue_head_incr(sd);
3290
}
3291
}
3292
}
3293
3294
static int napi_gro_complete(struct sk_buff *skb)
3295
{
3296
struct packet_type *ptype;
3297
__be16 type = skb->protocol;
3298
struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3299
int err = -ENOENT;
3300
3301
if (NAPI_GRO_CB(skb)->count == 1) {
3302
skb_shinfo(skb)->gso_size = 0;
3303
goto out;
3304
}
3305
3306
rcu_read_lock();
3307
list_for_each_entry_rcu(ptype, head, list) {
3308
if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3309
continue;
3310
3311
err = ptype->gro_complete(skb);
3312
break;
3313
}
3314
rcu_read_unlock();
3315
3316
if (err) {
3317
WARN_ON(&ptype->list == head);
3318
kfree_skb(skb);
3319
return NET_RX_SUCCESS;
3320
}
3321
3322
out:
3323
return netif_receive_skb(skb);
3324
}
3325
3326
inline void napi_gro_flush(struct napi_struct *napi)
3327
{
3328
struct sk_buff *skb, *next;
3329
3330
for (skb = napi->gro_list; skb; skb = next) {
3331
next = skb->next;
3332
skb->next = NULL;
3333
napi_gro_complete(skb);
3334
}
3335
3336
napi->gro_count = 0;
3337
napi->gro_list = NULL;
3338
}
3339
EXPORT_SYMBOL(napi_gro_flush);
3340
3341
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3342
{
3343
struct sk_buff **pp = NULL;
3344
struct packet_type *ptype;
3345
__be16 type = skb->protocol;
3346
struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3347
int same_flow;
3348
int mac_len;
3349
enum gro_result ret;
3350
3351
if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3352
goto normal;
3353
3354
if (skb_is_gso(skb) || skb_has_frag_list(skb))
3355
goto normal;
3356
3357
rcu_read_lock();
3358
list_for_each_entry_rcu(ptype, head, list) {
3359
if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3360
continue;
3361
3362
skb_set_network_header(skb, skb_gro_offset(skb));
3363
mac_len = skb->network_header - skb->mac_header;
3364
skb->mac_len = mac_len;
3365
NAPI_GRO_CB(skb)->same_flow = 0;
3366
NAPI_GRO_CB(skb)->flush = 0;
3367
NAPI_GRO_CB(skb)->free = 0;
3368
3369
pp = ptype->gro_receive(&napi->gro_list, skb);
3370
break;
3371
}
3372
rcu_read_unlock();
3373
3374
if (&ptype->list == head)
3375
goto normal;
3376
3377
same_flow = NAPI_GRO_CB(skb)->same_flow;
3378
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3379
3380
if (pp) {
3381
struct sk_buff *nskb = *pp;
3382
3383
*pp = nskb->next;
3384
nskb->next = NULL;
3385
napi_gro_complete(nskb);
3386
napi->gro_count--;
3387
}
3388
3389
if (same_flow)
3390
goto ok;
3391
3392
if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3393
goto normal;
3394
3395
napi->gro_count++;
3396
NAPI_GRO_CB(skb)->count = 1;
3397
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3398
skb->next = napi->gro_list;
3399
napi->gro_list = skb;
3400
ret = GRO_HELD;
3401
3402
pull:
3403
if (skb_headlen(skb) < skb_gro_offset(skb)) {
3404
int grow = skb_gro_offset(skb) - skb_headlen(skb);
3405
3406
BUG_ON(skb->end - skb->tail < grow);
3407
3408
memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3409
3410
skb->tail += grow;
3411
skb->data_len -= grow;
3412
3413
skb_shinfo(skb)->frags[0].page_offset += grow;
3414
skb_shinfo(skb)->frags[0].size -= grow;
3415
3416
if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3417
put_page(skb_shinfo(skb)->frags[0].page);
3418
memmove(skb_shinfo(skb)->frags,
3419
skb_shinfo(skb)->frags + 1,
3420
--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3421
}
3422
}
3423
3424
ok:
3425
return ret;
3426
3427
normal:
3428
ret = GRO_NORMAL;
3429
goto pull;
3430
}
3431
EXPORT_SYMBOL(dev_gro_receive);
3432
3433
static inline gro_result_t
3434
__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3435
{
3436
struct sk_buff *p;
3437
3438
for (p = napi->gro_list; p; p = p->next) {
3439
unsigned long diffs;
3440
3441
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3442
diffs |= p->vlan_tci ^ skb->vlan_tci;
3443
diffs |= compare_ether_header(skb_mac_header(p),
3444
skb_gro_mac_header(skb));
3445
NAPI_GRO_CB(p)->same_flow = !diffs;
3446
NAPI_GRO_CB(p)->flush = 0;
3447
}
3448
3449
return dev_gro_receive(napi, skb);
3450
}
3451
3452
gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3453
{
3454
switch (ret) {
3455
case GRO_NORMAL:
3456
if (netif_receive_skb(skb))
3457
ret = GRO_DROP;
3458
break;
3459
3460
case GRO_DROP:
3461
case GRO_MERGED_FREE:
3462
kfree_skb(skb);
3463
break;
3464
3465
case GRO_HELD:
3466
case GRO_MERGED:
3467
break;
3468
}
3469
3470
return ret;
3471
}
3472
EXPORT_SYMBOL(napi_skb_finish);
3473
3474
void skb_gro_reset_offset(struct sk_buff *skb)
3475
{
3476
NAPI_GRO_CB(skb)->data_offset = 0;
3477
NAPI_GRO_CB(skb)->frag0 = NULL;
3478
NAPI_GRO_CB(skb)->frag0_len = 0;
3479
3480
if (skb->mac_header == skb->tail &&
3481
!PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3482
NAPI_GRO_CB(skb)->frag0 =
3483
page_address(skb_shinfo(skb)->frags[0].page) +
3484
skb_shinfo(skb)->frags[0].page_offset;
3485
NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3486
}
3487
}
3488
EXPORT_SYMBOL(skb_gro_reset_offset);
3489
3490
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3491
{
3492
skb_gro_reset_offset(skb);
3493
3494
return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3495
}
3496
EXPORT_SYMBOL(napi_gro_receive);
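/*
 * Illustrative sketch (not part of the original file): where
 * napi_gro_receive() sits in a driver.  Inside its NAPI poll routine the
 * driver builds an skb for each completed RX descriptor, lets
 * eth_type_trans() fill in skb->protocol, and hands the skb to
 * napi_gro_receive() instead of netif_receive_skb() so that consecutive
 * packets of the same flow can be merged.  Descriptor handling is omitted
 * and the helper name is hypothetical.
 */
static inline void example_deliver_one(struct napi_struct *napi,
				       struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);
}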
3497
3498
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3499
{
3500
__skb_pull(skb, skb_headlen(skb));
3501
skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3502
skb->vlan_tci = 0;
3503
skb->dev = napi->dev;
3504
skb->skb_iif = 0;
3505
3506
napi->skb = skb;
3507
}
3508
3509
struct sk_buff *napi_get_frags(struct napi_struct *napi)
3510
{
3511
struct sk_buff *skb = napi->skb;
3512
3513
if (!skb) {
3514
skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3515
if (skb)
3516
napi->skb = skb;
3517
}
3518
return skb;
3519
}
3520
EXPORT_SYMBOL(napi_get_frags);
3521
3522
gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3523
gro_result_t ret)
3524
{
3525
switch (ret) {
3526
case GRO_NORMAL:
3527
case GRO_HELD:
3528
skb->protocol = eth_type_trans(skb, skb->dev);
3529
3530
if (ret == GRO_HELD)
3531
skb_gro_pull(skb, -ETH_HLEN);
3532
else if (netif_receive_skb(skb))
3533
ret = GRO_DROP;
3534
break;
3535
3536
case GRO_DROP:
3537
case GRO_MERGED_FREE:
3538
napi_reuse_skb(napi, skb);
3539
break;
3540
3541
case GRO_MERGED:
3542
break;
3543
}
3544
3545
return ret;
3546
}
3547
EXPORT_SYMBOL(napi_frags_finish);
3548
3549
struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3550
{
3551
struct sk_buff *skb = napi->skb;
3552
struct ethhdr *eth;
3553
unsigned int hlen;
3554
unsigned int off;
3555
3556
napi->skb = NULL;
3557
3558
skb_reset_mac_header(skb);
3559
skb_gro_reset_offset(skb);
3560
3561
off = skb_gro_offset(skb);
3562
hlen = off + sizeof(*eth);
3563
eth = skb_gro_header_fast(skb, off);
3564
if (skb_gro_header_hard(skb, hlen)) {
3565
eth = skb_gro_header_slow(skb, hlen, off);
3566
if (unlikely(!eth)) {
3567
napi_reuse_skb(napi, skb);
3568
skb = NULL;
3569
goto out;
3570
}
3571
}
3572
3573
skb_gro_pull(skb, sizeof(*eth));
3574
3575
/*
3576
* This works because the only protocols we care about don't require
3577
* special handling. We'll fix it up properly at the end.
3578
*/
3579
skb->protocol = eth->h_proto;
3580
3581
out:
3582
return skb;
3583
}
3584
EXPORT_SYMBOL(napi_frags_skb);
3585
3586
gro_result_t napi_gro_frags(struct napi_struct *napi)
3587
{
3588
struct sk_buff *skb = napi_frags_skb(napi);
3589
3590
if (!skb)
3591
return GRO_DROP;
3592
3593
return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3594
}
3595
EXPORT_SYMBOL(napi_gro_frags);
3596
3597
/*
3598
* net_rps_action sends any pending IPI's for rps.
3599
* Note: called with local irq disabled, but exits with local irq enabled.
3600
*/
3601
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3602
{
3603
#ifdef CONFIG_RPS
3604
struct softnet_data *remsd = sd->rps_ipi_list;
3605
3606
if (remsd) {
3607
sd->rps_ipi_list = NULL;
3608
3609
local_irq_enable();
3610
3611
/* Send pending IPI's to kick RPS processing on remote cpus. */
3612
while (remsd) {
3613
struct softnet_data *next = remsd->rps_ipi_next;
3614
3615
if (cpu_online(remsd->cpu))
3616
__smp_call_function_single(remsd->cpu,
3617
&remsd->csd, 0);
3618
remsd = next;
3619
}
3620
} else
3621
#endif
3622
local_irq_enable();
3623
}
3624
3625
static int process_backlog(struct napi_struct *napi, int quota)
3626
{
3627
int work = 0;
3628
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3629
3630
#ifdef CONFIG_RPS
3631
	/* Check if we have pending IPIs; it's better to send them now
	 * rather than waiting for net_rx_action() to end.
	 */
3634
if (sd->rps_ipi_list) {
3635
local_irq_disable();
3636
net_rps_action_and_irq_enable(sd);
3637
}
3638
#endif
3639
napi->weight = weight_p;
3640
local_irq_disable();
3641
while (work < quota) {
3642
struct sk_buff *skb;
3643
unsigned int qlen;
3644
3645
while ((skb = __skb_dequeue(&sd->process_queue))) {
3646
local_irq_enable();
3647
__netif_receive_skb(skb);
3648
local_irq_disable();
3649
input_queue_head_incr(sd);
3650
if (++work >= quota) {
3651
local_irq_enable();
3652
return work;
3653
}
3654
}
3655
3656
rps_lock(sd);
3657
qlen = skb_queue_len(&sd->input_pkt_queue);
3658
if (qlen)
3659
skb_queue_splice_tail_init(&sd->input_pkt_queue,
3660
&sd->process_queue);
3661
3662
if (qlen < quota - work) {
3663
			/*
			 * Inline a custom version of __napi_complete().
			 * Only the current CPU owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on the backlog, so we can use a plain write instead
			 * of clear_bit() and we don't need an smp_mb() memory
			 * barrier.
			 */
3670
list_del(&napi->poll_list);
3671
napi->state = 0;
3672
3673
quota = work + qlen;
3674
}
3675
rps_unlock(sd);
3676
}
3677
local_irq_enable();
3678
3679
return work;
3680
}
3681
3682
/**
3683
* __napi_schedule - schedule for receive
3684
* @n: entry to schedule
3685
*
3686
* The entry's receive function will be scheduled to run
3687
*/
3688
void __napi_schedule(struct napi_struct *n)
3689
{
3690
unsigned long flags;
3691
3692
local_irq_save(flags);
3693
____napi_schedule(&__get_cpu_var(softnet_data), n);
3694
local_irq_restore(flags);
3695
}
3696
EXPORT_SYMBOL(__napi_schedule);
3697
3698
void __napi_complete(struct napi_struct *n)
3699
{
3700
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3701
BUG_ON(n->gro_list);
3702
3703
list_del(&n->poll_list);
3704
smp_mb__before_clear_bit();
3705
clear_bit(NAPI_STATE_SCHED, &n->state);
3706
}
3707
EXPORT_SYMBOL(__napi_complete);
3708
3709
void napi_complete(struct napi_struct *n)
3710
{
3711
unsigned long flags;
3712
3713
	/*
	 * Don't let napi dequeue from the CPU poll list
	 * just in case it's running on a different CPU.
	 */
3717
if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3718
return;
3719
3720
napi_gro_flush(n);
3721
local_irq_save(flags);
3722
__napi_complete(n);
3723
local_irq_restore(flags);
3724
}
3725
EXPORT_SYMBOL(napi_complete);
3726
3727
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3728
int (*poll)(struct napi_struct *, int), int weight)
3729
{
3730
INIT_LIST_HEAD(&napi->poll_list);
3731
napi->gro_count = 0;
3732
napi->gro_list = NULL;
3733
napi->skb = NULL;
3734
napi->poll = poll;
3735
napi->weight = weight;
3736
list_add(&napi->dev_list, &dev->napi_list);
3737
napi->dev = dev;
3738
#ifdef CONFIG_NETPOLL
3739
spin_lock_init(&napi->poll_lock);
3740
napi->poll_owner = -1;
3741
#endif
3742
set_bit(NAPI_STATE_SCHED, &napi->state);
3743
}
3744
EXPORT_SYMBOL(netif_napi_add);
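/*
 * Illustrative sketch (not part of the original file): the registration and
 * poll pattern netif_napi_add() expects from drivers.  The poll callback may
 * process at most "budget" packets, and only when it did less work than
 * allowed may it call napi_complete() and re-enable its device's RX
 * interrupts.  example_priv, the commented-out RX-ring cleanup and the
 * interrupt toggling are hypothetical driver pieces, not kernel API.
 */
struct example_priv {
	struct napi_struct napi;
	struct net_device *dev;
};

static inline int example_poll(struct napi_struct *napi, int budget)
{
	int work_done = 0;

	/* work_done = example_clean_rx_ring(napi, budget); delivering each
	 * skb via napi_gro_receive() or netif_receive_skb(). */

	if (work_done < budget) {
		napi_complete(napi);
		/* re-enable RX interrupts on the device here */
	}
	return work_done;
}

static inline void example_setup_napi(struct example_priv *priv)
{
	netif_napi_add(priv->dev, &priv->napi, example_poll, 64);
	napi_enable(&priv->napi);
}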
3745
3746
void netif_napi_del(struct napi_struct *napi)
3747
{
3748
struct sk_buff *skb, *next;
3749
3750
list_del_init(&napi->dev_list);
3751
napi_free_frags(napi);
3752
3753
for (skb = napi->gro_list; skb; skb = next) {
3754
next = skb->next;
3755
skb->next = NULL;
3756
kfree_skb(skb);
3757
}
3758
3759
napi->gro_list = NULL;
3760
napi->gro_count = 0;
3761
}
3762
EXPORT_SYMBOL(netif_napi_del);
3763
3764
static void net_rx_action(struct softirq_action *h)
3765
{
3766
struct softnet_data *sd = &__get_cpu_var(softnet_data);
3767
unsigned long time_limit = jiffies + 2;
3768
int budget = netdev_budget;
3769
void *have;
3770
3771
local_irq_disable();
3772
3773
while (!list_empty(&sd->poll_list)) {
3774
struct napi_struct *n;
3775
int work, weight;
3776
3777
		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which will allow
		 * an average latency of 1.5/HZ.
		 */
3781
if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3782
goto softnet_break;
3783
3784
local_irq_enable();
3785
3786
/* Even though interrupts have been re-enabled, this
3787
* access is safe because interrupts can only add new
3788
* entries to the tail of this list, and only ->poll()
3789
* calls can remove this head entry from the list.
3790
*/
3791
n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3792
3793
have = netpoll_poll_lock(n);
3794
3795
weight = n->weight;
3796
3797
/* This NAPI_STATE_SCHED test is for avoiding a race
3798
* with netpoll's poll_napi(). Only the entity which
3799
* obtains the lock and sees NAPI_STATE_SCHED set will
3800
* actually make the ->poll() call. Therefore we avoid
3801
* accidentally calling ->poll() when NAPI is not scheduled.
3802
*/
3803
work = 0;
3804
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3805
work = n->poll(n, weight);
3806
trace_napi_poll(n);
3807
}
3808
3809
WARN_ON_ONCE(work > weight);
3810
3811
budget -= work;
3812
3813
local_irq_disable();
3814
3815
/* Drivers must not modify the NAPI state if they
3816
* consume the entire weight. In such cases this code
3817
* still "owns" the NAPI instance and therefore can
3818
* move the instance around on the list at-will.
3819
*/
3820
if (unlikely(work == weight)) {
3821
if (unlikely(napi_disable_pending(n))) {
3822
local_irq_enable();
3823
napi_complete(n);
3824
local_irq_disable();
3825
} else
3826
list_move_tail(&n->poll_list, &sd->poll_list);
3827
}
3828
3829
netpoll_poll_unlock(have);
3830
}
3831
out:
3832
net_rps_action_and_irq_enable(sd);
3833
3834
#ifdef CONFIG_NET_DMA
3835
/*
3836
* There may not be any more sk_buffs coming right now, so push
3837
* any pending DMA copies to hardware
3838
*/
3839
dma_issue_pending_all();
3840
#endif
3841
3842
return;
3843
3844
softnet_break:
3845
sd->time_squeeze++;
3846
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3847
goto out;
3848
}
3849
3850
static gifconf_func_t *gifconf_list[NPROTO];
3851
3852
/**
3853
* register_gifconf - register a SIOCGIF handler
3854
* @family: Address family
3855
* @gifconf: Function handler
3856
*
3857
* Register protocol dependent address dumping routines. The handler
3858
* that is passed must not be freed or reused until it has been replaced
3859
* by another handler.
3860
*/
3861
int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3862
{
3863
if (family >= NPROTO)
3864
return -EINVAL;
3865
gifconf_list[family] = gifconf;
3866
return 0;
3867
}
3868
EXPORT_SYMBOL(register_gifconf);
3869
3870
3871
/*
3872
* Map an interface index to its name (SIOCGIFNAME)
3873
*/
3874
3875
/*
3876
* We need this ioctl for efficient implementation of the
3877
* if_indextoname() function required by the IPv6 API. Without
3878
* it, we would have to search all the interfaces to find a
3879
* match. --pb
3880
*/
3881
3882
static int dev_ifname(struct net *net, struct ifreq __user *arg)
3883
{
3884
struct net_device *dev;
3885
struct ifreq ifr;
3886
3887
/*
3888
* Fetch the caller's info block.
3889
*/
3890
3891
if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3892
return -EFAULT;
3893
3894
rcu_read_lock();
3895
dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3896
if (!dev) {
3897
rcu_read_unlock();
3898
return -ENODEV;
3899
}
3900
3901
strcpy(ifr.ifr_name, dev->name);
3902
rcu_read_unlock();
3903
3904
if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3905
return -EFAULT;
3906
return 0;
3907
}
3908
3909
/*
3910
* Perform a SIOCGIFCONF call. This structure will change
3911
* size eventually, and there is nothing I can do about it.
3912
* Thus we will need a 'compatibility mode'.
3913
*/
3914
3915
static int dev_ifconf(struct net *net, char __user *arg)
3916
{
3917
struct ifconf ifc;
3918
struct net_device *dev;
3919
char __user *pos;
3920
int len;
3921
int total;
3922
int i;
3923
3924
/*
3925
* Fetch the caller's info block.
3926
*/
3927
3928
if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3929
return -EFAULT;
3930
3931
pos = ifc.ifc_buf;
3932
len = ifc.ifc_len;
3933
3934
/*
3935
* Loop over the interfaces, and write an info block for each.
3936
*/
3937
3938
total = 0;
3939
for_each_netdev(net, dev) {
3940
for (i = 0; i < NPROTO; i++) {
3941
if (gifconf_list[i]) {
3942
int done;
3943
if (!pos)
3944
done = gifconf_list[i](dev, NULL, 0);
3945
else
3946
done = gifconf_list[i](dev, pos + total,
3947
len - total);
3948
if (done < 0)
3949
return -EFAULT;
3950
total += done;
3951
}
3952
}
3953
}
3954
3955
/*
3956
* All done. Write the updated control block back to the caller.
3957
*/
3958
ifc.ifc_len = total;
3959
3960
/*
3961
* Both BSD and Solaris return 0 here, so we do too.
3962
*/
3963
return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3964
}
3965
3966
#ifdef CONFIG_PROC_FS
3967
/*
3968
* This is invoked by the /proc filesystem handler to display a device
3969
* in detail.
3970
*/
3971
void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3972
__acquires(RCU)
3973
{
3974
struct net *net = seq_file_net(seq);
3975
loff_t off;
3976
struct net_device *dev;
3977
3978
rcu_read_lock();
3979
if (!*pos)
3980
return SEQ_START_TOKEN;
3981
3982
off = 1;
3983
for_each_netdev_rcu(net, dev)
3984
if (off++ == *pos)
3985
return dev;
3986
3987
return NULL;
3988
}
3989
3990
void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3991
{
3992
struct net_device *dev = v;
3993
3994
if (v == SEQ_START_TOKEN)
3995
dev = first_net_device_rcu(seq_file_net(seq));
3996
else
3997
dev = next_net_device_rcu(dev);
3998
3999
++*pos;
4000
return dev;
4001
}
4002
4003
void dev_seq_stop(struct seq_file *seq, void *v)
4004
__releases(RCU)
4005
{
4006
rcu_read_unlock();
4007
}
4008
4009
static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4010
{
4011
struct rtnl_link_stats64 temp;
4012
const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4013
4014
seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4015
"%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4016
dev->name, stats->rx_bytes, stats->rx_packets,
4017
stats->rx_errors,
4018
stats->rx_dropped + stats->rx_missed_errors,
4019
stats->rx_fifo_errors,
4020
stats->rx_length_errors + stats->rx_over_errors +
4021
stats->rx_crc_errors + stats->rx_frame_errors,
4022
stats->rx_compressed, stats->multicast,
4023
stats->tx_bytes, stats->tx_packets,
4024
stats->tx_errors, stats->tx_dropped,
4025
stats->tx_fifo_errors, stats->collisions,
4026
stats->tx_carrier_errors +
4027
stats->tx_aborted_errors +
4028
stats->tx_window_errors +
4029
stats->tx_heartbeat_errors,
4030
stats->tx_compressed);
4031
}
4032
4033
/*
4034
* Called from the PROCfs module. This now uses the new arbitrary sized
4035
* /proc/net interface to create /proc/net/dev
4036
*/
4037
static int dev_seq_show(struct seq_file *seq, void *v)
4038
{
4039
if (v == SEQ_START_TOKEN)
4040
seq_puts(seq, "Inter-| Receive "
4041
" | Transmit\n"
4042
" face |bytes packets errs drop fifo frame "
4043
"compressed multicast|bytes packets errs "
4044
"drop fifo colls carrier compressed\n");
4045
else
4046
dev_seq_printf_stats(seq, v);
4047
return 0;
4048
}
4049
4050
static struct softnet_data *softnet_get_online(loff_t *pos)
4051
{
4052
struct softnet_data *sd = NULL;
4053
4054
while (*pos < nr_cpu_ids)
4055
if (cpu_online(*pos)) {
4056
sd = &per_cpu(softnet_data, *pos);
4057
break;
4058
} else
4059
++*pos;
4060
return sd;
4061
}
4062
4063
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4064
{
4065
return softnet_get_online(pos);
4066
}
4067
4068
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4069
{
4070
++*pos;
4071
return softnet_get_online(pos);
4072
}
4073
4074
static void softnet_seq_stop(struct seq_file *seq, void *v)
4075
{
4076
}
4077
4078
static int softnet_seq_show(struct seq_file *seq, void *v)
4079
{
4080
struct softnet_data *sd = v;
4081
4082
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4083
sd->processed, sd->dropped, sd->time_squeeze, 0,
4084
0, 0, 0, 0, /* was fastroute */
4085
sd->cpu_collision, sd->received_rps);
4086
return 0;
4087
}
4088
4089
static const struct seq_operations dev_seq_ops = {
4090
.start = dev_seq_start,
4091
.next = dev_seq_next,
4092
.stop = dev_seq_stop,
4093
.show = dev_seq_show,
4094
};
4095
4096
static int dev_seq_open(struct inode *inode, struct file *file)
4097
{
4098
return seq_open_net(inode, file, &dev_seq_ops,
4099
sizeof(struct seq_net_private));
4100
}
4101
4102
static const struct file_operations dev_seq_fops = {
4103
.owner = THIS_MODULE,
4104
.open = dev_seq_open,
4105
.read = seq_read,
4106
.llseek = seq_lseek,
4107
.release = seq_release_net,
4108
};
4109
4110
static const struct seq_operations softnet_seq_ops = {
4111
.start = softnet_seq_start,
4112
.next = softnet_seq_next,
4113
.stop = softnet_seq_stop,
4114
.show = softnet_seq_show,
4115
};
4116
4117
static int softnet_seq_open(struct inode *inode, struct file *file)
4118
{
4119
return seq_open(file, &softnet_seq_ops);
4120
}
4121
4122
static const struct file_operations softnet_seq_fops = {
4123
.owner = THIS_MODULE,
4124
.open = softnet_seq_open,
4125
.read = seq_read,
4126
.llseek = seq_lseek,
4127
.release = seq_release,
4128
};
4129
4130
static void *ptype_get_idx(loff_t pos)
4131
{
4132
struct packet_type *pt = NULL;
4133
loff_t i = 0;
4134
int t;
4135
4136
list_for_each_entry_rcu(pt, &ptype_all, list) {
4137
if (i == pos)
4138
return pt;
4139
++i;
4140
}
4141
4142
for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4143
list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4144
if (i == pos)
4145
return pt;
4146
++i;
4147
}
4148
}
4149
return NULL;
4150
}
4151
4152
static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4153
__acquires(RCU)
4154
{
4155
rcu_read_lock();
4156
return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4157
}
4158
4159
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4160
{
4161
struct packet_type *pt;
4162
struct list_head *nxt;
4163
int hash;
4164
4165
++*pos;
4166
if (v == SEQ_START_TOKEN)
4167
return ptype_get_idx(0);
4168
4169
pt = v;
4170
nxt = pt->list.next;
4171
if (pt->type == htons(ETH_P_ALL)) {
4172
if (nxt != &ptype_all)
4173
goto found;
4174
hash = 0;
4175
nxt = ptype_base[0].next;
4176
} else
4177
hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4178
4179
while (nxt == &ptype_base[hash]) {
4180
if (++hash >= PTYPE_HASH_SIZE)
4181
return NULL;
4182
nxt = ptype_base[hash].next;
4183
}
4184
found:
4185
return list_entry(nxt, struct packet_type, list);
4186
}
4187
4188
static void ptype_seq_stop(struct seq_file *seq, void *v)
4189
__releases(RCU)
4190
{
4191
rcu_read_unlock();
4192
}
4193
4194
static int ptype_seq_show(struct seq_file *seq, void *v)
4195
{
4196
struct packet_type *pt = v;
4197
4198
if (v == SEQ_START_TOKEN)
4199
seq_puts(seq, "Type Device Function\n");
4200
else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4201
if (pt->type == htons(ETH_P_ALL))
4202
seq_puts(seq, "ALL ");
4203
else
4204
seq_printf(seq, "%04x", ntohs(pt->type));
4205
4206
seq_printf(seq, " %-8s %pF\n",
4207
pt->dev ? pt->dev->name : "", pt->func);
4208
}
4209
4210
return 0;
4211
}
4212
4213
static const struct seq_operations ptype_seq_ops = {
4214
.start = ptype_seq_start,
4215
.next = ptype_seq_next,
4216
.stop = ptype_seq_stop,
4217
.show = ptype_seq_show,
4218
};
4219
4220
static int ptype_seq_open(struct inode *inode, struct file *file)
4221
{
4222
return seq_open_net(inode, file, &ptype_seq_ops,
4223
sizeof(struct seq_net_private));
4224
}
4225
4226
static const struct file_operations ptype_seq_fops = {
4227
.owner = THIS_MODULE,
4228
.open = ptype_seq_open,
4229
.read = seq_read,
4230
.llseek = seq_lseek,
4231
.release = seq_release_net,
4232
};
4233
4234
4235
static int __net_init dev_proc_net_init(struct net *net)
4236
{
4237
int rc = -ENOMEM;
4238
4239
if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4240
goto out;
4241
if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4242
goto out_dev;
4243
if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4244
goto out_softnet;
4245
4246
if (wext_proc_init(net))
4247
goto out_ptype;
4248
rc = 0;
4249
out:
4250
return rc;
4251
out_ptype:
4252
proc_net_remove(net, "ptype");
4253
out_softnet:
4254
proc_net_remove(net, "softnet_stat");
4255
out_dev:
4256
proc_net_remove(net, "dev");
4257
goto out;
4258
}
4259
4260
static void __net_exit dev_proc_net_exit(struct net *net)
4261
{
4262
wext_proc_exit(net);
4263
4264
proc_net_remove(net, "ptype");
4265
proc_net_remove(net, "softnet_stat");
4266
proc_net_remove(net, "dev");
4267
}
4268
4269
static struct pernet_operations __net_initdata dev_proc_ops = {
4270
.init = dev_proc_net_init,
4271
.exit = dev_proc_net_exit,
4272
};
4273
4274
static int __init dev_proc_init(void)
4275
{
4276
return register_pernet_subsys(&dev_proc_ops);
4277
}
4278
#else
4279
#define dev_proc_init() 0
4280
#endif /* CONFIG_PROC_FS */
4281
4282
4283
/**
4284
* netdev_set_master - set up master pointer
4285
* @slave: slave device
4286
* @master: new master device
4287
*
4288
* Changes the master device of the slave. Pass %NULL to break the
4289
* bonding. The caller must hold the RTNL semaphore. On a failure
4290
* a negative errno code is returned. On success the reference counts
4291
* are adjusted and the function returns zero.
4292
*/
4293
int netdev_set_master(struct net_device *slave, struct net_device *master)
4294
{
4295
struct net_device *old = slave->master;
4296
4297
ASSERT_RTNL();
4298
4299
if (master) {
4300
if (old)
4301
return -EBUSY;
4302
dev_hold(master);
4303
}
4304
4305
slave->master = master;
4306
4307
if (old)
4308
dev_put(old);
4309
return 0;
4310
}
4311
EXPORT_SYMBOL(netdev_set_master);
4312
4313
/**
4314
* netdev_set_bond_master - set up bonding master/slave pair
4315
* @slave: slave device
4316
* @master: new master device
4317
*
4318
* Changes the master device of the slave. Pass %NULL to break the
4319
* bonding. The caller must hold the RTNL semaphore. On a failure
4320
* a negative errno code is returned. On success %RTM_NEWLINK is sent
4321
* to the routing socket and the function returns zero.
4322
*/
4323
int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4324
{
4325
int err;
4326
4327
ASSERT_RTNL();
4328
4329
err = netdev_set_master(slave, master);
4330
if (err)
4331
return err;
4332
if (master)
4333
slave->flags |= IFF_SLAVE;
4334
else
4335
slave->flags &= ~IFF_SLAVE;
4336
4337
rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4338
return 0;
4339
}
4340
EXPORT_SYMBOL(netdev_set_bond_master);
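/*
 * Example (illustrative sketch, not part of dev.c): how a bonding-style
 * driver might use netdev_set_bond_master() above.  The function names and
 * the surrounding driver context are assumptions; only the RTNL requirement
 * and the calls themselves mirror the API documented above.
 */
#if 0
static int example_enslave(struct net_device *bond_dev,
                           struct net_device *slave_dev)
{
        int err;

        ASSERT_RTNL();          /* caller must hold the RTNL semaphore */

        err = netdev_set_bond_master(slave_dev, bond_dev);
        if (err)
                return err;     /* e.g. -EBUSY if the slave already has a master */

        /* ... driver-specific programming of the slave would go here ... */
        return 0;
}

static void example_release(struct net_device *slave_dev)
{
        ASSERT_RTNL();
        netdev_set_bond_master(slave_dev, NULL);        /* break the bond */
}
#endif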
4341
4342
static void dev_change_rx_flags(struct net_device *dev, int flags)
4343
{
4344
const struct net_device_ops *ops = dev->netdev_ops;
4345
4346
if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4347
ops->ndo_change_rx_flags(dev, flags);
4348
}
4349
4350
static int __dev_set_promiscuity(struct net_device *dev, int inc)
4351
{
4352
unsigned short old_flags = dev->flags;
4353
uid_t uid;
4354
gid_t gid;
4355
4356
ASSERT_RTNL();
4357
4358
dev->flags |= IFF_PROMISC;
4359
dev->promiscuity += inc;
4360
if (dev->promiscuity == 0) {
4361
/*
4362
* Avoid overflow.
4363
* If inc causes overflow, untouch promisc and return error.
4364
*/
4365
if (inc < 0)
4366
dev->flags &= ~IFF_PROMISC;
4367
else {
4368
dev->promiscuity -= inc;
4369
printk(KERN_WARNING "%s: promiscuity touches roof, "
4370
"set promiscuity failed, promiscuity feature "
4371
"of device might be broken.\n", dev->name);
4372
return -EOVERFLOW;
4373
}
4374
}
4375
if (dev->flags != old_flags) {
4376
printk(KERN_INFO "device %s %s promiscuous mode\n",
4377
dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4378
"left");
4379
if (audit_enabled) {
4380
current_uid_gid(&uid, &gid);
4381
audit_log(current->audit_context, GFP_ATOMIC,
4382
AUDIT_ANOM_PROMISCUOUS,
4383
"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4384
dev->name, (dev->flags & IFF_PROMISC),
4385
(old_flags & IFF_PROMISC),
4386
audit_get_loginuid(current),
4387
uid, gid,
4388
audit_get_sessionid(current));
4389
}
4390
4391
dev_change_rx_flags(dev, IFF_PROMISC);
4392
}
4393
return 0;
4394
}
4395
4396
/**
4397
* dev_set_promiscuity - update promiscuity count on a device
4398
* @dev: device
4399
* @inc: modifier
4400
*
4401
* Add or remove promiscuity from a device. While the count in the device
4402
* remains above zero the interface remains promiscuous. Once it hits zero
4403
* the device reverts back to normal filtering operation. A negative inc
4404
* value is used to drop promiscuity on the device.
4405
* Return 0 if successful or a negative errno code on error.
4406
*/
4407
int dev_set_promiscuity(struct net_device *dev, int inc)
4408
{
4409
unsigned short old_flags = dev->flags;
4410
int err;
4411
4412
err = __dev_set_promiscuity(dev, inc);
4413
if (err < 0)
4414
return err;
4415
if (dev->flags != old_flags)
4416
dev_set_rx_mode(dev);
4417
return err;
4418
}
4419
EXPORT_SYMBOL(dev_set_promiscuity);
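/*
 * Example (illustrative sketch, not part of dev.c): the refcounted use of
 * dev_set_promiscuity() described above, as a packet-capture style module
 * might do it.  The function names are assumptions; the +1/-1 pairing and
 * the RTNL requirement are the points being illustrated.
 */
#if 0
static int example_capture_start(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);      /* bump the promiscuity count */
        rtnl_unlock();
        return err;
}

static void example_capture_stop(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);   /* drop our reference; the device leaves
                                         * promiscuous mode when the count hits zero */
        rtnl_unlock();
}
#endif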
4420
4421
/**
4422
* dev_set_allmulti - update allmulti count on a device
4423
* @dev: device
4424
* @inc: modifier
4425
*
4426
* Add or remove reception of all multicast frames to a device. While the
4427
* count in the device remains above zero the interface remains listening
4428
* to all interfaces. Once it hits zero the device reverts back to normal
4429
* filtering operation. A negative @inc value is used to drop the counter
4430
* when releasing a resource needing all multicasts.
4431
* Return 0 if successful or a negative errno code on error.
4432
*/
4433
4434
int dev_set_allmulti(struct net_device *dev, int inc)
4435
{
4436
unsigned short old_flags = dev->flags;
4437
4438
ASSERT_RTNL();
4439
4440
dev->flags |= IFF_ALLMULTI;
4441
dev->allmulti += inc;
4442
if (dev->allmulti == 0) {
4443
/*
4444
* Avoid overflow.
4445
* If inc causes overflow, untouch allmulti and return error.
4446
*/
4447
if (inc < 0)
4448
dev->flags &= ~IFF_ALLMULTI;
4449
else {
4450
dev->allmulti -= inc;
4451
printk(KERN_WARNING "%s: allmulti touches roof, "
4452
"set allmulti failed, allmulti feature of "
4453
"device might be broken.\n", dev->name);
4454
return -EOVERFLOW;
4455
}
4456
}
4457
if (dev->flags ^ old_flags) {
4458
dev_change_rx_flags(dev, IFF_ALLMULTI);
4459
dev_set_rx_mode(dev);
4460
}
4461
return 0;
4462
}
4463
EXPORT_SYMBOL(dev_set_allmulti);
4464
4465
/*
4466
* Upload unicast and multicast address lists to device and
4467
* configure RX filtering. When the device doesn't support unicast
4468
* filtering it is put in promiscuous mode while unicast addresses
4469
* are present.
4470
*/
4471
void __dev_set_rx_mode(struct net_device *dev)
4472
{
4473
const struct net_device_ops *ops = dev->netdev_ops;
4474
4475
/* dev_open will call this function so the list will stay sane. */
4476
if (!(dev->flags&IFF_UP))
4477
return;
4478
4479
if (!netif_device_present(dev))
4480
return;
4481
4482
if (ops->ndo_set_rx_mode)
4483
ops->ndo_set_rx_mode(dev);
4484
else {
4485
/* Unicast addresses changes may only happen under the rtnl,
4486
* therefore calling __dev_set_promiscuity here is safe.
4487
*/
4488
if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4489
__dev_set_promiscuity(dev, 1);
4490
dev->uc_promisc = 1;
4491
} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4492
__dev_set_promiscuity(dev, -1);
4493
dev->uc_promisc = 0;
4494
}
4495
4496
if (ops->ndo_set_multicast_list)
4497
ops->ndo_set_multicast_list(dev);
4498
}
4499
}
4500
4501
void dev_set_rx_mode(struct net_device *dev)
4502
{
4503
netif_addr_lock_bh(dev);
4504
__dev_set_rx_mode(dev);
4505
netif_addr_unlock_bh(dev);
4506
}
4507
4508
/**
4509
* dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4510
* @dev: device
4511
* @cmd: memory area for ethtool_ops::get_settings() result
4512
*
4513
* The cmd arg is initialized properly (cleared and
4514
* ethtool_cmd::cmd field set to ETHTOOL_GSET).
4515
*
4516
* Return device's ethtool_ops::get_settings() result value or
4517
* -EOPNOTSUPP when device doesn't expose
4518
* ethtool_ops::get_settings() operation.
4519
*/
4520
int dev_ethtool_get_settings(struct net_device *dev,
4521
struct ethtool_cmd *cmd)
4522
{
4523
if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4524
return -EOPNOTSUPP;
4525
4526
memset(cmd, 0, sizeof(struct ethtool_cmd));
4527
cmd->cmd = ETHTOOL_GSET;
4528
return dev->ethtool_ops->get_settings(dev, cmd);
4529
}
4530
EXPORT_SYMBOL(dev_ethtool_get_settings);
4531
4532
/**
4533
* dev_get_flags - get flags reported to userspace
4534
* @dev: device
4535
*
4536
* Get the combination of flag bits exported through APIs to userspace.
4537
*/
4538
unsigned dev_get_flags(const struct net_device *dev)
4539
{
4540
unsigned flags;
4541
4542
flags = (dev->flags & ~(IFF_PROMISC |
4543
IFF_ALLMULTI |
4544
IFF_RUNNING |
4545
IFF_LOWER_UP |
4546
IFF_DORMANT)) |
4547
(dev->gflags & (IFF_PROMISC |
4548
IFF_ALLMULTI));
4549
4550
if (netif_running(dev)) {
4551
if (netif_oper_up(dev))
4552
flags |= IFF_RUNNING;
4553
if (netif_carrier_ok(dev))
4554
flags |= IFF_LOWER_UP;
4555
if (netif_dormant(dev))
4556
flags |= IFF_DORMANT;
4557
}
4558
4559
return flags;
4560
}
4561
EXPORT_SYMBOL(dev_get_flags);
4562
4563
int __dev_change_flags(struct net_device *dev, unsigned int flags)
4564
{
4565
int old_flags = dev->flags;
4566
int ret;
4567
4568
ASSERT_RTNL();
4569
4570
/*
4571
* Set the flags on our device.
4572
*/
4573
4574
dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4575
IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4576
IFF_AUTOMEDIA)) |
4577
(dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4578
IFF_ALLMULTI));
4579
4580
/*
4581
* Load in the correct multicast list now the flags have changed.
4582
*/
4583
4584
if ((old_flags ^ flags) & IFF_MULTICAST)
4585
dev_change_rx_flags(dev, IFF_MULTICAST);
4586
4587
dev_set_rx_mode(dev);
4588
4589
/*
4590
* Have we downed the interface? We handle IFF_UP ourselves
4591
* according to user attempts to set it, rather than blindly
4592
* setting it.
4593
*/
4594
4595
ret = 0;
4596
if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4597
ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4598
4599
if (!ret)
4600
dev_set_rx_mode(dev);
4601
}
4602
4603
if ((flags ^ dev->gflags) & IFF_PROMISC) {
4604
int inc = (flags & IFF_PROMISC) ? 1 : -1;
4605
4606
dev->gflags ^= IFF_PROMISC;
4607
dev_set_promiscuity(dev, inc);
4608
}
4609
4610
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4611
is important. Some (broken) drivers set IFF_PROMISC, when
4612
IFF_ALLMULTI is requested, without asking us and without reporting.
4613
*/
4614
if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4615
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4616
4617
dev->gflags ^= IFF_ALLMULTI;
4618
dev_set_allmulti(dev, inc);
4619
}
4620
4621
return ret;
4622
}
4623
4624
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4625
{
4626
unsigned int changes = dev->flags ^ old_flags;
4627
4628
if (changes & IFF_UP) {
4629
if (dev->flags & IFF_UP)
4630
call_netdevice_notifiers(NETDEV_UP, dev);
4631
else
4632
call_netdevice_notifiers(NETDEV_DOWN, dev);
4633
}
4634
4635
if (dev->flags & IFF_UP &&
4636
(changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4637
call_netdevice_notifiers(NETDEV_CHANGE, dev);
4638
}
4639
4640
/**
4641
* dev_change_flags - change device settings
4642
* @dev: device
4643
* @flags: device state flags
4644
*
4645
* Change settings on a device based on state flags. The flags are
4646
* in the userspace exported format.
4647
*/
4648
int dev_change_flags(struct net_device *dev, unsigned flags)
4649
{
4650
int ret, changes;
4651
int old_flags = dev->flags;
4652
4653
ret = __dev_change_flags(dev, flags);
4654
if (ret < 0)
4655
return ret;
4656
4657
changes = old_flags ^ dev->flags;
4658
if (changes)
4659
rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4660
4661
__dev_notify_flags(dev, old_flags);
4662
return ret;
4663
}
4664
EXPORT_SYMBOL(dev_change_flags);
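/*
 * Example (illustrative sketch, not part of dev.c): bringing an interface
 * administratively up with dev_change_flags(), which takes flags in the
 * userspace-exported format.  Looking the device up by name and the helper
 * name are assumptions made for the example only.
 */
#if 0
static int example_bring_up(struct net *net, const char *name)
{
        struct net_device *dev;
        int err = -ENODEV;

        rtnl_lock();
        dev = __dev_get_by_name(net, name);     /* e.g. "eth0" */
        if (dev)
                err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
        rtnl_unlock();
        return err;
}
#endif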
4665
4666
/**
4667
* dev_set_mtu - Change maximum transfer unit
4668
* @dev: device
4669
* @new_mtu: new transfer unit
4670
*
4671
* Change the maximum transfer size of the network device.
4672
*/
4673
int dev_set_mtu(struct net_device *dev, int new_mtu)
4674
{
4675
const struct net_device_ops *ops = dev->netdev_ops;
4676
int err;
4677
4678
if (new_mtu == dev->mtu)
4679
return 0;
4680
4681
/* MTU must be positive. */
4682
if (new_mtu < 0)
4683
return -EINVAL;
4684
4685
if (!netif_device_present(dev))
4686
return -ENODEV;
4687
4688
err = 0;
4689
if (ops->ndo_change_mtu)
4690
err = ops->ndo_change_mtu(dev, new_mtu);
4691
else
4692
dev->mtu = new_mtu;
4693
4694
if (!err && dev->flags & IFF_UP)
4695
call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4696
return err;
4697
}
4698
EXPORT_SYMBOL(dev_set_mtu);
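/*
 * Example (illustrative sketch, not part of dev.c): changing the MTU from
 * kernel code via dev_set_mtu() above.  The jumbo-frame value 9000 and the
 * helper name are assumptions; the driver may still reject the value through
 * its ndo_change_mtu() callback.
 */
#if 0
static int example_enable_jumbo(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, 9000);   /* NETDEV_CHANGEMTU is sent on success */
        rtnl_unlock();
        return err;
}
#endif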
4699
4700
/**
4701
* dev_set_group - Change group this device belongs to
4702
* @dev: device
4703
* @new_group: group this device should belong to
4704
*/
4705
void dev_set_group(struct net_device *dev, int new_group)
4706
{
4707
dev->group = new_group;
4708
}
4709
EXPORT_SYMBOL(dev_set_group);
4710
4711
/**
4712
* dev_set_mac_address - Change Media Access Control Address
4713
* @dev: device
4714
* @sa: new address
4715
*
4716
* Change the hardware (MAC) address of the device
4717
*/
4718
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4719
{
4720
const struct net_device_ops *ops = dev->netdev_ops;
4721
int err;
4722
4723
if (!ops->ndo_set_mac_address)
4724
return -EOPNOTSUPP;
4725
if (sa->sa_family != dev->type)
4726
return -EINVAL;
4727
if (!netif_device_present(dev))
4728
return -ENODEV;
4729
err = ops->ndo_set_mac_address(dev, sa);
4730
if (!err)
4731
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4732
return err;
4733
}
4734
EXPORT_SYMBOL(dev_set_mac_address);
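/*
 * Example (illustrative sketch, not part of dev.c): setting a MAC address
 * through dev_set_mac_address() above.  The address bytes are an arbitrary
 * locally administered example; sa_family must match dev->type or the call
 * returns -EINVAL, as the code above shows.
 */
#if 0
static int example_set_mac(struct net_device *dev)
{
        static const u8 addr[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
        struct sockaddr sa;
        int err;

        sa.sa_family = dev->type;               /* must equal dev->type */
        memcpy(sa.sa_data, addr, ETH_ALEN);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);    /* NETDEV_CHANGEADDR on success */
        rtnl_unlock();
        return err;
}
#endif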
4735
4736
/*
4737
* Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4738
*/
4739
static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4740
{
4741
int err;
4742
struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4743
4744
if (!dev)
4745
return -ENODEV;
4746
4747
switch (cmd) {
4748
case SIOCGIFFLAGS: /* Get interface flags */
4749
ifr->ifr_flags = (short) dev_get_flags(dev);
4750
return 0;
4751
4752
case SIOCGIFMETRIC: /* Get the metric on the interface
4753
(currently unused) */
4754
ifr->ifr_metric = 0;
4755
return 0;
4756
4757
case SIOCGIFMTU: /* Get the MTU of a device */
4758
ifr->ifr_mtu = dev->mtu;
4759
return 0;
4760
4761
case SIOCGIFHWADDR:
4762
if (!dev->addr_len)
4763
memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4764
else
4765
memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4766
min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4767
ifr->ifr_hwaddr.sa_family = dev->type;
4768
return 0;
4769
4770
case SIOCGIFSLAVE:
4771
err = -EINVAL;
4772
break;
4773
4774
case SIOCGIFMAP:
4775
ifr->ifr_map.mem_start = dev->mem_start;
4776
ifr->ifr_map.mem_end = dev->mem_end;
4777
ifr->ifr_map.base_addr = dev->base_addr;
4778
ifr->ifr_map.irq = dev->irq;
4779
ifr->ifr_map.dma = dev->dma;
4780
ifr->ifr_map.port = dev->if_port;
4781
return 0;
4782
4783
case SIOCGIFINDEX:
4784
ifr->ifr_ifindex = dev->ifindex;
4785
return 0;
4786
4787
case SIOCGIFTXQLEN:
4788
ifr->ifr_qlen = dev->tx_queue_len;
4789
return 0;
4790
4791
default:
4792
/* dev_ioctl() should ensure this case
4793
* is never reached
4794
*/
4795
WARN_ON(1);
4796
err = -ENOTTY;
4797
break;
4798
4799
}
4800
return err;
4801
}
4802
4803
/*
4804
* Perform the SIOCxIFxxx calls, inside rtnl_lock()
4805
*/
4806
static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4807
{
4808
int err;
4809
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4810
const struct net_device_ops *ops;
4811
4812
if (!dev)
4813
return -ENODEV;
4814
4815
ops = dev->netdev_ops;
4816
4817
switch (cmd) {
4818
case SIOCSIFFLAGS: /* Set interface flags */
4819
return dev_change_flags(dev, ifr->ifr_flags);
4820
4821
case SIOCSIFMETRIC: /* Set the metric on the interface
4822
(currently unused) */
4823
return -EOPNOTSUPP;
4824
4825
case SIOCSIFMTU: /* Set the MTU of a device */
4826
return dev_set_mtu(dev, ifr->ifr_mtu);
4827
4828
case SIOCSIFHWADDR:
4829
return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4830
4831
case SIOCSIFHWBROADCAST:
4832
if (ifr->ifr_hwaddr.sa_family != dev->type)
4833
return -EINVAL;
4834
memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4835
min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4836
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4837
return 0;
4838
4839
case SIOCSIFMAP:
4840
if (ops->ndo_set_config) {
4841
if (!netif_device_present(dev))
4842
return -ENODEV;
4843
return ops->ndo_set_config(dev, &ifr->ifr_map);
4844
}
4845
return -EOPNOTSUPP;
4846
4847
case SIOCADDMULTI:
4848
if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4849
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4850
return -EINVAL;
4851
if (!netif_device_present(dev))
4852
return -ENODEV;
4853
return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4854
4855
case SIOCDELMULTI:
4856
if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4857
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4858
return -EINVAL;
4859
if (!netif_device_present(dev))
4860
return -ENODEV;
4861
return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4862
4863
case SIOCSIFTXQLEN:
4864
if (ifr->ifr_qlen < 0)
4865
return -EINVAL;
4866
dev->tx_queue_len = ifr->ifr_qlen;
4867
return 0;
4868
4869
case SIOCSIFNAME:
4870
ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4871
return dev_change_name(dev, ifr->ifr_newname);
4872
4873
/*
4874
* Unknown or private ioctl
4875
*/
4876
default:
4877
if ((cmd >= SIOCDEVPRIVATE &&
4878
cmd <= SIOCDEVPRIVATE + 15) ||
4879
cmd == SIOCBONDENSLAVE ||
4880
cmd == SIOCBONDRELEASE ||
4881
cmd == SIOCBONDSETHWADDR ||
4882
cmd == SIOCBONDSLAVEINFOQUERY ||
4883
cmd == SIOCBONDINFOQUERY ||
4884
cmd == SIOCBONDCHANGEACTIVE ||
4885
cmd == SIOCGMIIPHY ||
4886
cmd == SIOCGMIIREG ||
4887
cmd == SIOCSMIIREG ||
4888
cmd == SIOCBRADDIF ||
4889
cmd == SIOCBRDELIF ||
4890
cmd == SIOCSHWTSTAMP ||
4891
cmd == SIOCWANDEV) {
4892
err = -EOPNOTSUPP;
4893
if (ops->ndo_do_ioctl) {
4894
if (netif_device_present(dev))
4895
err = ops->ndo_do_ioctl(dev, ifr, cmd);
4896
else
4897
err = -ENODEV;
4898
}
4899
} else
4900
err = -EINVAL;
4901
4902
}
4903
return err;
4904
}
4905
4906
/*
4907
* This function handles all "interface"-type I/O control requests. The actual
4908
* 'doing' part of this is dev_ifsioc above.
4909
*/
4910
4911
/**
4912
* dev_ioctl - network device ioctl
4913
* @net: the applicable net namespace
4914
* @cmd: command to issue
4915
* @arg: pointer to a struct ifreq in user space
4916
*
4917
* Issue ioctl functions to devices. This is normally called by the
4918
* user space syscall interfaces but can sometimes be useful for
4919
* other purposes. The return value is the return from the syscall if
4920
* positive or a negative errno code on error.
4921
*/
4922
4923
int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4924
{
4925
struct ifreq ifr;
4926
int ret;
4927
char *colon;
4928
4929
/* One special case: SIOCGIFCONF takes ifconf argument
4930
and requires a shared lock, because it sleeps while writing
4931
to user space.
4932
*/
4933
4934
if (cmd == SIOCGIFCONF) {
4935
rtnl_lock();
4936
ret = dev_ifconf(net, (char __user *) arg);
4937
rtnl_unlock();
4938
return ret;
4939
}
4940
if (cmd == SIOCGIFNAME)
4941
return dev_ifname(net, (struct ifreq __user *)arg);
4942
4943
if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4944
return -EFAULT;
4945
4946
ifr.ifr_name[IFNAMSIZ-1] = 0;
4947
4948
colon = strchr(ifr.ifr_name, ':');
4949
if (colon)
4950
*colon = 0;
4951
4952
/*
4953
* See which interface the caller is talking about.
4954
*/
4955
4956
switch (cmd) {
4957
/*
4958
* These ioctl calls:
4959
* - can be done by all.
4960
* - atomic and do not require locking.
4961
* - return a value
4962
*/
4963
case SIOCGIFFLAGS:
4964
case SIOCGIFMETRIC:
4965
case SIOCGIFMTU:
4966
case SIOCGIFHWADDR:
4967
case SIOCGIFSLAVE:
4968
case SIOCGIFMAP:
4969
case SIOCGIFINDEX:
4970
case SIOCGIFTXQLEN:
4971
dev_load(net, ifr.ifr_name);
4972
rcu_read_lock();
4973
ret = dev_ifsioc_locked(net, &ifr, cmd);
4974
rcu_read_unlock();
4975
if (!ret) {
4976
if (colon)
4977
*colon = ':';
4978
if (copy_to_user(arg, &ifr,
4979
sizeof(struct ifreq)))
4980
ret = -EFAULT;
4981
}
4982
return ret;
4983
4984
case SIOCETHTOOL:
4985
dev_load(net, ifr.ifr_name);
4986
rtnl_lock();
4987
ret = dev_ethtool(net, &ifr);
4988
rtnl_unlock();
4989
if (!ret) {
4990
if (colon)
4991
*colon = ':';
4992
if (copy_to_user(arg, &ifr,
4993
sizeof(struct ifreq)))
4994
ret = -EFAULT;
4995
}
4996
return ret;
4997
4998
/*
4999
* These ioctl calls:
5000
* - require superuser power.
5001
* - require strict serialization.
5002
* - return a value
5003
*/
5004
case SIOCGMIIPHY:
5005
case SIOCGMIIREG:
5006
case SIOCSIFNAME:
5007
if (!capable(CAP_NET_ADMIN))
5008
return -EPERM;
5009
dev_load(net, ifr.ifr_name);
5010
rtnl_lock();
5011
ret = dev_ifsioc(net, &ifr, cmd);
5012
rtnl_unlock();
5013
if (!ret) {
5014
if (colon)
5015
*colon = ':';
5016
if (copy_to_user(arg, &ifr,
5017
sizeof(struct ifreq)))
5018
ret = -EFAULT;
5019
}
5020
return ret;
5021
5022
/*
5023
* These ioctl calls:
5024
* - require superuser power.
5025
* - require strict serialization.
5026
* - do not return a value
5027
*/
5028
case SIOCSIFFLAGS:
5029
case SIOCSIFMETRIC:
5030
case SIOCSIFMTU:
5031
case SIOCSIFMAP:
5032
case SIOCSIFHWADDR:
5033
case SIOCSIFSLAVE:
5034
case SIOCADDMULTI:
5035
case SIOCDELMULTI:
5036
case SIOCSIFHWBROADCAST:
5037
case SIOCSIFTXQLEN:
5038
case SIOCSMIIREG:
5039
case SIOCBONDENSLAVE:
5040
case SIOCBONDRELEASE:
5041
case SIOCBONDSETHWADDR:
5042
case SIOCBONDCHANGEACTIVE:
5043
case SIOCBRADDIF:
5044
case SIOCBRDELIF:
5045
case SIOCSHWTSTAMP:
5046
if (!capable(CAP_NET_ADMIN))
5047
return -EPERM;
5048
/* fall through */
5049
case SIOCBONDSLAVEINFOQUERY:
5050
case SIOCBONDINFOQUERY:
5051
dev_load(net, ifr.ifr_name);
5052
rtnl_lock();
5053
ret = dev_ifsioc(net, &ifr, cmd);
5054
rtnl_unlock();
5055
return ret;
5056
5057
case SIOCGIFMEM:
5058
/* Get the per device memory space. We can add this but
5059
* currently do not support it */
5060
case SIOCSIFMEM:
5061
/* Set the per device memory buffer space.
5062
* Not applicable in our case */
5063
case SIOCSIFLINK:
5064
return -ENOTTY;
5065
5066
/*
5067
* Unknown or private ioctl.
5068
*/
5069
default:
5070
if (cmd == SIOCWANDEV ||
5071
(cmd >= SIOCDEVPRIVATE &&
5072
cmd <= SIOCDEVPRIVATE + 15)) {
5073
dev_load(net, ifr.ifr_name);
5074
rtnl_lock();
5075
ret = dev_ifsioc(net, &ifr, cmd);
5076
rtnl_unlock();
5077
if (!ret && copy_to_user(arg, &ifr,
5078
sizeof(struct ifreq)))
5079
ret = -EFAULT;
5080
return ret;
5081
}
5082
/* Take care of Wireless Extensions */
5083
if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5084
return wext_handle_ioctl(net, &ifr, cmd, arg);
5085
return -ENOTTY;
5086
}
5087
}
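/*
 * Example (illustrative sketch, not part of dev.c): a userspace use of the
 * SIOCGIFMTU path handled by dev_ioctl()/dev_ifsioc_locked() above.  The
 * interface name "eth0" is an assumption for the example.
 */
#if 0   /* userspace example, kept out of the kernel build */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int main(void)
{
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);    /* assumed device name */

        if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)   /* read path: no CAP_NET_ADMIN needed */
                printf("%s mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);

        close(fd);
        return 0;
}
#endif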
5088
5089
5090
/**
5091
* dev_new_index - allocate an ifindex
5092
* @net: the applicable net namespace
5093
*
5094
* Returns a suitable unique value for a new device interface
5095
* number. The caller must hold the rtnl semaphore or the
5096
* dev_base_lock to be sure it remains unique.
5097
*/
5098
static int dev_new_index(struct net *net)
5099
{
5100
static int ifindex;
5101
for (;;) {
5102
if (++ifindex <= 0)
5103
ifindex = 1;
5104
if (!__dev_get_by_index(net, ifindex))
5105
return ifindex;
5106
}
5107
}
5108
5109
/* Delayed registration/unregisteration */
5110
static LIST_HEAD(net_todo_list);
5111
5112
static void net_set_todo(struct net_device *dev)
5113
{
5114
list_add_tail(&dev->todo_list, &net_todo_list);
5115
}
5116
5117
static void rollback_registered_many(struct list_head *head)
5118
{
5119
struct net_device *dev, *tmp;
5120
5121
BUG_ON(dev_boot_phase);
5122
ASSERT_RTNL();
5123
5124
list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5125
/* Some devices call without registering
5126
* for initialization unwind. Remove those
5127
* devices and proceed with the remaining.
5128
*/
5129
if (dev->reg_state == NETREG_UNINITIALIZED) {
5130
pr_debug("unregister_netdevice: device %s/%p never "
5131
"was registered\n", dev->name, dev);
5132
5133
WARN_ON(1);
5134
list_del(&dev->unreg_list);
5135
continue;
5136
}
5137
dev->dismantle = true;
5138
BUG_ON(dev->reg_state != NETREG_REGISTERED);
5139
}
5140
5141
/* If device is running, close it first. */
5142
dev_close_many(head);
5143
5144
list_for_each_entry(dev, head, unreg_list) {
5145
/* And unlink it from device chain. */
5146
unlist_netdevice(dev);
5147
5148
dev->reg_state = NETREG_UNREGISTERING;
5149
}
5150
5151
synchronize_net();
5152
5153
list_for_each_entry(dev, head, unreg_list) {
5154
/* Shutdown queueing discipline. */
5155
dev_shutdown(dev);
5156
5157
5158
/* Notify protocols, that we are about to destroy
5159
this device. They should clean all the things.
5160
*/
5161
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5162
5163
if (!dev->rtnl_link_ops ||
5164
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5165
rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5166
5167
/*
5168
* Flush the unicast and multicast chains
5169
*/
5170
dev_uc_flush(dev);
5171
dev_mc_flush(dev);
5172
5173
if (dev->netdev_ops->ndo_uninit)
5174
dev->netdev_ops->ndo_uninit(dev);
5175
5176
/* Notifier chain MUST detach us from master device. */
5177
WARN_ON(dev->master);
5178
5179
/* Remove entries from kobject tree */
5180
netdev_unregister_kobject(dev);
5181
}
5182
5183
/* Process any work delayed until the end of the batch */
5184
dev = list_first_entry(head, struct net_device, unreg_list);
5185
call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5186
5187
rcu_barrier();
5188
5189
list_for_each_entry(dev, head, unreg_list)
5190
dev_put(dev);
5191
}
5192
5193
static void rollback_registered(struct net_device *dev)
5194
{
5195
LIST_HEAD(single);
5196
5197
list_add(&dev->unreg_list, &single);
5198
rollback_registered_many(&single);
5199
list_del(&single);
5200
}
5201
5202
u32 netdev_fix_features(struct net_device *dev, u32 features)
5203
{
5204
/* Fix illegal checksum combinations */
5205
if ((features & NETIF_F_HW_CSUM) &&
5206
(features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5207
netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5208
features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5209
}
5210
5211
if ((features & NETIF_F_NO_CSUM) &&
5212
(features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5213
netdev_warn(dev, "mixed no checksumming and other settings.\n");
5214
features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5215
}
5216
5217
/* Fix illegal SG+CSUM combinations. */
5218
if ((features & NETIF_F_SG) &&
5219
!(features & NETIF_F_ALL_CSUM)) {
5220
netdev_dbg(dev,
5221
"Dropping NETIF_F_SG since no checksum feature.\n");
5222
features &= ~NETIF_F_SG;
5223
}
5224
5225
/* TSO requires that SG is present as well. */
5226
if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5227
netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5228
features &= ~NETIF_F_ALL_TSO;
5229
}
5230
5231
/* TSO ECN requires that TSO is present as well. */
5232
if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5233
features &= ~NETIF_F_TSO_ECN;
5234
5235
/* Software GSO depends on SG. */
5236
if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5237
netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5238
features &= ~NETIF_F_GSO;
5239
}
5240
5241
/* UFO needs SG and checksumming */
5242
if (features & NETIF_F_UFO) {
5243
/* maybe split UFO into V4 and V6? */
5244
if (!((features & NETIF_F_GEN_CSUM) ||
5245
(features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5246
== (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5247
netdev_dbg(dev,
5248
"Dropping NETIF_F_UFO since no checksum offload features.\n");
5249
features &= ~NETIF_F_UFO;
5250
}
5251
5252
if (!(features & NETIF_F_SG)) {
5253
netdev_dbg(dev,
5254
"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5255
features &= ~NETIF_F_UFO;
5256
}
5257
}
5258
5259
return features;
5260
}
5261
EXPORT_SYMBOL(netdev_fix_features);
5262
5263
int __netdev_update_features(struct net_device *dev)
5264
{
5265
u32 features;
5266
int err = 0;
5267
5268
ASSERT_RTNL();
5269
5270
features = netdev_get_wanted_features(dev);
5271
5272
if (dev->netdev_ops->ndo_fix_features)
5273
features = dev->netdev_ops->ndo_fix_features(dev, features);
5274
5275
/* driver might be less strict about feature dependencies */
5276
features = netdev_fix_features(dev, features);
5277
5278
if (dev->features == features)
5279
return 0;
5280
5281
netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5282
dev->features, features);
5283
5284
if (dev->netdev_ops->ndo_set_features)
5285
err = dev->netdev_ops->ndo_set_features(dev, features);
5286
5287
if (unlikely(err < 0)) {
5288
netdev_err(dev,
5289
"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5290
err, features, dev->features);
5291
return -1;
5292
}
5293
5294
if (!err)
5295
dev->features = features;
5296
5297
return 1;
5298
}
5299
5300
/**
5301
* netdev_update_features - recalculate device features
5302
* @dev: the device to check
5303
*
5304
* Recalculate dev->features set and send notifications if it
5305
* has changed. Should be called after driver or hardware dependent
5306
* conditions might have changed that influence the features.
5307
*/
5308
void netdev_update_features(struct net_device *dev)
5309
{
5310
if (__netdev_update_features(dev))
5311
netdev_features_change(dev);
5312
}
5313
EXPORT_SYMBOL(netdev_update_features);
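/*
 * Example (illustrative sketch, not part of dev.c): when a driver would call
 * netdev_update_features() above -- after a hardware-dependent condition
 * changes (here, a hypothetical VLAN offload toggle), so that dev->features
 * is recomputed and NETDEV_FEAT_CHANGE is sent if anything changed.  The
 * private structure and helper name are assumptions.
 */
#if 0
struct example_priv {
        bool vlan_offload;      /* consulted by the driver's ndo_fix_features() */
};

static void example_vlan_offload_changed(struct net_device *dev, bool enabled)
{
        struct example_priv *priv = netdev_priv(dev);

        priv->vlan_offload = enabled;

        rtnl_lock();
        netdev_update_features(dev);    /* re-run the fix/set feature sequence */
        rtnl_unlock();
}
#endif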
5314
5315
/**
5316
* netdev_change_features - recalculate device features
5317
* @dev: the device to check
5318
*
5319
* Recalculate dev->features set and send notifications even
5320
* if they have not changed. Should be called instead of
5321
* netdev_update_features() if also dev->vlan_features might
5322
* have changed to allow the changes to be propagated to stacked
5323
* VLAN devices.
5324
*/
5325
void netdev_change_features(struct net_device *dev)
5326
{
5327
__netdev_update_features(dev);
5328
netdev_features_change(dev);
5329
}
5330
EXPORT_SYMBOL(netdev_change_features);
5331
5332
/**
5333
* netif_stacked_transfer_operstate - transfer operstate
5334
* @rootdev: the root or lower level device to transfer state from
5335
* @dev: the device to transfer operstate to
5336
*
5337
* Transfer operational state from root to device. This is normally
5338
* called when a stacking relationship exists between the root
5339
* device and the device (a leaf device).
5340
*/
5341
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5342
struct net_device *dev)
5343
{
5344
if (rootdev->operstate == IF_OPER_DORMANT)
5345
netif_dormant_on(dev);
5346
else
5347
netif_dormant_off(dev);
5348
5349
if (netif_carrier_ok(rootdev)) {
5350
if (!netif_carrier_ok(dev))
5351
netif_carrier_on(dev);
5352
} else {
5353
if (netif_carrier_ok(dev))
5354
netif_carrier_off(dev);
5355
}
5356
}
5357
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5358
5359
#ifdef CONFIG_RPS
5360
static int netif_alloc_rx_queues(struct net_device *dev)
5361
{
5362
unsigned int i, count = dev->num_rx_queues;
5363
struct netdev_rx_queue *rx;
5364
5365
BUG_ON(count < 1);
5366
5367
rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5368
if (!rx) {
5369
pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5370
return -ENOMEM;
5371
}
5372
dev->_rx = rx;
5373
5374
for (i = 0; i < count; i++)
5375
rx[i].dev = dev;
5376
return 0;
5377
}
5378
#endif
5379
5380
static void netdev_init_one_queue(struct net_device *dev,
5381
struct netdev_queue *queue, void *_unused)
5382
{
5383
/* Initialize queue lock */
5384
spin_lock_init(&queue->_xmit_lock);
5385
netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5386
queue->xmit_lock_owner = -1;
5387
netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5388
queue->dev = dev;
5389
}
5390
5391
static int netif_alloc_netdev_queues(struct net_device *dev)
5392
{
5393
unsigned int count = dev->num_tx_queues;
5394
struct netdev_queue *tx;
5395
5396
BUG_ON(count < 1);
5397
5398
tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5399
if (!tx) {
5400
pr_err("netdev: Unable to allocate %u tx queues.\n",
5401
count);
5402
return -ENOMEM;
5403
}
5404
dev->_tx = tx;
5405
5406
netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5407
spin_lock_init(&dev->tx_global_lock);
5408
5409
return 0;
5410
}
5411
5412
/**
5413
* register_netdevice - register a network device
5414
* @dev: device to register
5415
*
5416
* Take a completed network device structure and add it to the kernel
5417
* interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5418
* chain. 0 is returned on success. A negative errno code is returned
5419
* on a failure to set up the device, or if the name is a duplicate.
5420
*
5421
* Callers must hold the rtnl semaphore. You may want
5422
* register_netdev() instead of this.
5423
*
5424
* BUGS:
5425
* The locking appears insufficient to guarantee two parallel registers
5426
* will not get the same name.
5427
*/
5428
5429
int register_netdevice(struct net_device *dev)
5430
{
5431
int ret;
5432
struct net *net = dev_net(dev);
5433
5434
BUG_ON(dev_boot_phase);
5435
ASSERT_RTNL();
5436
5437
might_sleep();
5438
5439
/* When net_device's are persistent, this will be fatal. */
5440
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5441
BUG_ON(!net);
5442
5443
spin_lock_init(&dev->addr_list_lock);
5444
netdev_set_addr_lockdep_class(dev);
5445
5446
dev->iflink = -1;
5447
5448
ret = dev_get_valid_name(dev, dev->name);
5449
if (ret < 0)
5450
goto out;
5451
5452
/* Init, if this function is available */
5453
if (dev->netdev_ops->ndo_init) {
5454
ret = dev->netdev_ops->ndo_init(dev);
5455
if (ret) {
5456
if (ret > 0)
5457
ret = -EIO;
5458
goto out;
5459
}
5460
}
5461
5462
dev->ifindex = dev_new_index(net);
5463
if (dev->iflink == -1)
5464
dev->iflink = dev->ifindex;
5465
5466
/* Transfer changeable features to wanted_features and enable
5467
* software offloads (GSO and GRO).
5468
*/
5469
dev->hw_features |= NETIF_F_SOFT_FEATURES;
5470
dev->features |= NETIF_F_SOFT_FEATURES;
5471
dev->wanted_features = dev->features & dev->hw_features;
5472
5473
/* Turn on no cache copy if HW is doing checksum */
5474
dev->hw_features |= NETIF_F_NOCACHE_COPY;
5475
if ((dev->features & NETIF_F_ALL_CSUM) &&
5476
!(dev->features & NETIF_F_NO_CSUM)) {
5477
dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5478
dev->features |= NETIF_F_NOCACHE_COPY;
5479
}
5480
5481
/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5482
* vlan_dev_init() will do the dev->features check, so these features
5483
* are enabled only if supported by underlying device.
5484
*/
5485
dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5486
5487
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5488
ret = notifier_to_errno(ret);
5489
if (ret)
5490
goto err_uninit;
5491
5492
ret = netdev_register_kobject(dev);
5493
if (ret)
5494
goto err_uninit;
5495
dev->reg_state = NETREG_REGISTERED;
5496
5497
__netdev_update_features(dev);
5498
5499
/*
5500
* Default initial state at registry is that the
5501
* device is present.
5502
*/
5503
5504
set_bit(__LINK_STATE_PRESENT, &dev->state);
5505
5506
dev_init_scheduler(dev);
5507
dev_hold(dev);
5508
list_netdevice(dev);
5509
5510
/* Notify protocols, that a new device appeared. */
5511
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5512
ret = notifier_to_errno(ret);
5513
if (ret) {
5514
rollback_registered(dev);
5515
dev->reg_state = NETREG_UNREGISTERED;
5516
}
5517
/*
5518
* Prevent userspace races by waiting until the network
5519
* device is fully setup before sending notifications.
5520
*/
5521
if (!dev->rtnl_link_ops ||
5522
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5523
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5524
5525
out:
5526
return ret;
5527
5528
err_uninit:
5529
if (dev->netdev_ops->ndo_uninit)
5530
dev->netdev_ops->ndo_uninit(dev);
5531
goto out;
5532
}
5533
EXPORT_SYMBOL(register_netdevice);
5534
5535
/**
5536
* init_dummy_netdev - init a dummy network device for NAPI
5537
* @dev: device to init
5538
*
5539
* This takes a network device structure and initializes the minimum
5540
* amount of fields so it can be used to schedule NAPI polls without
5541
* registering a full blown interface. This is to be used by drivers
5542
* that need to tie several hardware interfaces to a single NAPI
5543
* poll scheduler due to HW limitations.
5544
*/
5545
int init_dummy_netdev(struct net_device *dev)
5546
{
5547
/* Clear everything. Note we don't initialize spinlocks
5548
* as they aren't supposed to be taken by any of the
5549
* NAPI code and this dummy netdev is supposed to be
5550
* only ever used for NAPI polls
5551
*/
5552
memset(dev, 0, sizeof(struct net_device));
5553
5554
/* make sure we BUG if trying to hit standard
5555
* register/unregister code path
5556
*/
5557
dev->reg_state = NETREG_DUMMY;
5558
5559
/* NAPI wants this */
5560
INIT_LIST_HEAD(&dev->napi_list);
5561
5562
/* a dummy interface is started by default */
5563
set_bit(__LINK_STATE_PRESENT, &dev->state);
5564
set_bit(__LINK_STATE_START, &dev->state);
5565
5566
/* Note : We don't allocate pcpu_refcnt for dummy devices,
5567
* because users of this 'device' don't need to change
5568
* its refcount.
5569
*/
5570
5571
return 0;
5572
}
5573
EXPORT_SYMBOL_GPL(init_dummy_netdev);
5574
5575
5576
/**
5577
* register_netdev - register a network device
5578
* @dev: device to register
5579
*
5580
* Take a completed network device structure and add it to the kernel
5581
* interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5582
* chain. 0 is returned on success. A negative errno code is returned
5583
* on a failure to set up the device, or if the name is a duplicate.
5584
*
5585
* This is a wrapper around register_netdevice that takes the rtnl semaphore
5586
* and expands the device name if you passed a format string to
5587
* alloc_netdev.
5588
*/
5589
int register_netdev(struct net_device *dev)
5590
{
5591
int err;
5592
5593
rtnl_lock();
5594
err = register_netdevice(dev);
5595
rtnl_unlock();
5596
return err;
5597
}
5598
EXPORT_SYMBOL(register_netdev);
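/*
 * Example (illustrative sketch, not part of dev.c): the usual driver-side
 * lifecycle around register_netdev()/unregister_netdev() above -- allocate
 * with alloc_etherdev(), register, and tear down in the reverse order.  The
 * probe/remove names and the private structure are assumptions kept minimal.
 */
#if 0
struct example_priv {
        int dummy;
};

static int example_probe(void)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct example_priv));
        if (!dev)
                return -ENOMEM;

        /* ... set dev->netdev_ops, MAC address, features, etc. ... */

        err = register_netdev(dev);     /* takes the RTNL, expands "eth%d" */
        if (err) {
                free_netdev(dev);       /* safe: reg_state is still UNINITIALIZED */
                return err;
        }
        return 0;
}

static void example_remove(struct net_device *dev)
{
        unregister_netdev(dev);         /* returns after all references are gone */
        free_netdev(dev);
}
#endif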
5599
5600
int netdev_refcnt_read(const struct net_device *dev)
5601
{
5602
int i, refcnt = 0;
5603
5604
for_each_possible_cpu(i)
5605
refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5606
return refcnt;
5607
}
5608
EXPORT_SYMBOL(netdev_refcnt_read);
5609
5610
/*
5611
* netdev_wait_allrefs - wait until all references are gone.
5612
*
5613
* This is called when unregistering network devices.
5614
*
5615
* Any protocol or device that holds a reference should register
5616
* for netdevice notification, and cleanup and put back the
5617
* reference if they receive an UNREGISTER event.
5618
* We can get stuck here if buggy protocols don't correctly
5619
* call dev_put.
5620
*/
5621
static void netdev_wait_allrefs(struct net_device *dev)
5622
{
5623
unsigned long rebroadcast_time, warning_time;
5624
int refcnt;
5625
5626
linkwatch_forget_dev(dev);
5627
5628
rebroadcast_time = warning_time = jiffies;
5629
refcnt = netdev_refcnt_read(dev);
5630
5631
while (refcnt != 0) {
5632
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5633
rtnl_lock();
5634
5635
/* Rebroadcast unregister notification */
5636
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5637
/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5638
* should have already handled it the first time */
5639
5640
if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5641
&dev->state)) {
5642
/* We must not have linkwatch events
5643
* pending on unregister. If this
5644
* happens, we simply run the queue
5645
* unscheduled, resulting in a noop
5646
* for this device.
5647
*/
5648
linkwatch_run_queue();
5649
}
5650
5651
__rtnl_unlock();
5652
5653
rebroadcast_time = jiffies;
5654
}
5655
5656
msleep(250);
5657
5658
refcnt = netdev_refcnt_read(dev);
5659
5660
if (time_after(jiffies, warning_time + 10 * HZ)) {
5661
printk(KERN_EMERG "unregister_netdevice: "
5662
"waiting for %s to become free. Usage "
5663
"count = %d\n",
5664
dev->name, refcnt);
5665
warning_time = jiffies;
5666
}
5667
}
5668
}
5669
5670
/* The sequence is:
5671
*
5672
* rtnl_lock();
5673
* ...
5674
* register_netdevice(x1);
5675
* register_netdevice(x2);
5676
* ...
5677
* unregister_netdevice(y1);
5678
* unregister_netdevice(y2);
5679
* ...
5680
* rtnl_unlock();
5681
* free_netdev(y1);
5682
* free_netdev(y2);
5683
*
5684
* We are invoked by rtnl_unlock().
5685
* This allows us to deal with problems:
5686
* 1) We can delete sysfs objects which invoke hotplug
5687
* without deadlocking with linkwatch via keventd.
5688
* 2) Since we run with the RTNL semaphore not held, we can sleep
5689
* safely in order to wait for the netdev refcnt to drop to zero.
5690
*
5691
* We must not return until all unregister events added during
5692
* the interval the lock was held have been completed.
5693
*/
5694
void netdev_run_todo(void)
5695
{
5696
struct list_head list;
5697
5698
/* Snapshot list, allow later requests */
5699
list_replace_init(&net_todo_list, &list);
5700
5701
__rtnl_unlock();
5702
5703
while (!list_empty(&list)) {
5704
struct net_device *dev
5705
= list_first_entry(&list, struct net_device, todo_list);
5706
list_del(&dev->todo_list);
5707
5708
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5709
printk(KERN_ERR "network todo '%s' but state %d\n",
5710
dev->name, dev->reg_state);
5711
dump_stack();
5712
continue;
5713
}
5714
5715
dev->reg_state = NETREG_UNREGISTERED;
5716
5717
on_each_cpu(flush_backlog, dev, 1);
5718
5719
netdev_wait_allrefs(dev);
5720
5721
/* paranoia */
5722
BUG_ON(netdev_refcnt_read(dev));
5723
WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5724
WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5725
WARN_ON(dev->dn_ptr);
5726
5727
if (dev->destructor)
5728
dev->destructor(dev);
5729
5730
/* Free network device */
5731
kobject_put(&dev->dev.kobj);
5732
}
5733
}
5734
5735
/* Convert net_device_stats to rtnl_link_stats64. They have the same
5736
* fields in the same order, with only the type differing.
5737
*/
5738
static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5739
const struct net_device_stats *netdev_stats)
5740
{
5741
#if BITS_PER_LONG == 64
5742
BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5743
memcpy(stats64, netdev_stats, sizeof(*stats64));
5744
#else
5745
size_t i, n = sizeof(*stats64) / sizeof(u64);
5746
const unsigned long *src = (const unsigned long *)netdev_stats;
5747
u64 *dst = (u64 *)stats64;
5748
5749
BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5750
sizeof(*stats64) / sizeof(u64));
5751
for (i = 0; i < n; i++)
5752
dst[i] = src[i];
5753
#endif
5754
}
5755
5756
/**
5757
* dev_get_stats - get network device statistics
5758
* @dev: device to get statistics from
5759
* @storage: place to store stats
5760
*
5761
* Get network statistics from device. Return @storage.
5762
* The device driver may provide its own method by setting
5763
* dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5764
* otherwise the internal statistics structure is used.
5765
*/
5766
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5767
struct rtnl_link_stats64 *storage)
5768
{
5769
const struct net_device_ops *ops = dev->netdev_ops;
5770
5771
if (ops->ndo_get_stats64) {
5772
memset(storage, 0, sizeof(*storage));
5773
ops->ndo_get_stats64(dev, storage);
5774
} else if (ops->ndo_get_stats) {
5775
netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5776
} else {
5777
netdev_stats_to_stats64(storage, &dev->stats);
5778
}
5779
storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5780
return storage;
5781
}
5782
EXPORT_SYMBOL(dev_get_stats);
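/*
 * Example (illustrative sketch, not part of dev.c): the driver side of
 * dev_get_stats() above -- an ndo_get_stats64() implementation that fills
 * the caller-provided storage.  The private counters and structure layout
 * are assumptions; the signature matches the ops call made by dev_get_stats().
 */
#if 0
struct example_priv {
        u64 rx_packets;
        u64 rx_bytes;
        u64 tx_packets;
        u64 tx_bytes;
};

static struct rtnl_link_stats64 *example_get_stats64(struct net_device *dev,
                                                     struct rtnl_link_stats64 *storage)
{
        struct example_priv *priv = netdev_priv(dev);

        /* storage was zeroed by dev_get_stats() before this call */
        storage->rx_packets = priv->rx_packets;
        storage->rx_bytes   = priv->rx_bytes;
        storage->tx_packets = priv->tx_packets;
        storage->tx_bytes   = priv->tx_bytes;

        return storage;
}
#endif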
5783
5784
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5785
{
5786
struct netdev_queue *queue = dev_ingress_queue(dev);
5787
5788
#ifdef CONFIG_NET_CLS_ACT
5789
if (queue)
5790
return queue;
5791
queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5792
if (!queue)
5793
return NULL;
5794
netdev_init_one_queue(dev, queue, NULL);
5795
queue->qdisc = &noop_qdisc;
5796
queue->qdisc_sleeping = &noop_qdisc;
5797
rcu_assign_pointer(dev->ingress_queue, queue);
5798
#endif
5799
return queue;
5800
}
5801
5802
/**
5803
* alloc_netdev_mqs - allocate network device
5804
* @sizeof_priv: size of private data to allocate space for
5805
* @name: device name format string
5806
* @setup: callback to initialize device
5807
* @txqs: the number of TX subqueues to allocate
5808
* @rxqs: the number of RX subqueues to allocate
5809
*
5810
* Allocates a struct net_device with private data area for driver use
5811
* and performs basic initialization. Also allocates subqueue structs
5812
* for each queue on the device.
5813
*/
5814
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5815
void (*setup)(struct net_device *),
5816
unsigned int txqs, unsigned int rxqs)
5817
{
5818
struct net_device *dev;
5819
size_t alloc_size;
5820
struct net_device *p;
5821
5822
BUG_ON(strlen(name) >= sizeof(dev->name));
5823
5824
if (txqs < 1) {
5825
pr_err("alloc_netdev: Unable to allocate device "
5826
"with zero queues.\n");
5827
return NULL;
5828
}
5829
5830
#ifdef CONFIG_RPS
5831
if (rxqs < 1) {
5832
pr_err("alloc_netdev: Unable to allocate device "
5833
"with zero RX queues.\n");
5834
return NULL;
5835
}
5836
#endif
5837
5838
alloc_size = sizeof(struct net_device);
5839
if (sizeof_priv) {
5840
/* ensure 32-byte alignment of private area */
5841
alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5842
alloc_size += sizeof_priv;
5843
}
5844
/* ensure 32-byte alignment of whole construct */
5845
alloc_size += NETDEV_ALIGN - 1;
5846
5847
p = kzalloc(alloc_size, GFP_KERNEL);
5848
if (!p) {
5849
printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5850
return NULL;
5851
}
5852
5853
dev = PTR_ALIGN(p, NETDEV_ALIGN);
5854
dev->padded = (char *)dev - (char *)p;
5855
5856
dev->pcpu_refcnt = alloc_percpu(int);
5857
if (!dev->pcpu_refcnt)
5858
goto free_p;
5859
5860
if (dev_addr_init(dev))
5861
goto free_pcpu;
5862
5863
dev_mc_init(dev);
5864
dev_uc_init(dev);
5865
5866
dev_net_set(dev, &init_net);
5867
5868
dev->gso_max_size = GSO_MAX_SIZE;
5869
5870
INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5871
dev->ethtool_ntuple_list.count = 0;
5872
INIT_LIST_HEAD(&dev->napi_list);
5873
INIT_LIST_HEAD(&dev->unreg_list);
5874
INIT_LIST_HEAD(&dev->link_watch_list);
5875
dev->priv_flags = IFF_XMIT_DST_RELEASE;
5876
setup(dev);
5877
5878
dev->num_tx_queues = txqs;
5879
dev->real_num_tx_queues = txqs;
5880
if (netif_alloc_netdev_queues(dev))
5881
goto free_all;
5882
5883
#ifdef CONFIG_RPS
5884
dev->num_rx_queues = rxqs;
5885
dev->real_num_rx_queues = rxqs;
5886
if (netif_alloc_rx_queues(dev))
5887
goto free_all;
5888
#endif
5889
5890
strcpy(dev->name, name);
5891
dev->group = INIT_NETDEV_GROUP;
5892
return dev;
5893
5894
free_all:
5895
free_netdev(dev);
5896
return NULL;
5897
5898
free_pcpu:
5899
free_percpu(dev->pcpu_refcnt);
5900
kfree(dev->_tx);
5901
#ifdef CONFIG_RPS
5902
kfree(dev->_rx);
5903
#endif
5904
5905
free_p:
5906
kfree(p);
5907
return NULL;
5908
}
5909
EXPORT_SYMBOL(alloc_netdev_mqs);
5910
5911
/**
5912
* free_netdev - free network device
5913
* @dev: device
5914
*
5915
* This function does the last stage of destroying an allocated device
5916
* interface. The reference to the device object is released.
5917
* If this is the last reference then it will be freed.
5918
*/
5919
void free_netdev(struct net_device *dev)
5920
{
5921
struct napi_struct *p, *n;
5922
5923
release_net(dev_net(dev));
5924
5925
kfree(dev->_tx);
5926
#ifdef CONFIG_RPS
5927
kfree(dev->_rx);
5928
#endif
5929
5930
kfree(rcu_dereference_raw(dev->ingress_queue));
5931
5932
/* Flush device addresses */
5933
dev_addr_flush(dev);
5934
5935
/* Clear ethtool n-tuple list */
5936
ethtool_ntuple_flush(dev);
5937
5938
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5939
netif_napi_del(p);
5940
5941
free_percpu(dev->pcpu_refcnt);
5942
dev->pcpu_refcnt = NULL;
5943
5944
/* Compatibility with error handling in drivers */
5945
if (dev->reg_state == NETREG_UNINITIALIZED) {
5946
kfree((char *)dev - dev->padded);
5947
return;
5948
}
5949
5950
BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5951
dev->reg_state = NETREG_RELEASED;
5952
5953
/* will free via device release */
5954
put_device(&dev->dev);
5955
}
5956
EXPORT_SYMBOL(free_netdev);
5957
5958
/**
5959
* synchronize_net - Synchronize with packet receive processing
5960
*
5961
* Wait for packets currently being received to be done.
5962
* Does not block later packets from starting.
5963
*/
5964
void synchronize_net(void)
5965
{
5966
might_sleep();
5967
if (rtnl_is_locked())
5968
synchronize_rcu_expedited();
5969
else
5970
synchronize_rcu();
5971
}
5972
EXPORT_SYMBOL(synchronize_net);
5973
5974
/**
5975
* unregister_netdevice_queue - remove device from the kernel
5976
* @dev: device
5977
* @head: list
5978
*
5979
* This function shuts down a device interface and removes it
5980
* from the kernel tables.
5981
* If head is not NULL, the device is queued to be unregistered later.
5982
*
5983
* Callers must hold the rtnl semaphore. You may want
5984
* unregister_netdev() instead of this.
5985
*/
5986
5987
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5988
{
5989
ASSERT_RTNL();
5990
5991
if (head) {
5992
list_move_tail(&dev->unreg_list, head);
5993
} else {
5994
rollback_registered(dev);
5995
/* Finish processing unregister after unlock */
5996
net_set_todo(dev);
5997
}
5998
}
5999
EXPORT_SYMBOL(unregister_netdevice_queue);
6000
6001
/**
6002
* unregister_netdevice_many - unregister many devices
6003
* @head: list of devices
6004
*/
6005
void unregister_netdevice_many(struct list_head *head)
6006
{
6007
struct net_device *dev;
6008
6009
if (!list_empty(head)) {
6010
rollback_registered_many(head);
6011
list_for_each_entry(dev, head, unreg_list)
6012
net_set_todo(dev);
6013
}
6014
}
6015
EXPORT_SYMBOL(unregister_netdevice_many);
6016
6017
/**
6018
* unregister_netdev - remove device from the kernel
6019
* @dev: device
6020
*
6021
* This function shuts down a device interface and removes it
6022
* from the kernel tables.
6023
*
6024
* This is just a wrapper for unregister_netdevice that takes
6025
* the rtnl semaphore. In general you want to use this and not
6026
* unregister_netdevice.
6027
*/
6028
void unregister_netdev(struct net_device *dev)
6029
{
6030
rtnl_lock();
6031
unregister_netdevice(dev);
6032
rtnl_unlock();
6033
}
6034
EXPORT_SYMBOL(unregister_netdev);
6035
6036
/**
6037
* dev_change_net_namespace - move device to a different network namespace
6038
* @dev: device
6039
* @net: network namespace
6040
* @pat: If not NULL name pattern to try if the current device name
6041
* is already taken in the destination network namespace.
6042
*
6043
* This function shuts down a device interface and moves it
6044
* to a new network namespace. On success 0 is returned, on
6045
* a failure a negative errno code is returned.
6046
*
6047
* Callers must hold the rtnl semaphore.
6048
*/
6049
6050
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6051
{
6052
int err;
6053
6054
ASSERT_RTNL();
6055
6056
/* Don't allow namespace local devices to be moved. */
6057
err = -EINVAL;
6058
if (dev->features & NETIF_F_NETNS_LOCAL)
6059
goto out;
6060
6061
/* Ensure the device has been registrered */
6062
err = -EINVAL;
6063
if (dev->reg_state != NETREG_REGISTERED)
6064
goto out;
6065
6066
/* Get out if there is nothing todo */
6067
err = 0;
6068
if (net_eq(dev_net(dev), net))
6069
goto out;
6070
6071
/* Pick the destination device name, and ensure
6072
* we can use it in the destination network namespace.
6073
*/
6074
err = -EEXIST;
6075
if (__dev_get_by_name(net, dev->name)) {
6076
/* We get here if we can't use the current device name */
6077
if (!pat)
6078
goto out;
6079
if (dev_get_valid_name(dev, pat) < 0)
6080
goto out;
6081
}
6082
6083
/*
6084
* And now a mini version of register_netdevice unregister_netdevice.
6085
*/
6086
6087
/* If device is running close it first. */
6088
dev_close(dev);
6089
6090
/* And unlink it from device chain */
6091
err = -ENODEV;
6092
unlist_netdevice(dev);
6093
6094
synchronize_net();
6095
6096
/* Shutdown queueing discipline. */
6097
dev_shutdown(dev);
6098
6099
/* Notify protocols, that we are about to destroy
6100
this device. They should clean all the things.
6101
6102
Note that dev->reg_state stays at NETREG_REGISTERED.
6103
This is wanted because this way 8021q and macvlan know
6104
the device is just moving and can keep their slaves up.
6105
*/
6106
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6107
call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6108
6109
/*
6110
* Flush the unicast and multicast chains
6111
*/
6112
dev_uc_flush(dev);
6113
dev_mc_flush(dev);
6114
6115
/* Actually switch the network namespace */
6116
dev_net_set(dev, net);
6117
6118
/* If there is an ifindex conflict assign a new one */
6119
if (__dev_get_by_index(net, dev->ifindex)) {
6120
int iflink = (dev->iflink == dev->ifindex);
6121
dev->ifindex = dev_new_index(net);
6122
if (iflink)
6123
dev->iflink = dev->ifindex;
6124
}
6125
6126
/* Fixup kobjects */
6127
err = device_rename(&dev->dev, dev->name);
6128
WARN_ON(err);
6129
6130
/* Add the device back in the hashes */
6131
list_netdevice(dev);
6132
6133
/* Notify protocols, that a new device appeared. */
6134
call_netdevice_notifiers(NETDEV_REGISTER, dev);
6135
6136
/*
6137
* Prevent userspace races by waiting until the network
6138
* device is fully setup before sending notifications.
6139
*/
6140
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6141
6142
synchronize_net();
6143
err = 0;
6144
out:
6145
return err;
6146
}
6147
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
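
/*
 * Editor's illustrative sketch (not part of the upstream file): moving a
 * device into another namespace under RTNL, supplying a fallback name
 * pattern in case the current name is already taken there.  example_move()
 * is a hypothetical helper and "eth%d" is just one possible pattern.
 */
#if 0
static int example_move(struct net_device *dev, struct net *target)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target, "eth%d");
	rtnl_unlock();
	return err;
}
#endif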

static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU. */
	if (!list_empty(&oldsd->poll_list)) {
		list_splice_init(&oldsd->poll_list, &sd->poll_list);
		raise_softirq_irqoff(NET_RX_SOFTIRQ);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}


/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
u32 netdev_increment_features(u32 all, u32 one, u32 mask)
{
	if (mask & NETIF_F_GEN_CSUM)
		mask |= NETIF_F_ALL_CSUM;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If device needs checksumming, downgrade to it. */
	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
		all &= ~NETIF_F_NO_CSUM;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_GEN_CSUM)
		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
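
/*
 * Editor's illustrative sketch (not part of the upstream file): how a
 * master driver (bonding/bridge style) folds each slave's feature set into
 * its own with netdev_increment_features().  Starting from and masking with
 * NETIF_F_ONE_FOR_ALL is a simplification; real callers use their own mask.
 */
#if 0
static u32 example_compute_features(struct net_device *slaves[], int n)
{
	u32 features = NETIF_F_ONE_FOR_ALL;
	int i;

	for (i = 0; i < n; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     NETIF_F_ONE_FOR_ALL);
	return features;
}
#endif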

static struct hlist_head *netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}
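
/*
 * Editor's illustrative sketch (not part of the upstream file): this helper
 * exists for diagnostics, e.g. naming the offending driver in a transmit
 * watchdog style message.  example_warn_stall() is a hypothetical user.
 */
#if 0
static void example_warn_stall(struct net_device *dev)
{
	printk(KERN_WARNING "%s (%s): transmit queue appears stuck\n",
	       dev->name, netdev_drivername(dev));
}
#endif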

static int __netdev_printk(const char *level, const struct net_device *dev,
			   struct va_format *vaf)
{
	int r;

	if (dev && dev->dev.parent)
		r = dev_printk(level, dev->dev.parent, "%s: %pV",
			       netdev_name(dev), vaf);
	else if (dev)
		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
	else
		r = printk("%s(NULL net_device): %pV", level, vaf);

	return r;
}

int netdev_printk(const char *level, const struct net_device *dev,
		  const char *format, ...)
{
	struct va_format vaf;
	va_list args;
	int r;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	r = __netdev_printk(level, dev, &vaf);
	va_end(args);

	return r;
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)			\
int func(const struct net_device *dev, const char *fmt, ...)		\
{									\
	int r;								\
	struct va_format vaf;						\
	va_list args;							\
									\
	va_start(args, fmt);						\
									\
	vaf.fmt = fmt;							\
	vaf.va = &args;							\
									\
	r = __netdev_printk(level, dev, &vaf);				\
	va_end(args);							\
									\
	return r;							\
}									\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
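
/*
 * Editor's illustrative sketch (not part of the upstream file): drivers are
 * expected to use the per-level wrappers generated above instead of raw
 * printk(), so every message is prefixed with driver, bus and interface
 * name.  example_report_link() is a hypothetical user.
 */
#if 0
static void example_report_link(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}
#endif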

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
				__func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	list_del(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		memset(sd, 0, sizeof(*sd));
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		sd->completion_queue = NULL;
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->csd.flags = 0;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
		sd->backlog.gro_list = NULL;
		sd->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices, ensuring the loopback device
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);

static int __init initialize_hashrnd(void)
{
	get_random_bytes(&hashrnd, sizeof(hashrnd));
	return 0;
}

late_initcall_sync(initialize_hashrnd);
