Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
awilliam
GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/core/dev.c
15111 views
1
/*
2
* NET3 Protocol independent device support routines.
3
*
4
* This program is free software; you can redistribute it and/or
5
* modify it under the terms of the GNU General Public License
6
* as published by the Free Software Foundation; either version
7
* 2 of the License, or (at your option) any later version.
8
*
9
* Derived from the non IP parts of dev.c 1.0.19
10
* Authors: Ross Biro
11
* Fred N. van Kempen, <[email protected]>
12
* Mark Evans, <[email protected]>
13
*
14
* Additional Authors:
15
* Florian la Roche <[email protected]>
16
* Alan Cox <[email protected]>
17
* David Hinds <[email protected]>
18
* Alexey Kuznetsov <[email protected]>
19
* Adam Sulmicki <[email protected]>
20
* Pekka Riikonen <[email protected]>
21
*
22
* Changes:
23
* D.J. Barrow : Fixed bug where dev->refcnt gets set
24
* to 2 if register_netdev gets called
25
* before net_dev_init & also removed a
26
* few lines of code in the process.
27
* Alan Cox : device private ioctl copies fields back.
28
* Alan Cox : Transmit queue code does relevant
29
* stunts to keep the queue safe.
30
* Alan Cox : Fixed double lock.
31
* Alan Cox : Fixed promisc NULL pointer trap
32
* ???????? : Support the full private ioctl range
33
* Alan Cox : Moved ioctl permission check into
34
* drivers
35
* Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36
* Alan Cox : 100 backlog just doesn't cut it when
37
* you start doing multicast video 8)
38
* Alan Cox : Rewrote net_bh and list manager.
39
* Alan Cox : Fix ETH_P_ALL echoback lengths.
40
* Alan Cox : Took out transmit every packet pass
41
* Saved a few bytes in the ioctl handler
42
* Alan Cox : Network driver sets packet type before
43
* calling netif_rx. Saves a function
44
* call a packet.
45
* Alan Cox : Hashed net_bh()
46
* Richard Kooijman: Timestamp fixes.
47
* Alan Cox : Wrong field in SIOCGIFDSTADDR
48
* Alan Cox : Device lock protection.
49
* Alan Cox : Fixed nasty side effect of device close
50
* changes.
51
* Rudi Cilibrasi : Pass the right thing to
52
* set_mac_address()
53
* Dave Miller : 32bit quantity for the device lock to
54
* make it work out on a Sparc.
55
* Bjorn Ekwall : Added KERNELD hack.
56
* Alan Cox : Cleaned up the backlog initialise.
57
* Craig Metz : SIOCGIFCONF fix if space for under
58
* 1 device.
59
* Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60
* is no device open function.
61
* Andi Kleen : Fix error reporting for SIOCGIFCONF
62
* Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63
* Cyrus Durgin : Cleaned for KMOD
64
* Adam Sulmicki : Bug Fix : Network Device Unload
65
* A network device unload needs to purge
66
* the backlog queue.
67
* Paul Rusty Russell : SIOCSIFNAME
68
* Pekka Riikonen : Netdev boot-time settings code
69
* Andrew Morton : Make unregister_netdevice wait
70
* indefinitely on dev->refcnt
71
* J Hadi Salim : - Backlog queue sampling
72
* - netif_rx() feedback
73
*/
74
75
#include <asm/uaccess.h>
76
#include <asm/system.h>
77
#include <linux/bitops.h>
78
#include <linux/capability.h>
79
#include <linux/cpu.h>
80
#include <linux/types.h>
81
#include <linux/kernel.h>
82
#include <linux/hash.h>
83
#include <linux/slab.h>
84
#include <linux/sched.h>
85
#include <linux/mutex.h>
86
#include <linux/string.h>
87
#include <linux/mm.h>
88
#include <linux/socket.h>
89
#include <linux/sockios.h>
90
#include <linux/errno.h>
91
#include <linux/interrupt.h>
92
#include <linux/if_ether.h>
93
#include <linux/netdevice.h>
94
#include <linux/etherdevice.h>
95
#include <linux/ethtool.h>
96
#include <linux/notifier.h>
97
#include <linux/skbuff.h>
98
#include <net/net_namespace.h>
99
#include <net/sock.h>
100
#include <linux/rtnetlink.h>
101
#include <linux/proc_fs.h>
102
#include <linux/seq_file.h>
103
#include <linux/stat.h>
104
#include <net/dst.h>
105
#include <net/pkt_sched.h>
106
#include <net/checksum.h>
107
#include <net/xfrm.h>
108
#include <linux/highmem.h>
109
#include <linux/init.h>
110
#include <linux/kmod.h>
111
#include <linux/module.h>
112
#include <linux/netpoll.h>
113
#include <linux/rcupdate.h>
114
#include <linux/delay.h>
115
#include <net/wext.h>
116
#include <net/iw_handler.h>
117
#include <asm/current.h>
118
#include <linux/audit.h>
119
#include <linux/dmaengine.h>
120
#include <linux/err.h>
121
#include <linux/ctype.h>
122
#include <linux/if_arp.h>
123
#include <linux/if_vlan.h>
124
#include <linux/ip.h>
125
#include <net/ip.h>
126
#include <linux/ipv6.h>
127
#include <linux/in.h>
128
#include <linux/jhash.h>
129
#include <linux/random.h>
130
#include <trace/events/napi.h>
131
#include <trace/events/net.h>
132
#include <trace/events/skb.h>
133
#include <linux/pci.h>
134
#include <linux/inetdevice.h>
135
#include <linux/cpu_rmap.h>
136
137
#include "net-sysfs.h"
138
139
/* Instead of increasing this, you should create a hash table. */
140
#define MAX_GRO_SKBS 8
141
142
/* This should be increased if a protocol with a bigger head is added. */
143
#define GRO_MAX_HEAD (MAX_HEADER + 128)
144
145
/*
146
* The list of packet types we will receive (as opposed to discard)
147
* and the routines to invoke.
148
*
149
* Why 16. Because with 16 the only overlap we get on a hash of the
150
* low nibble of the protocol value is RARP/SNAP/X.25.
151
*
152
* NOTE: That is no longer true with the addition of VLAN tags. Not
153
* sure which should go first, but I bet it won't make much
154
* difference if we are running VLANs. The good news is that
155
* this protocol won't be in the list unless compiled in, so
156
* the average user (w/out VLANs) will not be adversely affected.
157
* --BLG
158
*
159
* 0800 IP
160
* 8100 802.1Q VLAN
161
* 0001 802.3
162
* 0002 AX.25
163
* 0004 802.2
164
* 8035 RARP
165
* 0005 SNAP
166
* 0805 X.25
167
* 0806 ARP
168
* 8137 IPX
169
* 0009 Localtalk
170
* 86DD IPv6
171
*/
172
173
#define PTYPE_HASH_SIZE (16)
174
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
175
176
static DEFINE_SPINLOCK(ptype_lock);
177
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178
static struct list_head ptype_all __read_mostly; /* Taps */
179
180
/*
181
* The @dev_base_head list is protected by @dev_base_lock and the rtnl
182
* semaphore.
183
*
184
* Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185
*
186
* Writers must hold the rtnl semaphore while they loop through the
187
* dev_base_head list, and hold dev_base_lock for writing when they do the
188
* actual updates. This allows pure readers to access the list even
189
* while a writer is preparing to update it.
190
*
191
* To put it another way, dev_base_lock is held for writing only to
192
* protect against pure readers; the rtnl semaphore provides the
193
* protection against other writers.
194
*
195
* See, for example usages, register_netdevice() and
196
* unregister_netdevice(), which must be called with the rtnl
197
* semaphore held.
198
*/
199
DEFINE_RWLOCK(dev_base_lock);
200
EXPORT_SYMBOL(dev_base_lock);
201
202
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
203
{
204
unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
205
return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
206
}
207
208
static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
209
{
210
return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
211
}
212
213
static inline void rps_lock(struct softnet_data *sd)
214
{
215
#ifdef CONFIG_RPS
216
spin_lock(&sd->input_pkt_queue.lock);
217
#endif
218
}
219
220
static inline void rps_unlock(struct softnet_data *sd)
221
{
222
#ifdef CONFIG_RPS
223
spin_unlock(&sd->input_pkt_queue.lock);
224
#endif
225
}
226
227
/* Device list insertion */
228
static int list_netdevice(struct net_device *dev)
229
{
230
struct net *net = dev_net(dev);
231
232
ASSERT_RTNL();
233
234
write_lock_bh(&dev_base_lock);
235
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
236
hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
237
hlist_add_head_rcu(&dev->index_hlist,
238
dev_index_hash(net, dev->ifindex));
239
write_unlock_bh(&dev_base_lock);
240
return 0;
241
}
242
243
/* Device list removal
244
* caller must respect a RCU grace period before freeing/reusing dev
245
*/
246
static void unlist_netdevice(struct net_device *dev)
247
{
248
ASSERT_RTNL();
249
250
/* Unlink dev from the device chain */
251
write_lock_bh(&dev_base_lock);
252
list_del_rcu(&dev->dev_list);
253
hlist_del_rcu(&dev->name_hlist);
254
hlist_del_rcu(&dev->index_hlist);
255
write_unlock_bh(&dev_base_lock);
256
}
257
258
/*
259
* Our notifier list
260
*/
261
262
static RAW_NOTIFIER_HEAD(netdev_chain);
263
264
/*
265
* Device drivers call our routines to queue packets here. We empty the
266
* queue in the local softnet handler.
267
*/
268
269
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270
EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272
#ifdef CONFIG_LOCKDEP
273
/*
274
* register_netdevice() inits txq->_xmit_lock and sets lockdep class
275
* according to dev->type
276
*/
277
static const unsigned short netdev_lock_type[] =
278
{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279
ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280
ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281
ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282
ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283
ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284
ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285
ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286
ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287
ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288
ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289
ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290
ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
291
ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
292
ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
293
ARPHRD_VOID, ARPHRD_NONE};
294
295
static const char *const netdev_lock_name[] =
296
{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297
"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298
"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299
"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300
"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301
"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302
"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303
"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304
"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305
"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306
"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307
"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308
"_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309
"_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310
"_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311
"_xmit_VOID", "_xmit_NONE"};
312
313
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
314
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
315
316
static inline unsigned short netdev_lock_pos(unsigned short dev_type)
317
{
318
int i;
319
320
for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
321
if (netdev_lock_type[i] == dev_type)
322
return i;
323
/* the last key is used by default */
324
return ARRAY_SIZE(netdev_lock_type) - 1;
325
}
326
327
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328
unsigned short dev_type)
329
{
330
int i;
331
332
i = netdev_lock_pos(dev_type);
333
lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
334
netdev_lock_name[i]);
335
}
336
337
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
338
{
339
int i;
340
341
i = netdev_lock_pos(dev->type);
342
lockdep_set_class_and_name(&dev->addr_list_lock,
343
&netdev_addr_lock_key[i],
344
netdev_lock_name[i]);
345
}
346
#else
347
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
348
unsigned short dev_type)
349
{
350
}
351
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352
{
353
}
354
#endif
355
356
/*******************************************************************************
357
358
Protocol management and registration routines
359
360
*******************************************************************************/
361
362
/*
363
* Add a protocol ID to the list. Now that the input handler is
364
* smarter we can dispense with all the messy stuff that used to be
365
* here.
366
*
367
* BEWARE!!! Protocol handlers, mangling input packets,
368
* MUST BE last in hash buckets and checking protocol handlers
369
* MUST start from promiscuous ptype_all chain in net_bh.
370
* It is true now, do not change it.
371
* Explanation follows: if protocol handler, mangling packet, will
372
* be the first on list, it is not able to sense, that packet
373
* is cloned and should be copied-on-write, so that it will
374
* change it and subsequent readers will get broken packet.
375
* --ANK (980803)
376
*/
377
378
static inline struct list_head *ptype_head(const struct packet_type *pt)
379
{
380
if (pt->type == htons(ETH_P_ALL))
381
return &ptype_all;
382
else
383
return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384
}
385
386
/**
387
* dev_add_pack - add packet handler
388
* @pt: packet type declaration
389
*
390
* Add a protocol handler to the networking stack. The passed &packet_type
391
* is linked into kernel lists and may not be freed until it has been
392
* removed from the kernel lists.
393
*
394
* This call does not sleep therefore it can not
395
* guarantee all CPU's that are in middle of receiving packets
396
* will see the new packet type (until the next received packet).
397
*/
398
399
void dev_add_pack(struct packet_type *pt)
400
{
401
struct list_head *head = ptype_head(pt);
402
403
spin_lock(&ptype_lock);
404
list_add_rcu(&pt->list, head);
405
spin_unlock(&ptype_lock);
406
}
407
EXPORT_SYMBOL(dev_add_pack);
408
409
/**
410
* __dev_remove_pack - remove packet handler
411
* @pt: packet type declaration
412
*
413
* Remove a protocol handler that was previously added to the kernel
414
* protocol handlers by dev_add_pack(). The passed &packet_type is removed
415
* from the kernel lists and can be freed or reused once this function
416
* returns.
417
*
418
* The packet type might still be in use by receivers
419
* and must not be freed until after all the CPU's have gone
420
* through a quiescent state.
421
*/
422
void __dev_remove_pack(struct packet_type *pt)
423
{
424
struct list_head *head = ptype_head(pt);
425
struct packet_type *pt1;
426
427
spin_lock(&ptype_lock);
428
429
list_for_each_entry(pt1, head, list) {
430
if (pt == pt1) {
431
list_del_rcu(&pt->list);
432
goto out;
433
}
434
}
435
436
printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
437
out:
438
spin_unlock(&ptype_lock);
439
}
440
EXPORT_SYMBOL(__dev_remove_pack);
441
442
/**
443
* dev_remove_pack - remove packet handler
444
* @pt: packet type declaration
445
*
446
* Remove a protocol handler that was previously added to the kernel
447
* protocol handlers by dev_add_pack(). The passed &packet_type is removed
448
* from the kernel lists and can be freed or reused once this function
449
* returns.
450
*
451
* This call sleeps to guarantee that no CPU is looking at the packet
452
* type after return.
453
*/
454
void dev_remove_pack(struct packet_type *pt)
455
{
456
__dev_remove_pack(pt);
457
458
synchronize_net();
459
}
460
EXPORT_SYMBOL(dev_remove_pack);
461
462
/******************************************************************************
463
464
Device Boot-time Settings Routines
465
466
*******************************************************************************/
467
468
/* Boot time configuration table */
469
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
470
471
/**
472
* netdev_boot_setup_add - add new setup entry
473
* @name: name of the device
474
* @map: configured settings for the device
475
*
476
* Adds new setup entry to the dev_boot_setup list. The function
477
* returns 0 on error and 1 on success. This is a generic routine to
478
* all netdevices.
479
*/
480
static int netdev_boot_setup_add(char *name, struct ifmap *map)
481
{
482
struct netdev_boot_setup *s;
483
int i;
484
485
s = dev_boot_setup;
486
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487
if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488
memset(s[i].name, 0, sizeof(s[i].name));
489
strlcpy(s[i].name, name, IFNAMSIZ);
490
memcpy(&s[i].map, map, sizeof(s[i].map));
491
break;
492
}
493
}
494
495
return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
496
}
497
498
/**
499
* netdev_boot_setup_check - check boot time settings
500
* @dev: the netdevice
501
*
502
* Check boot time settings for the device.
503
* The found settings are set for the device to be used
504
* later in the device probing.
505
* Returns 0 if no settings found, 1 if they are.
506
*/
507
int netdev_boot_setup_check(struct net_device *dev)
508
{
509
struct netdev_boot_setup *s = dev_boot_setup;
510
int i;
511
512
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513
if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
514
!strcmp(dev->name, s[i].name)) {
515
dev->irq = s[i].map.irq;
516
dev->base_addr = s[i].map.base_addr;
517
dev->mem_start = s[i].map.mem_start;
518
dev->mem_end = s[i].map.mem_end;
519
return 1;
520
}
521
}
522
return 0;
523
}
524
EXPORT_SYMBOL(netdev_boot_setup_check);
525
526
527
/**
528
* netdev_boot_base - get address from boot time settings
529
* @prefix: prefix for network device
530
* @unit: id for network device
531
*
532
* Check boot time settings for the base address of device.
533
* The found settings are set for the device to be used
534
* later in the device probing.
535
* Returns 0 if no settings found.
536
*/
537
unsigned long netdev_boot_base(const char *prefix, int unit)
538
{
539
const struct netdev_boot_setup *s = dev_boot_setup;
540
char name[IFNAMSIZ];
541
int i;
542
543
sprintf(name, "%s%d", prefix, unit);
544
545
/*
546
* If device already registered then return base of 1
547
* to indicate not to probe for this interface
548
*/
549
if (__dev_get_by_name(&init_net, name))
550
return 1;
551
552
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553
if (!strcmp(name, s[i].name))
554
return s[i].map.base_addr;
555
return 0;
556
}
557
558
/*
559
* Saves at boot time configured settings for any netdevice.
560
*/
561
int __init netdev_boot_setup(char *str)
562
{
563
int ints[5];
564
struct ifmap map;
565
566
str = get_options(str, ARRAY_SIZE(ints), ints);
567
if (!str || !*str)
568
return 0;
569
570
/* Save settings */
571
memset(&map, 0, sizeof(map));
572
if (ints[0] > 0)
573
map.irq = ints[1];
574
if (ints[0] > 1)
575
map.base_addr = ints[2];
576
if (ints[0] > 2)
577
map.mem_start = ints[3];
578
if (ints[0] > 3)
579
map.mem_end = ints[4];
580
581
/* Add new entry to the list */
582
return netdev_boot_setup_add(str, &map);
583
}
584
585
__setup("netdev=", netdev_boot_setup);
586
587
/*******************************************************************************
588
589
Device Interface Subroutines
590
591
*******************************************************************************/
592
593
/**
594
* __dev_get_by_name - find a device by its name
595
* @net: the applicable net namespace
596
* @name: name to find
597
*
598
* Find an interface by name. Must be called under RTNL semaphore
599
* or @dev_base_lock. If the name is found a pointer to the device
600
* is returned. If the name is not found then %NULL is returned. The
601
* reference counters are not incremented so the caller must be
602
* careful with locks.
603
*/
604
605
struct net_device *__dev_get_by_name(struct net *net, const char *name)
606
{
607
struct hlist_node *p;
608
struct net_device *dev;
609
struct hlist_head *head = dev_name_hash(net, name);
610
611
hlist_for_each_entry(dev, p, head, name_hlist)
612
if (!strncmp(dev->name, name, IFNAMSIZ))
613
return dev;
614
615
return NULL;
616
}
617
EXPORT_SYMBOL(__dev_get_by_name);
618
619
/**
620
* dev_get_by_name_rcu - find a device by its name
621
* @net: the applicable net namespace
622
* @name: name to find
623
*
624
* Find an interface by name.
625
* If the name is found a pointer to the device is returned.
626
* If the name is not found then %NULL is returned.
627
* The reference counters are not incremented so the caller must be
628
* careful with locks. The caller must hold RCU lock.
629
*/
630
631
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
632
{
633
struct hlist_node *p;
634
struct net_device *dev;
635
struct hlist_head *head = dev_name_hash(net, name);
636
637
hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638
if (!strncmp(dev->name, name, IFNAMSIZ))
639
return dev;
640
641
return NULL;
642
}
643
EXPORT_SYMBOL(dev_get_by_name_rcu);
644
645
/**
646
* dev_get_by_name - find a device by its name
647
* @net: the applicable net namespace
648
* @name: name to find
649
*
650
* Find an interface by name. This can be called from any
651
* context and does its own locking. The returned handle has
652
* the usage count incremented and the caller must use dev_put() to
653
* release it when it is no longer needed. %NULL is returned if no
654
* matching device is found.
655
*/
656
657
struct net_device *dev_get_by_name(struct net *net, const char *name)
658
{
659
struct net_device *dev;
660
661
rcu_read_lock();
662
dev = dev_get_by_name_rcu(net, name);
663
if (dev)
664
dev_hold(dev);
665
rcu_read_unlock();
666
return dev;
667
}
668
EXPORT_SYMBOL(dev_get_by_name);
669
670
/**
671
* __dev_get_by_index - find a device by its ifindex
672
* @net: the applicable net namespace
673
* @ifindex: index of device
674
*
675
* Search for an interface by index. Returns %NULL if the device
676
* is not found or a pointer to the device. The device has not
677
* had its reference counter increased so the caller must be careful
678
* about locking. The caller must hold either the RTNL semaphore
679
* or @dev_base_lock.
680
*/
681
682
struct net_device *__dev_get_by_index(struct net *net, int ifindex)
683
{
684
struct hlist_node *p;
685
struct net_device *dev;
686
struct hlist_head *head = dev_index_hash(net, ifindex);
687
688
hlist_for_each_entry(dev, p, head, index_hlist)
689
if (dev->ifindex == ifindex)
690
return dev;
691
692
return NULL;
693
}
694
EXPORT_SYMBOL(__dev_get_by_index);
695
696
/**
697
* dev_get_by_index_rcu - find a device by its ifindex
698
* @net: the applicable net namespace
699
* @ifindex: index of device
700
*
701
* Search for an interface by index. Returns %NULL if the device
702
* is not found or a pointer to the device. The device has not
703
* had its reference counter increased so the caller must be careful
704
* about locking. The caller must hold RCU lock.
705
*/
706
707
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
708
{
709
struct hlist_node *p;
710
struct net_device *dev;
711
struct hlist_head *head = dev_index_hash(net, ifindex);
712
713
hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714
if (dev->ifindex == ifindex)
715
return dev;
716
717
return NULL;
718
}
719
EXPORT_SYMBOL(dev_get_by_index_rcu);
720
721
722
/**
723
* dev_get_by_index - find a device by its ifindex
724
* @net: the applicable net namespace
725
* @ifindex: index of device
726
*
727
* Search for an interface by index. Returns NULL if the device
728
* is not found or a pointer to the device. The device returned has
729
* had a reference added and the pointer is safe until the user calls
730
* dev_put to indicate they have finished with it.
731
*/
732
733
struct net_device *dev_get_by_index(struct net *net, int ifindex)
734
{
735
struct net_device *dev;
736
737
rcu_read_lock();
738
dev = dev_get_by_index_rcu(net, ifindex);
739
if (dev)
740
dev_hold(dev);
741
rcu_read_unlock();
742
return dev;
743
}
744
EXPORT_SYMBOL(dev_get_by_index);
745
746
/**
747
* dev_getbyhwaddr_rcu - find a device by its hardware address
748
* @net: the applicable net namespace
749
* @type: media type of device
750
* @ha: hardware address
751
*
752
* Search for an interface by MAC address. Returns NULL if the device
753
* is not found or a pointer to the device.
754
* The caller must hold RCU or RTNL.
755
* The returned device has not had its ref count increased
756
* and the caller must therefore be careful about locking
757
*
758
*/
759
760
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
761
const char *ha)
762
{
763
struct net_device *dev;
764
765
for_each_netdev_rcu(net, dev)
766
if (dev->type == type &&
767
!memcmp(dev->dev_addr, ha, dev->addr_len))
768
return dev;
769
770
return NULL;
771
}
772
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
773
774
struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
775
{
776
struct net_device *dev;
777
778
ASSERT_RTNL();
779
for_each_netdev(net, dev)
780
if (dev->type == type)
781
return dev;
782
783
return NULL;
784
}
785
EXPORT_SYMBOL(__dev_getfirstbyhwtype);
786
787
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
788
{
789
struct net_device *dev, *ret = NULL;
790
791
rcu_read_lock();
792
for_each_netdev_rcu(net, dev)
793
if (dev->type == type) {
794
dev_hold(dev);
795
ret = dev;
796
break;
797
}
798
rcu_read_unlock();
799
return ret;
800
}
801
EXPORT_SYMBOL(dev_getfirstbyhwtype);
802
803
/**
804
* dev_get_by_flags_rcu - find any device with given flags
805
* @net: the applicable net namespace
806
* @if_flags: IFF_* values
807
* @mask: bitmask of bits in if_flags to check
808
*
809
* Search for any interface with the given flags. Returns NULL if a device
810
* is not found or a pointer to the device. Must be called inside
811
* rcu_read_lock(), and result refcount is unchanged.
812
*/
813
814
struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
815
unsigned short mask)
816
{
817
struct net_device *dev, *ret;
818
819
ret = NULL;
820
for_each_netdev_rcu(net, dev) {
821
if (((dev->flags ^ if_flags) & mask) == 0) {
822
ret = dev;
823
break;
824
}
825
}
826
return ret;
827
}
828
EXPORT_SYMBOL(dev_get_by_flags_rcu);
829
830
/**
831
* dev_valid_name - check if name is okay for network device
832
* @name: name string
833
*
834
* Network device names need to be valid file names to
835
* to allow sysfs to work. We also disallow any kind of
836
* whitespace.
837
*/
838
int dev_valid_name(const char *name)
839
{
840
if (*name == '\0')
841
return 0;
842
if (strlen(name) >= IFNAMSIZ)
843
return 0;
844
if (!strcmp(name, ".") || !strcmp(name, ".."))
845
return 0;
846
847
while (*name) {
848
if (*name == '/' || isspace(*name))
849
return 0;
850
name++;
851
}
852
return 1;
853
}
854
EXPORT_SYMBOL(dev_valid_name);
855
856
/**
857
* __dev_alloc_name - allocate a name for a device
858
* @net: network namespace to allocate the device name in
859
* @name: name format string
860
* @buf: scratch buffer and result name string
861
*
862
* Passed a format string - eg "lt%d" it will try and find a suitable
863
* id. It scans list of devices to build up a free map, then chooses
864
* the first empty slot. The caller must hold the dev_base or rtnl lock
865
* while allocating the name and adding the device in order to avoid
866
* duplicates.
867
* Limited to bits_per_byte * page size devices (ie 32K on most platforms).
868
* Returns the number of the unit assigned or a negative errno code.
869
*/
870
871
static int __dev_alloc_name(struct net *net, const char *name, char *buf)
872
{
873
int i = 0;
874
const char *p;
875
const int max_netdevices = 8*PAGE_SIZE;
876
unsigned long *inuse;
877
struct net_device *d;
878
879
p = strnchr(name, IFNAMSIZ-1, '%');
880
if (p) {
881
/*
882
* Verify the string as this thing may have come from
883
* the user. There must be either one "%d" and no other "%"
884
* characters.
885
*/
886
if (p[1] != 'd' || strchr(p + 2, '%'))
887
return -EINVAL;
888
889
/* Use one page as a bit array of possible slots */
890
inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
891
if (!inuse)
892
return -ENOMEM;
893
894
for_each_netdev(net, d) {
895
if (!sscanf(d->name, name, &i))
896
continue;
897
if (i < 0 || i >= max_netdevices)
898
continue;
899
900
/* avoid cases where sscanf is not exact inverse of printf */
901
snprintf(buf, IFNAMSIZ, name, i);
902
if (!strncmp(buf, d->name, IFNAMSIZ))
903
set_bit(i, inuse);
904
}
905
906
i = find_first_zero_bit(inuse, max_netdevices);
907
free_page((unsigned long) inuse);
908
}
909
910
if (buf != name)
911
snprintf(buf, IFNAMSIZ, name, i);
912
if (!__dev_get_by_name(net, buf))
913
return i;
914
915
/* It is possible to run out of possible slots
916
* when the name is long and there isn't enough space left
917
* for the digits, or if all bits are used.
918
*/
919
return -ENFILE;
920
}
921
922
/**
923
* dev_alloc_name - allocate a name for a device
924
* @dev: device
925
* @name: name format string
926
*
927
* Passed a format string - eg "lt%d" it will try and find a suitable
928
* id. It scans list of devices to build up a free map, then chooses
929
* the first empty slot. The caller must hold the dev_base or rtnl lock
930
* while allocating the name and adding the device in order to avoid
931
* duplicates.
932
* Limited to bits_per_byte * page size devices (ie 32K on most platforms).
933
* Returns the number of the unit assigned or a negative errno code.
934
*/
935
936
int dev_alloc_name(struct net_device *dev, const char *name)
937
{
938
char buf[IFNAMSIZ];
939
struct net *net;
940
int ret;
941
942
BUG_ON(!dev_net(dev));
943
net = dev_net(dev);
944
ret = __dev_alloc_name(net, name, buf);
945
if (ret >= 0)
946
strlcpy(dev->name, buf, IFNAMSIZ);
947
return ret;
948
}
949
EXPORT_SYMBOL(dev_alloc_name);
950
951
static int dev_get_valid_name(struct net_device *dev, const char *name)
952
{
953
struct net *net;
954
955
BUG_ON(!dev_net(dev));
956
net = dev_net(dev);
957
958
if (!dev_valid_name(name))
959
return -EINVAL;
960
961
if (strchr(name, '%'))
962
return dev_alloc_name(dev, name);
963
else if (__dev_get_by_name(net, name))
964
return -EEXIST;
965
else if (dev->name != name)
966
strlcpy(dev->name, name, IFNAMSIZ);
967
968
return 0;
969
}
970
971
/**
972
* dev_change_name - change name of a device
973
* @dev: device
974
* @newname: name (or format string) must be at least IFNAMSIZ
975
*
976
* Change name of a device, can pass format strings "eth%d".
977
* for wildcarding.
978
*/
979
int dev_change_name(struct net_device *dev, const char *newname)
980
{
981
char oldname[IFNAMSIZ];
982
int err = 0;
983
int ret;
984
struct net *net;
985
986
ASSERT_RTNL();
987
BUG_ON(!dev_net(dev));
988
989
net = dev_net(dev);
990
if (dev->flags & IFF_UP)
991
return -EBUSY;
992
993
if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
994
return 0;
995
996
memcpy(oldname, dev->name, IFNAMSIZ);
997
998
err = dev_get_valid_name(dev, newname);
999
if (err < 0)
1000
return err;
1001
1002
rollback:
1003
ret = device_rename(&dev->dev, dev->name);
1004
if (ret) {
1005
memcpy(dev->name, oldname, IFNAMSIZ);
1006
return ret;
1007
}
1008
1009
write_lock_bh(&dev_base_lock);
1010
hlist_del_rcu(&dev->name_hlist);
1011
write_unlock_bh(&dev_base_lock);
1012
1013
synchronize_rcu();
1014
1015
write_lock_bh(&dev_base_lock);
1016
hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017
write_unlock_bh(&dev_base_lock);
1018
1019
ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020
ret = notifier_to_errno(ret);
1021
1022
if (ret) {
1023
/* err >= 0 after dev_alloc_name() or stores the first errno */
1024
if (err >= 0) {
1025
err = ret;
1026
memcpy(dev->name, oldname, IFNAMSIZ);
1027
goto rollback;
1028
} else {
1029
printk(KERN_ERR
1030
"%s: name change rollback failed: %d.\n",
1031
dev->name, ret);
1032
}
1033
}
1034
1035
return err;
1036
}
1037
1038
/**
1039
* dev_set_alias - change ifalias of a device
1040
* @dev: device
1041
* @alias: name up to IFALIASZ
1042
* @len: limit of bytes to copy from info
1043
*
1044
* Set ifalias for a device,
1045
*/
1046
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047
{
1048
ASSERT_RTNL();
1049
1050
if (len >= IFALIASZ)
1051
return -EINVAL;
1052
1053
if (!len) {
1054
if (dev->ifalias) {
1055
kfree(dev->ifalias);
1056
dev->ifalias = NULL;
1057
}
1058
return 0;
1059
}
1060
1061
dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062
if (!dev->ifalias)
1063
return -ENOMEM;
1064
1065
strlcpy(dev->ifalias, alias, len+1);
1066
return len;
1067
}
1068
1069
1070
/**
1071
* netdev_features_change - device changes features
1072
* @dev: device to cause notification
1073
*
1074
* Called to indicate a device has changed features.
1075
*/
1076
void netdev_features_change(struct net_device *dev)
1077
{
1078
call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079
}
1080
EXPORT_SYMBOL(netdev_features_change);
1081
1082
/**
1083
* netdev_state_change - device changes state
1084
* @dev: device to cause notification
1085
*
1086
* Called to indicate a device has changed state. This function calls
1087
* the notifier chains for netdev_chain and sends a NEWLINK message
1088
* to the routing socket.
1089
*/
1090
void netdev_state_change(struct net_device *dev)
1091
{
1092
if (dev->flags & IFF_UP) {
1093
call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094
rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095
}
1096
}
1097
EXPORT_SYMBOL(netdev_state_change);
1098
1099
int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100
{
1101
return call_netdevice_notifiers(event, dev);
1102
}
1103
EXPORT_SYMBOL(netdev_bonding_change);
1104
1105
/**
1106
* dev_load - load a network module
1107
* @net: the applicable net namespace
1108
* @name: name of interface
1109
*
1110
* If a network interface is not present and the process has suitable
1111
* privileges this function loads the module. If module loading is not
1112
* available in this kernel then it becomes a nop.
1113
*/
1114
1115
void dev_load(struct net *net, const char *name)
1116
{
1117
struct net_device *dev;
1118
int no_module;
1119
1120
rcu_read_lock();
1121
dev = dev_get_by_name_rcu(net, name);
1122
rcu_read_unlock();
1123
1124
no_module = !dev;
1125
if (no_module && capable(CAP_NET_ADMIN))
1126
no_module = request_module("netdev-%s", name);
1127
if (no_module && capable(CAP_SYS_MODULE)) {
1128
if (!request_module("%s", name))
1129
pr_err("Loading kernel module for a network device "
1130
"with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s "
1131
"instead\n", name);
1132
}
1133
}
1134
EXPORT_SYMBOL(dev_load);
1135
1136
static int __dev_open(struct net_device *dev)
1137
{
1138
const struct net_device_ops *ops = dev->netdev_ops;
1139
int ret;
1140
1141
ASSERT_RTNL();
1142
1143
if (!netif_device_present(dev))
1144
return -ENODEV;
1145
1146
ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1147
ret = notifier_to_errno(ret);
1148
if (ret)
1149
return ret;
1150
1151
set_bit(__LINK_STATE_START, &dev->state);
1152
1153
if (ops->ndo_validate_addr)
1154
ret = ops->ndo_validate_addr(dev);
1155
1156
if (!ret && ops->ndo_open)
1157
ret = ops->ndo_open(dev);
1158
1159
if (ret)
1160
clear_bit(__LINK_STATE_START, &dev->state);
1161
else {
1162
dev->flags |= IFF_UP;
1163
net_dmaengine_get();
1164
dev_set_rx_mode(dev);
1165
dev_activate(dev);
1166
}
1167
1168
return ret;
1169
}
1170
1171
/**
1172
* dev_open - prepare an interface for use.
1173
* @dev: device to open
1174
*
1175
* Takes a device from down to up state. The device's private open
1176
* function is invoked and then the multicast lists are loaded. Finally
1177
* the device is moved into the up state and a %NETDEV_UP message is
1178
* sent to the netdev notifier chain.
1179
*
1180
* Calling this function on an active interface is a nop. On a failure
1181
* a negative errno code is returned.
1182
*/
1183
int dev_open(struct net_device *dev)
1184
{
1185
int ret;
1186
1187
if (dev->flags & IFF_UP)
1188
return 0;
1189
1190
ret = __dev_open(dev);
1191
if (ret < 0)
1192
return ret;
1193
1194
rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1195
call_netdevice_notifiers(NETDEV_UP, dev);
1196
1197
return ret;
1198
}
1199
EXPORT_SYMBOL(dev_open);
1200
1201
static int __dev_close_many(struct list_head *head)
1202
{
1203
struct net_device *dev;
1204
1205
ASSERT_RTNL();
1206
might_sleep();
1207
1208
list_for_each_entry(dev, head, unreg_list) {
1209
call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1210
1211
clear_bit(__LINK_STATE_START, &dev->state);
1212
1213
/* Synchronize to scheduled poll. We cannot touch poll list, it
1214
* can be even on different cpu. So just clear netif_running().
1215
*
1216
* dev->stop() will invoke napi_disable() on all of it's
1217
* napi_struct instances on this device.
1218
*/
1219
smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220
}
1221
1222
dev_deactivate_many(head);
1223
1224
list_for_each_entry(dev, head, unreg_list) {
1225
const struct net_device_ops *ops = dev->netdev_ops;
1226
1227
/*
1228
* Call the device specific close. This cannot fail.
1229
* Only if device is UP
1230
*
1231
* We allow it to be called even after a DETACH hot-plug
1232
* event.
1233
*/
1234
if (ops->ndo_stop)
1235
ops->ndo_stop(dev);
1236
1237
dev->flags &= ~IFF_UP;
1238
net_dmaengine_put();
1239
}
1240
1241
return 0;
1242
}
1243
1244
static int __dev_close(struct net_device *dev)
1245
{
1246
int retval;
1247
LIST_HEAD(single);
1248
1249
list_add(&dev->unreg_list, &single);
1250
retval = __dev_close_many(&single);
1251
list_del(&single);
1252
return retval;
1253
}
1254
1255
static int dev_close_many(struct list_head *head)
1256
{
1257
struct net_device *dev, *tmp;
1258
LIST_HEAD(tmp_list);
1259
1260
list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261
if (!(dev->flags & IFF_UP))
1262
list_move(&dev->unreg_list, &tmp_list);
1263
1264
__dev_close_many(head);
1265
1266
list_for_each_entry(dev, head, unreg_list) {
1267
rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268
call_netdevice_notifiers(NETDEV_DOWN, dev);
1269
}
1270
1271
/* rollback_registered_many needs the complete original list */
1272
list_splice(&tmp_list, head);
1273
return 0;
1274
}
1275
1276
/**
1277
* dev_close - shutdown an interface.
1278
* @dev: device to shutdown
1279
*
1280
* This function moves an active device into down state. A
1281
* %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282
* is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1283
* chain.
1284
*/
1285
int dev_close(struct net_device *dev)
1286
{
1287
if (dev->flags & IFF_UP) {
1288
LIST_HEAD(single);
1289
1290
list_add(&dev->unreg_list, &single);
1291
dev_close_many(&single);
1292
list_del(&single);
1293
}
1294
return 0;
1295
}
1296
EXPORT_SYMBOL(dev_close);
1297
1298
1299
/**
1300
* dev_disable_lro - disable Large Receive Offload on a device
1301
* @dev: device
1302
*
1303
* Disable Large Receive Offload (LRO) on a net device. Must be
1304
* called under RTNL. This is needed if received packets may be
1305
* forwarded to another interface.
1306
*/
1307
void dev_disable_lro(struct net_device *dev)
1308
{
1309
u32 flags;
1310
1311
/*
1312
* If we're trying to disable lro on a vlan device
1313
* use the underlying physical device instead
1314
*/
1315
if (is_vlan_dev(dev))
1316
dev = vlan_dev_real_dev(dev);
1317
1318
if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1319
flags = dev->ethtool_ops->get_flags(dev);
1320
else
1321
flags = ethtool_op_get_flags(dev);
1322
1323
if (!(flags & ETH_FLAG_LRO))
1324
return;
1325
1326
__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1327
if (unlikely(dev->features & NETIF_F_LRO))
1328
netdev_WARN(dev, "failed to disable LRO!\n");
1329
}
1330
EXPORT_SYMBOL(dev_disable_lro);
1331
1332
1333
static int dev_boot_phase = 1;
1334
1335
/**
1336
* register_netdevice_notifier - register a network notifier block
1337
* @nb: notifier
1338
*
1339
* Register a notifier to be called when network device events occur.
1340
* The notifier passed is linked into the kernel structures and must
1341
* not be reused until it has been unregistered. A negative errno code
1342
* is returned on a failure.
1343
*
1344
* When registered all registration and up events are replayed
1345
* to the new notifier to allow device to have a race free
1346
* view of the network device list.
1347
*/
1348
1349
int register_netdevice_notifier(struct notifier_block *nb)
1350
{
1351
struct net_device *dev;
1352
struct net_device *last;
1353
struct net *net;
1354
int err;
1355
1356
rtnl_lock();
1357
err = raw_notifier_chain_register(&netdev_chain, nb);
1358
if (err)
1359
goto unlock;
1360
if (dev_boot_phase)
1361
goto unlock;
1362
for_each_net(net) {
1363
for_each_netdev(net, dev) {
1364
err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1365
err = notifier_to_errno(err);
1366
if (err)
1367
goto rollback;
1368
1369
if (!(dev->flags & IFF_UP))
1370
continue;
1371
1372
nb->notifier_call(nb, NETDEV_UP, dev);
1373
}
1374
}
1375
1376
unlock:
1377
rtnl_unlock();
1378
return err;
1379
1380
rollback:
1381
last = dev;
1382
for_each_net(net) {
1383
for_each_netdev(net, dev) {
1384
if (dev == last)
1385
break;
1386
1387
if (dev->flags & IFF_UP) {
1388
nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1389
nb->notifier_call(nb, NETDEV_DOWN, dev);
1390
}
1391
nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1392
nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1393
}
1394
}
1395
1396
raw_notifier_chain_unregister(&netdev_chain, nb);
1397
goto unlock;
1398
}
1399
EXPORT_SYMBOL(register_netdevice_notifier);
1400
1401
/**
1402
* unregister_netdevice_notifier - unregister a network notifier block
1403
* @nb: notifier
1404
*
1405
* Unregister a notifier previously registered by
1406
* register_netdevice_notifier(). The notifier is unlinked into the
1407
* kernel structures and may then be reused. A negative errno code
1408
* is returned on a failure.
1409
*/
1410
1411
int unregister_netdevice_notifier(struct notifier_block *nb)
1412
{
1413
int err;
1414
1415
rtnl_lock();
1416
err = raw_notifier_chain_unregister(&netdev_chain, nb);
1417
rtnl_unlock();
1418
return err;
1419
}
1420
EXPORT_SYMBOL(unregister_netdevice_notifier);
1421
1422
/**
1423
* call_netdevice_notifiers - call all network notifier blocks
1424
* @val: value passed unmodified to notifier function
1425
* @dev: net_device pointer passed unmodified to notifier function
1426
*
1427
* Call all network notifier blocks. Parameters and return value
1428
* are as for raw_notifier_call_chain().
1429
*/
1430
1431
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1432
{
1433
ASSERT_RTNL();
1434
return raw_notifier_call_chain(&netdev_chain, val, dev);
1435
}
1436
EXPORT_SYMBOL(call_netdevice_notifiers);
1437
1438
/* When > 0 there are consumers of rx skb time stamps */
1439
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1440
1441
void net_enable_timestamp(void)
1442
{
1443
atomic_inc(&netstamp_needed);
1444
}
1445
EXPORT_SYMBOL(net_enable_timestamp);
1446
1447
void net_disable_timestamp(void)
1448
{
1449
atomic_dec(&netstamp_needed);
1450
}
1451
EXPORT_SYMBOL(net_disable_timestamp);
1452
1453
static inline void net_timestamp_set(struct sk_buff *skb)
1454
{
1455
if (atomic_read(&netstamp_needed))
1456
__net_timestamp(skb);
1457
else
1458
skb->tstamp.tv64 = 0;
1459
}
1460
1461
static inline void net_timestamp_check(struct sk_buff *skb)
1462
{
1463
if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1464
__net_timestamp(skb);
1465
}
1466
1467
static inline bool is_skb_forwardable(struct net_device *dev,
1468
struct sk_buff *skb)
1469
{
1470
unsigned int len;
1471
1472
if (!(dev->flags & IFF_UP))
1473
return false;
1474
1475
len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1476
if (skb->len <= len)
1477
return true;
1478
1479
/* if TSO is enabled, we don't care about the length as the packet
1480
* could be forwarded without being segmented before
1481
*/
1482
if (skb_is_gso(skb))
1483
return true;
1484
1485
return false;
1486
}
1487
1488
/**
1489
* dev_forward_skb - loopback an skb to another netif
1490
*
1491
* @dev: destination network device
1492
* @skb: buffer to forward
1493
*
1494
* return values:
1495
* NET_RX_SUCCESS (no congestion)
1496
* NET_RX_DROP (packet was dropped, but freed)
1497
*
1498
* dev_forward_skb can be used for injecting an skb from the
1499
* start_xmit function of one device into the receive queue
1500
* of another device.
1501
*
1502
* The receiving device may be in another namespace, so
1503
* we have to clear all information in the skb that could
1504
* impact namespace isolation.
1505
*/
1506
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1507
{
1508
skb_orphan(skb);
1509
nf_reset(skb);
1510
1511
if (unlikely(!is_skb_forwardable(dev, skb))) {
1512
atomic_long_inc(&dev->rx_dropped);
1513
kfree_skb(skb);
1514
return NET_RX_DROP;
1515
}
1516
skb_set_dev(skb, dev);
1517
skb->tstamp.tv64 = 0;
1518
skb->pkt_type = PACKET_HOST;
1519
skb->protocol = eth_type_trans(skb, dev);
1520
return netif_rx(skb);
1521
}
1522
EXPORT_SYMBOL_GPL(dev_forward_skb);
1523
1524
static inline int deliver_skb(struct sk_buff *skb,
1525
struct packet_type *pt_prev,
1526
struct net_device *orig_dev)
1527
{
1528
atomic_inc(&skb->users);
1529
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1530
}
1531
1532
/*
1533
* Support routine. Sends outgoing frames to any network
1534
* taps currently in use.
1535
*/
1536
1537
static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1538
{
1539
struct packet_type *ptype;
1540
struct sk_buff *skb2 = NULL;
1541
struct packet_type *pt_prev = NULL;
1542
1543
rcu_read_lock();
1544
list_for_each_entry_rcu(ptype, &ptype_all, list) {
1545
/* Never send packets back to the socket
1546
* they originated from - MvS ([email protected])
1547
*/
1548
if ((ptype->dev == dev || !ptype->dev) &&
1549
(ptype->af_packet_priv == NULL ||
1550
(struct sock *)ptype->af_packet_priv != skb->sk)) {
1551
if (pt_prev) {
1552
deliver_skb(skb2, pt_prev, skb->dev);
1553
pt_prev = ptype;
1554
continue;
1555
}
1556
1557
skb2 = skb_clone(skb, GFP_ATOMIC);
1558
if (!skb2)
1559
break;
1560
1561
net_timestamp_set(skb2);
1562
1563
/* skb->nh should be correctly
1564
set by sender, so that the second statement is
1565
just protection against buggy protocols.
1566
*/
1567
skb_reset_mac_header(skb2);
1568
1569
if (skb_network_header(skb2) < skb2->data ||
1570
skb2->network_header > skb2->tail) {
1571
if (net_ratelimit())
1572
printk(KERN_CRIT "protocol %04x is "
1573
"buggy, dev %s\n",
1574
ntohs(skb2->protocol),
1575
dev->name);
1576
skb_reset_network_header(skb2);
1577
}
1578
1579
skb2->transport_header = skb2->network_header;
1580
skb2->pkt_type = PACKET_OUTGOING;
1581
pt_prev = ptype;
1582
}
1583
}
1584
if (pt_prev)
1585
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1586
rcu_read_unlock();
1587
}
1588
1589
/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1590
* @dev: Network device
1591
* @txq: number of queues available
1592
*
1593
* If real_num_tx_queues is changed the tc mappings may no longer be
1594
* valid. To resolve this verify the tc mapping remains valid and if
1595
* not NULL the mapping. With no priorities mapping to this
1596
* offset/count pair it will no longer be used. In the worst case TC0
1597
* is invalid nothing can be done so disable priority mappings. If is
1598
* expected that drivers will fix this mapping if they can before
1599
* calling netif_set_real_num_tx_queues.
1600
*/
1601
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1602
{
1603
int i;
1604
struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1605
1606
/* If TC0 is invalidated disable TC mapping */
1607
if (tc->offset + tc->count > txq) {
1608
pr_warning("Number of in use tx queues changed "
1609
"invalidating tc mappings. Priority "
1610
"traffic classification disabled!\n");
1611
dev->num_tc = 0;
1612
return;
1613
}
1614
1615
/* Invalidated prio to tc mappings set to TC0 */
1616
for (i = 1; i < TC_BITMASK + 1; i++) {
1617
int q = netdev_get_prio_tc_map(dev, i);
1618
1619
tc = &dev->tc_to_txq[q];
1620
if (tc->offset + tc->count > txq) {
1621
pr_warning("Number of in use tx queues "
1622
"changed. Priority %i to tc "
1623
"mapping %i is no longer valid "
1624
"setting map to 0\n",
1625
i, q);
1626
netdev_set_prio_tc_map(dev, i, 0);
1627
}
1628
}
1629
}
1630
1631
/*
1632
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1633
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1634
*/
1635
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1636
{
1637
int rc;
1638
1639
if (txq < 1 || txq > dev->num_tx_queues)
1640
return -EINVAL;
1641
1642
if (dev->reg_state == NETREG_REGISTERED ||
1643
dev->reg_state == NETREG_UNREGISTERING) {
1644
ASSERT_RTNL();
1645
1646
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1647
txq);
1648
if (rc)
1649
return rc;
1650
1651
if (dev->num_tc)
1652
netif_setup_tc(dev, txq);
1653
1654
if (txq < dev->real_num_tx_queues)
1655
qdisc_reset_all_tx_gt(dev, txq);
1656
}
1657
1658
dev->real_num_tx_queues = txq;
1659
return 0;
1660
}
1661
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1662
1663
#ifdef CONFIG_RPS
1664
/**
1665
* netif_set_real_num_rx_queues - set actual number of RX queues used
1666
* @dev: Network device
1667
* @rxq: Actual number of RX queues
1668
*
1669
* This must be called either with the rtnl_lock held or before
1670
* registration of the net device. Returns 0 on success, or a
1671
* negative error code. If called before registration, it always
1672
* succeeds.
1673
*/
1674
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1675
{
1676
int rc;
1677
1678
if (rxq < 1 || rxq > dev->num_rx_queues)
1679
return -EINVAL;
1680
1681
if (dev->reg_state == NETREG_REGISTERED) {
1682
ASSERT_RTNL();
1683
1684
rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1685
rxq);
1686
if (rc)
1687
return rc;
1688
}
1689
1690
dev->real_num_rx_queues = rxq;
1691
return 0;
1692
}
1693
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1694
#endif
1695
1696
static inline void __netif_reschedule(struct Qdisc *q)
1697
{
1698
struct softnet_data *sd;
1699
unsigned long flags;
1700
1701
local_irq_save(flags);
1702
sd = &__get_cpu_var(softnet_data);
1703
q->next_sched = NULL;
1704
*sd->output_queue_tailp = q;
1705
sd->output_queue_tailp = &q->next_sched;
1706
raise_softirq_irqoff(NET_TX_SOFTIRQ);
1707
local_irq_restore(flags);
1708
}
1709
1710
void __netif_schedule(struct Qdisc *q)
1711
{
1712
if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1713
__netif_reschedule(q);
1714
}
1715
EXPORT_SYMBOL(__netif_schedule);
1716
1717
void dev_kfree_skb_irq(struct sk_buff *skb)
1718
{
1719
if (atomic_dec_and_test(&skb->users)) {
1720
struct softnet_data *sd;
1721
unsigned long flags;
1722
1723
local_irq_save(flags);
1724
sd = &__get_cpu_var(softnet_data);
1725
skb->next = sd->completion_queue;
1726
sd->completion_queue = skb;
1727
raise_softirq_irqoff(NET_TX_SOFTIRQ);
1728
local_irq_restore(flags);
1729
}
1730
}
1731
EXPORT_SYMBOL(dev_kfree_skb_irq);
1732
1733
void dev_kfree_skb_any(struct sk_buff *skb)
1734
{
1735
if (in_irq() || irqs_disabled())
1736
dev_kfree_skb_irq(skb);
1737
else
1738
dev_kfree_skb(skb);
1739
}
1740
EXPORT_SYMBOL(dev_kfree_skb_any);
1741
1742
1743
/**
1744
* netif_device_detach - mark device as removed
1745
* @dev: network device
1746
*
1747
* Mark device as removed from system and therefore no longer available.
1748
*/
1749
void netif_device_detach(struct net_device *dev)
1750
{
1751
if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1752
netif_running(dev)) {
1753
netif_tx_stop_all_queues(dev);
1754
}
1755
}
1756
EXPORT_SYMBOL(netif_device_detach);
1757
1758
/**
1759
* netif_device_attach - mark device as attached
1760
* @dev: network device
1761
*
1762
* Mark device as attached from system and restart if needed.
1763
*/
1764
void netif_device_attach(struct net_device *dev)
1765
{
1766
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1767
netif_running(dev)) {
1768
netif_tx_wake_all_queues(dev);
1769
__netdev_watchdog_up(dev);
1770
}
1771
}
1772
EXPORT_SYMBOL(netif_device_attach);
1773
1774
/**
1775
* skb_dev_set -- assign a new device to a buffer
1776
* @skb: buffer for the new device
1777
* @dev: network device
1778
*
1779
* If an skb is owned by a device already, we have to reset
1780
* all data private to the namespace a device belongs to
1781
* before assigning it a new device.
1782
*/
1783
#ifdef CONFIG_NET_NS
1784
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1785
{
1786
skb_dst_drop(skb);
1787
if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1788
secpath_reset(skb);
1789
nf_reset(skb);
1790
skb_init_secmark(skb);
1791
skb->mark = 0;
1792
skb->priority = 0;
1793
skb->nf_trace = 0;
1794
skb->ipvs_property = 0;
1795
#ifdef CONFIG_NET_SCHED
1796
skb->tc_index = 0;
1797
#endif
1798
}
1799
skb->dev = dev;
1800
}
1801
EXPORT_SYMBOL(skb_set_dev);
1802
#endif /* CONFIG_NET_NS */
1803
1804
/*
1805
* Invalidate hardware checksum when packet is to be mangled, and
1806
* complete checksum manually on outgoing path.
1807
*/
1808
int skb_checksum_help(struct sk_buff *skb)
1809
{
1810
__wsum csum;
1811
int ret = 0, offset;
1812
1813
if (skb->ip_summed == CHECKSUM_COMPLETE)
1814
goto out_set_summed;
1815
1816
if (unlikely(skb_shinfo(skb)->gso_size)) {
1817
/* Let GSO fix up the checksum. */
1818
goto out_set_summed;
1819
}
1820
1821
offset = skb_checksum_start_offset(skb);
1822
BUG_ON(offset >= skb_headlen(skb));
1823
csum = skb_checksum(skb, offset, skb->len - offset, 0);
1824
1825
offset += skb->csum_offset;
1826
BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1827
1828
if (skb_cloned(skb) &&
1829
!skb_clone_writable(skb, offset + sizeof(__sum16))) {
1830
ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1831
if (ret)
1832
goto out;
1833
}
1834
1835
*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1836
out_set_summed:
1837
skb->ip_summed = CHECKSUM_NONE;
1838
out:
1839
return ret;
1840
}
1841
EXPORT_SYMBOL(skb_checksum_help);
1842
1843
/**
1844
* skb_gso_segment - Perform segmentation on skb.
1845
* @skb: buffer to segment
1846
* @features: features for the output path (see dev->features)
1847
*
1848
* This function segments the given skb and returns a list of segments.
1849
*
1850
* It may return NULL if the skb requires no segmentation. This is
1851
* only possible when GSO is used for verifying header integrity.
1852
*/
1853
struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1854
{
1855
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1856
struct packet_type *ptype;
1857
__be16 type = skb->protocol;
1858
int vlan_depth = ETH_HLEN;
1859
int err;
1860
1861
while (type == htons(ETH_P_8021Q)) {
1862
struct vlan_hdr *vh;
1863
1864
if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1865
return ERR_PTR(-EINVAL);
1866
1867
vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1868
type = vh->h_vlan_encapsulated_proto;
1869
vlan_depth += VLAN_HLEN;
1870
}
1871
1872
skb_reset_mac_header(skb);
1873
skb->mac_len = skb->network_header - skb->mac_header;
1874
__skb_pull(skb, skb->mac_len);
1875
1876
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1877
struct net_device *dev = skb->dev;
1878
struct ethtool_drvinfo info = {};
1879
1880
if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1881
dev->ethtool_ops->get_drvinfo(dev, &info);
1882
1883
WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1884
info.driver, dev ? dev->features : 0L,
1885
skb->sk ? skb->sk->sk_route_caps : 0L,
1886
skb->len, skb->data_len, skb->ip_summed);
1887
1888
if (skb_header_cloned(skb) &&
1889
(err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1890
return ERR_PTR(err);
1891
}
1892
1893
rcu_read_lock();
1894
list_for_each_entry_rcu(ptype,
1895
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1896
if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1897
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1898
err = ptype->gso_send_check(skb);
1899
segs = ERR_PTR(err);
1900
if (err || skb_gso_ok(skb, features))
1901
break;
1902
__skb_push(skb, (skb->data -
1903
skb_network_header(skb)));
1904
}
1905
segs = ptype->gso_segment(skb, features);
1906
break;
1907
}
1908
}
1909
rcu_read_unlock();
1910
1911
__skb_push(skb, skb->data - skb_mac_header(skb));
1912
1913
return segs;
1914
}
1915
EXPORT_SYMBOL(skb_gso_segment);
1916
1917
/* Take action when hardware reception checksum errors are detected. */
1918
#ifdef CONFIG_BUG
1919
void netdev_rx_csum_fault(struct net_device *dev)
1920
{
1921
if (net_ratelimit()) {
1922
printk(KERN_ERR "%s: hw csum failure.\n",
1923
dev ? dev->name : "<unknown>");
1924
dump_stack();
1925
}
1926
}
1927
EXPORT_SYMBOL(netdev_rx_csum_fault);
1928
#endif
1929
1930
/* Actually, we should eliminate this check as soon as we know that:
 * 1. The IOMMU is present and can map all of the memory.
 * 2. No high memory really exists on this machine.
 */
1934
1935
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1936
{
1937
#ifdef CONFIG_HIGHMEM
1938
int i;
1939
if (!(dev->features & NETIF_F_HIGHDMA)) {
1940
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1941
if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1942
return 1;
1943
}
1944
1945
if (PCI_DMA_BUS_IS_PHYS) {
1946
struct device *pdev = dev->dev.parent;
1947
1948
if (!pdev)
1949
return 0;
1950
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1951
dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1952
if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1953
return 1;
1954
}
1955
}
1956
#endif
1957
return 0;
1958
}
1959
1960
struct dev_gso_cb {
1961
void (*destructor)(struct sk_buff *skb);
1962
};
1963
1964
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1965
1966
static void dev_gso_skb_destructor(struct sk_buff *skb)
1967
{
1968
struct dev_gso_cb *cb;
1969
1970
do {
1971
struct sk_buff *nskb = skb->next;
1972
1973
skb->next = nskb->next;
1974
nskb->next = NULL;
1975
kfree_skb(nskb);
1976
} while (skb->next);
1977
1978
cb = DEV_GSO_CB(skb);
1979
if (cb->destructor)
1980
cb->destructor(skb);
1981
}
1982
1983
/**
1984
* dev_gso_segment - Perform emulated hardware segmentation on skb.
1985
* @skb: buffer to segment
1986
* @features: device features as applicable to this skb
1987
*
1988
* This function segments the given skb and stores the list of segments
1989
* in skb->next.
1990
*/
1991
static int dev_gso_segment(struct sk_buff *skb, int features)
1992
{
1993
struct sk_buff *segs;
1994
1995
segs = skb_gso_segment(skb, features);
1996
1997
/* Verifying header integrity only. */
1998
if (!segs)
1999
return 0;
2000
2001
if (IS_ERR(segs))
2002
return PTR_ERR(segs);
2003
2004
skb->next = segs;
2005
DEV_GSO_CB(skb)->destructor = skb->destructor;
2006
skb->destructor = dev_gso_skb_destructor;
2007
2008
return 0;
2009
}
2010
2011
/*
 * Try to orphan skb early, right before transmission by the device.
 * We cannot orphan skb if tx timestamp is requested or the sk-reference
 * is needed at the driver level for other reasons, e.g. see net/can/raw.c
 */
static inline void skb_orphan_try(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (sk && !skb_shinfo(skb)->tx_flags) {
		/* skb_tx_hash() won't be able to get sk.
		 * We copy sk_hash into skb->rxhash
		 */
		if (!skb->rxhash)
			skb->rxhash = sk->sk_hash;
		skb_orphan(skb);
	}
}
2029
2030
static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2031
{
2032
return ((features & NETIF_F_GEN_CSUM) ||
2033
((features & NETIF_F_V4_CSUM) &&
2034
protocol == htons(ETH_P_IP)) ||
2035
((features & NETIF_F_V6_CSUM) &&
2036
protocol == htons(ETH_P_IPV6)) ||
2037
((features & NETIF_F_FCOE_CRC) &&
2038
protocol == htons(ETH_P_FCOE)));
2039
}
2040
2041
static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2042
{
2043
if (!can_checksum_protocol(features, protocol)) {
2044
features &= ~NETIF_F_ALL_CSUM;
2045
features &= ~NETIF_F_SG;
2046
} else if (illegal_highdma(skb->dev, skb)) {
2047
features &= ~NETIF_F_SG;
2048
}
2049
2050
return features;
2051
}
2052
2053
u32 netif_skb_features(struct sk_buff *skb)
2054
{
2055
__be16 protocol = skb->protocol;
2056
u32 features = skb->dev->features;
2057
2058
if (protocol == htons(ETH_P_8021Q)) {
2059
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2060
protocol = veh->h_vlan_encapsulated_proto;
2061
} else if (!vlan_tx_tag_present(skb)) {
2062
return harmonize_features(skb, protocol, features);
2063
}
2064
2065
features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2066
2067
if (protocol != htons(ETH_P_8021Q)) {
2068
return harmonize_features(skb, protocol, features);
2069
} else {
2070
features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2071
NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2072
return harmonize_features(skb, protocol, features);
2073
}
2074
}
2075
EXPORT_SYMBOL(netif_skb_features);
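/*
 * Illustrative sketch (not part of the original file): how the transmit path
 * uses the per-skb feature mask.  netif_skb_features() narrows dev->features
 * for this particular packet, and the result decides whether GSO must be
 * emulated and whether the checksum has to be completed in software, which is
 * exactly what dev_hard_start_xmit() checks below.  The helper name is
 * hypothetical.
 */
static inline bool example_needs_sw_help(struct sk_buff *skb)
{
	u32 features = netif_skb_features(skb);

	return netif_needs_gso(skb, features) ||
	       (skb->ip_summed == CHECKSUM_PARTIAL &&
		!(features & NETIF_F_ALL_CSUM));
}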
2076
2077
/*
 * Returns true if either:
 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
 * 2. skb is fragmented and the device does not support SG, or if
 *    at least one of the fragments is in highmem and the device does
 *    not support DMA from it.
 */
2084
static inline int skb_needs_linearize(struct sk_buff *skb,
2085
int features)
2086
{
2087
return skb_is_nonlinear(skb) &&
2088
((skb_has_frag_list(skb) &&
2089
!(features & NETIF_F_FRAGLIST)) ||
2090
(skb_shinfo(skb)->nr_frags &&
2091
!(features & NETIF_F_SG)));
2092
}
2093
2094
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2095
struct netdev_queue *txq)
2096
{
2097
const struct net_device_ops *ops = dev->netdev_ops;
2098
int rc = NETDEV_TX_OK;
2099
unsigned int skb_len;
2100
2101
if (likely(!skb->next)) {
2102
u32 features;
2103
2104
		/*
		 * If the device doesn't need skb->dst, release it right now
		 * while it's hot in this CPU's cache.
		 */
2108
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2109
skb_dst_drop(skb);
2110
2111
if (!list_empty(&ptype_all))
2112
dev_queue_xmit_nit(skb, dev);
2113
2114
skb_orphan_try(skb);
2115
2116
features = netif_skb_features(skb);
2117
2118
if (vlan_tx_tag_present(skb) &&
2119
!(features & NETIF_F_HW_VLAN_TX)) {
2120
skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2121
if (unlikely(!skb))
2122
goto out;
2123
2124
skb->vlan_tci = 0;
2125
}
2126
2127
if (netif_needs_gso(skb, features)) {
2128
if (unlikely(dev_gso_segment(skb, features)))
2129
goto out_kfree_skb;
2130
if (skb->next)
2131
goto gso;
2132
} else {
2133
if (skb_needs_linearize(skb, features) &&
2134
__skb_linearize(skb))
2135
goto out_kfree_skb;
2136
2137
/* If packet is not checksummed and device does not
2138
* support checksumming for this protocol, complete
2139
* checksumming here.
2140
*/
2141
if (skb->ip_summed == CHECKSUM_PARTIAL) {
2142
skb_set_transport_header(skb,
2143
skb_checksum_start_offset(skb));
2144
if (!(features & NETIF_F_ALL_CSUM) &&
2145
skb_checksum_help(skb))
2146
goto out_kfree_skb;
2147
}
2148
}
2149
2150
skb_len = skb->len;
2151
rc = ops->ndo_start_xmit(skb, dev);
2152
trace_net_dev_xmit(skb, rc, dev, skb_len);
2153
if (rc == NETDEV_TX_OK)
2154
txq_trans_update(txq);
2155
return rc;
2156
}
2157
2158
gso:
2159
do {
2160
struct sk_buff *nskb = skb->next;
2161
2162
skb->next = nskb->next;
2163
nskb->next = NULL;
2164
2165
		/*
		 * If the device doesn't need nskb->dst, release it right now
		 * while it's hot in this CPU's cache.
		 */
2169
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2170
skb_dst_drop(nskb);
2171
2172
skb_len = nskb->len;
2173
rc = ops->ndo_start_xmit(nskb, dev);
2174
trace_net_dev_xmit(nskb, rc, dev, skb_len);
2175
if (unlikely(rc != NETDEV_TX_OK)) {
2176
if (rc & ~NETDEV_TX_MASK)
2177
goto out_kfree_gso_skb;
2178
nskb->next = skb->next;
2179
skb->next = nskb;
2180
return rc;
2181
}
2182
txq_trans_update(txq);
2183
if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2184
return NETDEV_TX_BUSY;
2185
} while (skb->next);
2186
2187
out_kfree_gso_skb:
2188
if (likely(skb->next == NULL))
2189
skb->destructor = DEV_GSO_CB(skb)->destructor;
2190
out_kfree_skb:
2191
kfree_skb(skb);
2192
out:
2193
return rc;
2194
}
2195
2196
static u32 hashrnd __read_mostly;
2197
2198
/*
 * Returns a Tx hash based on the given packet descriptor and the number of
 * Tx queues to be used as a distribution range.
 */
2202
u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2203
unsigned int num_tx_queues)
2204
{
2205
u32 hash;
2206
u16 qoffset = 0;
2207
u16 qcount = num_tx_queues;
2208
2209
if (skb_rx_queue_recorded(skb)) {
2210
hash = skb_get_rx_queue(skb);
2211
while (unlikely(hash >= num_tx_queues))
2212
hash -= num_tx_queues;
2213
return hash;
2214
}
2215
2216
if (dev->num_tc) {
2217
u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2218
qoffset = dev->tc_to_txq[tc].offset;
2219
qcount = dev->tc_to_txq[tc].count;
2220
}
2221
2222
if (skb->sk && skb->sk->sk_hash)
2223
hash = skb->sk->sk_hash;
2224
else
2225
hash = (__force u16) skb->protocol ^ skb->rxhash;
2226
hash = jhash_1word(hash, hashrnd);
2227
2228
return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2229
}
2230
EXPORT_SYMBOL(__skb_tx_hash);
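/*
 * Illustrative sketch (not part of the original file): a driver that wants
 * hash-based queue selection in its ndo_select_queue() method can simply
 * delegate to skb_tx_hash(), the wrapper around __skb_tx_hash() that uses
 * dev->real_num_tx_queues as the distribution range.  The method name is
 * hypothetical.
 */
static inline u16 example_select_queue(struct net_device *dev,
				       struct sk_buff *skb)
{
	/* Special traffic could be steered to dedicated queues here;
	 * everything else falls back to the flow hash. */
	return skb_tx_hash(dev, skb);
}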
2231
2232
static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2233
{
2234
if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2235
if (net_ratelimit()) {
2236
pr_warning("%s selects TX queue %d, but "
2237
"real number of TX queues is %d\n",
2238
dev->name, queue_index, dev->real_num_tx_queues);
2239
}
2240
return 0;
2241
}
2242
return queue_index;
2243
}
2244
2245
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2246
{
2247
#ifdef CONFIG_XPS
2248
struct xps_dev_maps *dev_maps;
2249
struct xps_map *map;
2250
int queue_index = -1;
2251
2252
rcu_read_lock();
2253
dev_maps = rcu_dereference(dev->xps_maps);
2254
if (dev_maps) {
2255
map = rcu_dereference(
2256
dev_maps->cpu_map[raw_smp_processor_id()]);
2257
if (map) {
2258
if (map->len == 1)
2259
queue_index = map->queues[0];
2260
else {
2261
u32 hash;
2262
if (skb->sk && skb->sk->sk_hash)
2263
hash = skb->sk->sk_hash;
2264
else
2265
hash = (__force u16) skb->protocol ^
2266
skb->rxhash;
2267
hash = jhash_1word(hash, hashrnd);
2268
queue_index = map->queues[
2269
((u64)hash * map->len) >> 32];
2270
}
2271
if (unlikely(queue_index >= dev->real_num_tx_queues))
2272
queue_index = -1;
2273
}
2274
}
2275
rcu_read_unlock();
2276
2277
return queue_index;
2278
#else
2279
return -1;
2280
#endif
2281
}
2282
2283
static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2284
struct sk_buff *skb)
2285
{
2286
int queue_index;
2287
const struct net_device_ops *ops = dev->netdev_ops;
2288
2289
if (dev->real_num_tx_queues == 1)
2290
queue_index = 0;
2291
else if (ops->ndo_select_queue) {
2292
queue_index = ops->ndo_select_queue(dev, skb);
2293
queue_index = dev_cap_txqueue(dev, queue_index);
2294
} else {
2295
struct sock *sk = skb->sk;
2296
queue_index = sk_tx_queue_get(sk);
2297
2298
if (queue_index < 0 || skb->ooo_okay ||
2299
queue_index >= dev->real_num_tx_queues) {
2300
int old_index = queue_index;
2301
2302
queue_index = get_xps_queue(dev, skb);
2303
if (queue_index < 0)
2304
queue_index = skb_tx_hash(dev, skb);
2305
2306
if (queue_index != old_index && sk) {
2307
struct dst_entry *dst =
2308
rcu_dereference_check(sk->sk_dst_cache, 1);
2309
2310
if (dst && skb_dst(skb) == dst)
2311
sk_tx_queue_set(sk, queue_index);
2312
}
2313
}
2314
}
2315
2316
skb_set_queue_mapping(skb, queue_index);
2317
return netdev_get_tx_queue(dev, queue_index);
2318
}
2319
2320
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2321
struct net_device *dev,
2322
struct netdev_queue *txq)
2323
{
2324
spinlock_t *root_lock = qdisc_lock(q);
2325
bool contended;
2326
int rc;
2327
2328
qdisc_skb_cb(skb)->pkt_len = skb->len;
2329
qdisc_calculate_pkt_len(skb, q);
2330
/*
2331
* Heuristic to force contended enqueues to serialize on a
2332
* separate lock before trying to get qdisc main lock.
2333
* This permits __QDISC_STATE_RUNNING owner to get the lock more often
2334
* and dequeue packets faster.
2335
*/
2336
contended = qdisc_is_running(q);
2337
if (unlikely(contended))
2338
spin_lock(&q->busylock);
2339
2340
spin_lock(root_lock);
2341
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2342
kfree_skb(skb);
2343
rc = NET_XMIT_DROP;
2344
} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2345
qdisc_run_begin(q)) {
2346
/*
2347
* This is a work-conserving queue; there are no old skbs
2348
* waiting to be sent out; and the qdisc is not running -
2349
* xmit the skb directly.
2350
*/
2351
if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2352
skb_dst_force(skb);
2353
2354
qdisc_bstats_update(q, skb);
2355
2356
if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2357
if (unlikely(contended)) {
2358
spin_unlock(&q->busylock);
2359
contended = false;
2360
}
2361
__qdisc_run(q);
2362
} else
2363
qdisc_run_end(q);
2364
2365
rc = NET_XMIT_SUCCESS;
2366
} else {
2367
skb_dst_force(skb);
2368
rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2369
if (qdisc_run_begin(q)) {
2370
if (unlikely(contended)) {
2371
spin_unlock(&q->busylock);
2372
contended = false;
2373
}
2374
__qdisc_run(q);
2375
}
2376
}
2377
spin_unlock(root_lock);
2378
if (unlikely(contended))
2379
spin_unlock(&q->busylock);
2380
return rc;
2381
}
2382
2383
static DEFINE_PER_CPU(int, xmit_recursion);
2384
#define RECURSION_LIMIT 10
2385
2386
/**
2387
* dev_queue_xmit - transmit a buffer
2388
* @skb: buffer to transmit
2389
*
2390
* Queue a buffer for transmission to a network device. The caller must
2391
* have set the device and priority and built the buffer before calling
2392
* this function. The function can be called from an interrupt.
2393
*
2394
* A negative errno code is returned on a failure. A success does not
2395
* guarantee the frame will be transmitted as it may be dropped due
2396
* to congestion or traffic shaping.
2397
*
2398
* -----------------------------------------------------------------------------------
2399
* I notice this method can also return errors from the queue disciplines,
2400
* including NET_XMIT_DROP, which is a positive value. So, errors can also
2401
* be positive.
2402
*
2403
* Regardless of the return value, the skb is consumed, so it is currently
2404
* difficult to retry a send to this method. (You can bump the ref count
2405
* before sending to hold a reference for retry if you are careful.)
2406
*
2407
* When calling this method, interrupts MUST be enabled. This is because
2408
* the BH enable code must have IRQs enabled so that it will not deadlock.
2409
* --BLG
2410
*/
2411
int dev_queue_xmit(struct sk_buff *skb)
2412
{
2413
struct net_device *dev = skb->dev;
2414
struct netdev_queue *txq;
2415
struct Qdisc *q;
2416
int rc = -ENOMEM;
2417
2418
/* Disable soft irqs for various locks below. Also
2419
* stops preemption for RCU.
2420
*/
2421
rcu_read_lock_bh();
2422
2423
txq = dev_pick_tx(dev, skb);
2424
q = rcu_dereference_bh(txq->qdisc);
2425
2426
#ifdef CONFIG_NET_CLS_ACT
2427
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2428
#endif
2429
trace_net_dev_queue(skb);
2430
if (q->enqueue) {
2431
rc = __dev_xmit_skb(skb, q, dev, txq);
2432
goto out;
2433
}
2434
2435
	/* The device has no queue. Common case for software devices:
	   loopback, all sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here. (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible that they rely on the protection
	   made by us here.

	   Check this and take the lock anyway; it is not prone to deadlocks.
	   Or shortcut the noqueue qdisc entirely, which is even simpler 8)
	 */
2447
if (dev->flags & IFF_UP) {
2448
int cpu = smp_processor_id(); /* ok because BHs are off */
2449
2450
if (txq->xmit_lock_owner != cpu) {
2451
2452
if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2453
goto recursion_alert;
2454
2455
HARD_TX_LOCK(dev, txq, cpu);
2456
2457
if (!netif_tx_queue_stopped(txq)) {
2458
__this_cpu_inc(xmit_recursion);
2459
rc = dev_hard_start_xmit(skb, dev, txq);
2460
__this_cpu_dec(xmit_recursion);
2461
if (dev_xmit_complete(rc)) {
2462
HARD_TX_UNLOCK(dev, txq);
2463
goto out;
2464
}
2465
}
2466
HARD_TX_UNLOCK(dev, txq);
2467
if (net_ratelimit())
2468
printk(KERN_CRIT "Virtual device %s asks to "
2469
"queue packet!\n", dev->name);
2470
} else {
2471
/* Recursion is detected! It is possible,
2472
* unfortunately
2473
*/
2474
recursion_alert:
2475
if (net_ratelimit())
2476
printk(KERN_CRIT "Dead loop on virtual device "
2477
"%s, fix it urgently!\n", dev->name);
2478
}
2479
}
2480
2481
rc = -ENETDOWN;
2482
rcu_read_unlock_bh();
2483
2484
kfree_skb(skb);
2485
return rc;
2486
out:
2487
rcu_read_unlock_bh();
2488
return rc;
2489
}
2490
EXPORT_SYMBOL(dev_queue_xmit);
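/*
 * Illustrative sketch (not part of the original file): the minimal steps a
 * kernel sender takes before calling dev_queue_xmit().  The skb needs
 * skb->dev and skb->protocol set and a link-layer header built (here via
 * dev_hard_header() with a broadcast destination); dev_queue_xmit() always
 * consumes the skb, so the caller must not touch it afterwards.  The helper
 * name and the ETH_P_802_2 protocol value are placeholders for this sketch.
 */
static inline int example_send_frame(struct net_device *dev,
				     const void *payload, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), payload, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_2);

	if (dev_hard_header(skb, dev, ETH_P_802_2, dev->broadcast,
			    dev->dev_addr, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return dev_queue_xmit(skb);
}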
2491
2492
2493
/*=======================================================================
2494
Receiver routines
2495
=======================================================================*/
2496
2497
int netdev_max_backlog __read_mostly = 1000;
2498
int netdev_tstamp_prequeue __read_mostly = 1;
2499
int netdev_budget __read_mostly = 300;
2500
int weight_p __read_mostly = 64; /* old backlog weight */
2501
2502
/* Called with irq disabled */
2503
static inline void ____napi_schedule(struct softnet_data *sd,
2504
struct napi_struct *napi)
2505
{
2506
list_add_tail(&napi->poll_list, &sd->poll_list);
2507
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2508
}
2509
2510
/*
2511
* __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2512
* and src/dst port numbers. Returns a non-zero hash number on success
2513
* and 0 on failure.
2514
*/
2515
__u32 __skb_get_rxhash(struct sk_buff *skb)
2516
{
2517
int nhoff, hash = 0, poff;
2518
const struct ipv6hdr *ip6;
2519
const struct iphdr *ip;
2520
u8 ip_proto;
2521
u32 addr1, addr2, ihl;
2522
union {
2523
u32 v32;
2524
u16 v16[2];
2525
} ports;
2526
2527
nhoff = skb_network_offset(skb);
2528
2529
switch (skb->protocol) {
2530
case __constant_htons(ETH_P_IP):
2531
if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2532
goto done;
2533
2534
ip = (const struct iphdr *) (skb->data + nhoff);
2535
if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2536
ip_proto = 0;
2537
else
2538
ip_proto = ip->protocol;
2539
addr1 = (__force u32) ip->saddr;
2540
addr2 = (__force u32) ip->daddr;
2541
ihl = ip->ihl;
2542
break;
2543
case __constant_htons(ETH_P_IPV6):
2544
if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2545
goto done;
2546
2547
ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2548
ip_proto = ip6->nexthdr;
2549
addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2550
addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2551
ihl = (40 >> 2);
2552
break;
2553
default:
2554
goto done;
2555
}
2556
2557
ports.v32 = 0;
2558
poff = proto_ports_offset(ip_proto);
2559
if (poff >= 0) {
2560
nhoff += ihl * 4 + poff;
2561
if (pskb_may_pull(skb, nhoff + 4)) {
2562
ports.v32 = * (__force u32 *) (skb->data + nhoff);
2563
if (ports.v16[1] < ports.v16[0])
2564
swap(ports.v16[0], ports.v16[1]);
2565
}
2566
}
2567
2568
/* get a consistent hash (same value on both flow directions) */
2569
if (addr2 < addr1)
2570
swap(addr1, addr2);
2571
2572
hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2573
if (!hash)
2574
hash = 1;
2575
2576
done:
2577
return hash;
2578
}
2579
EXPORT_SYMBOL(__skb_get_rxhash);
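/*
 * Illustrative sketch (not part of the original file): callers normally go
 * through the skb_get_rxhash() wrapper, which returns the cached skb->rxhash
 * when a driver or the hardware already supplied one and only falls back to
 * the software computation in __skb_get_rxhash() above.  The bucket helper is
 * hypothetical.
 */
static inline u16 example_pick_bucket(struct sk_buff *skb, u16 nr_buckets)
{
	u32 hash = skb_get_rxhash(skb);

	/* Scale the 32-bit flow hash down to a bucket index. */
	return (u16)(((u64)hash * nr_buckets) >> 32);
}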
2580
2581
#ifdef CONFIG_RPS
2582
2583
/* One global table that all flow-based protocols share. */
2584
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2585
EXPORT_SYMBOL(rps_sock_flow_table);
2586
2587
static struct rps_dev_flow *
2588
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2589
struct rps_dev_flow *rflow, u16 next_cpu)
2590
{
2591
u16 tcpu;
2592
2593
tcpu = rflow->cpu = next_cpu;
2594
if (tcpu != RPS_NO_CPU) {
2595
#ifdef CONFIG_RFS_ACCEL
2596
struct netdev_rx_queue *rxqueue;
2597
struct rps_dev_flow_table *flow_table;
2598
struct rps_dev_flow *old_rflow;
2599
u32 flow_id;
2600
u16 rxq_index;
2601
int rc;
2602
2603
/* Should we steer this flow to a different hardware queue? */
2604
if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2605
!(dev->features & NETIF_F_NTUPLE))
2606
goto out;
2607
rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2608
if (rxq_index == skb_get_rx_queue(skb))
2609
goto out;
2610
2611
rxqueue = dev->_rx + rxq_index;
2612
flow_table = rcu_dereference(rxqueue->rps_flow_table);
2613
if (!flow_table)
2614
goto out;
2615
flow_id = skb->rxhash & flow_table->mask;
2616
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2617
rxq_index, flow_id);
2618
if (rc < 0)
2619
goto out;
2620
old_rflow = rflow;
2621
rflow = &flow_table->flows[flow_id];
2622
rflow->cpu = next_cpu;
2623
rflow->filter = rc;
2624
if (old_rflow->filter == rflow->filter)
2625
old_rflow->filter = RPS_NO_FILTER;
2626
out:
2627
#endif
2628
rflow->last_qtail =
2629
per_cpu(softnet_data, tcpu).input_queue_head;
2630
}
2631
2632
return rflow;
2633
}
2634
2635
/*
2636
* get_rps_cpu is called from netif_receive_skb and returns the target
2637
* CPU from the RPS map of the receiving queue for a given skb.
2638
* rcu_read_lock must be held on entry.
2639
*/
2640
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2641
struct rps_dev_flow **rflowp)
2642
{
2643
struct netdev_rx_queue *rxqueue;
2644
struct rps_map *map;
2645
struct rps_dev_flow_table *flow_table;
2646
struct rps_sock_flow_table *sock_flow_table;
2647
int cpu = -1;
2648
u16 tcpu;
2649
2650
if (skb_rx_queue_recorded(skb)) {
2651
u16 index = skb_get_rx_queue(skb);
2652
if (unlikely(index >= dev->real_num_rx_queues)) {
2653
WARN_ONCE(dev->real_num_rx_queues > 1,
2654
"%s received packet on queue %u, but number "
2655
"of RX queues is %u\n",
2656
dev->name, index, dev->real_num_rx_queues);
2657
goto done;
2658
}
2659
rxqueue = dev->_rx + index;
2660
} else
2661
rxqueue = dev->_rx;
2662
2663
map = rcu_dereference(rxqueue->rps_map);
2664
if (map) {
2665
if (map->len == 1 &&
2666
!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2667
tcpu = map->cpus[0];
2668
if (cpu_online(tcpu))
2669
cpu = tcpu;
2670
goto done;
2671
}
2672
} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2673
goto done;
2674
}
2675
2676
skb_reset_network_header(skb);
2677
if (!skb_get_rxhash(skb))
2678
goto done;
2679
2680
flow_table = rcu_dereference(rxqueue->rps_flow_table);
2681
sock_flow_table = rcu_dereference(rps_sock_flow_table);
2682
if (flow_table && sock_flow_table) {
2683
u16 next_cpu;
2684
struct rps_dev_flow *rflow;
2685
2686
rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2687
tcpu = rflow->cpu;
2688
2689
next_cpu = sock_flow_table->ents[skb->rxhash &
2690
sock_flow_table->mask];
2691
2692
/*
2693
* If the desired CPU (where last recvmsg was done) is
2694
* different from current CPU (one in the rx-queue flow
2695
* table entry), switch if one of the following holds:
2696
* - Current CPU is unset (equal to RPS_NO_CPU).
2697
* - Current CPU is offline.
2698
* - The current CPU's queue tail has advanced beyond the
2699
* last packet that was enqueued using this table entry.
2700
* This guarantees that all previous packets for the flow
2701
* have been dequeued, thus preserving in order delivery.
2702
*/
2703
if (unlikely(tcpu != next_cpu) &&
2704
(tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2705
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2706
rflow->last_qtail)) >= 0))
2707
rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2708
2709
if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2710
*rflowp = rflow;
2711
cpu = tcpu;
2712
goto done;
2713
}
2714
}
2715
2716
if (map) {
2717
tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2718
2719
if (cpu_online(tcpu)) {
2720
cpu = tcpu;
2721
goto done;
2722
}
2723
}
2724
2725
done:
2726
return cpu;
2727
}
2728
2729
#ifdef CONFIG_RFS_ACCEL
2730
2731
/**
2732
* rps_may_expire_flow - check whether an RFS hardware filter may be removed
2733
* @dev: Device on which the filter was set
2734
* @rxq_index: RX queue index
2735
* @flow_id: Flow ID passed to ndo_rx_flow_steer()
2736
* @filter_id: Filter ID returned by ndo_rx_flow_steer()
2737
*
2738
* Drivers that implement ndo_rx_flow_steer() should periodically call
2739
* this function for each installed filter and remove the filters for
2740
* which it returns %true.
2741
*/
2742
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2743
u32 flow_id, u16 filter_id)
2744
{
2745
struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2746
struct rps_dev_flow_table *flow_table;
2747
struct rps_dev_flow *rflow;
2748
bool expire = true;
2749
int cpu;
2750
2751
rcu_read_lock();
2752
flow_table = rcu_dereference(rxqueue->rps_flow_table);
2753
if (flow_table && flow_id <= flow_table->mask) {
2754
rflow = &flow_table->flows[flow_id];
2755
cpu = ACCESS_ONCE(rflow->cpu);
2756
if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2757
((int)(per_cpu(softnet_data, cpu).input_queue_head -
2758
rflow->last_qtail) <
2759
(int)(10 * flow_table->mask)))
2760
expire = false;
2761
}
2762
rcu_read_unlock();
2763
return expire;
2764
}
2765
EXPORT_SYMBOL(rps_may_expire_flow);
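/*
 * Illustrative sketch (not part of the original file): the periodic clean-up
 * an RFS-accelerated driver is expected to run.  For every filter it
 * programmed from ndo_rx_flow_steer() it asks rps_may_expire_flow() whether
 * the flow is still of interest and, if not, removes the hardware filter
 * again.  The filter table and the remove callback are hypothetical driver
 * state, not kernel API.
 */
struct example_hw_filter {
	u16 rxq_index;
	u32 flow_id;
	u16 filter_id;
	bool in_use;
};

static inline void example_expire_filters(struct net_device *dev,
					  struct example_hw_filter *tbl,
					  unsigned int n,
					  void (*remove)(struct net_device *,
							 u16 filter_id))
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			remove(dev, tbl[i].filter_id);
			tbl[i].in_use = false;
		}
	}
}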
2766
2767
#endif /* CONFIG_RFS_ACCEL */
2768
2769
/* Called from hardirq (IPI) context */
2770
static void rps_trigger_softirq(void *data)
2771
{
2772
struct softnet_data *sd = data;
2773
2774
____napi_schedule(sd, &sd->backlog);
2775
sd->received_rps++;
2776
}
2777
2778
#endif /* CONFIG_RPS */
2779
2780
/*
 * Check if this softnet_data structure belongs to another CPU.
 * If so, queue it on our IPI list and return 1;
 * otherwise return 0.
 */
2785
static int rps_ipi_queued(struct softnet_data *sd)
2786
{
2787
#ifdef CONFIG_RPS
2788
struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2789
2790
if (sd != mysd) {
2791
sd->rps_ipi_next = mysd->rps_ipi_list;
2792
mysd->rps_ipi_list = sd;
2793
2794
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2795
return 1;
2796
}
2797
#endif /* CONFIG_RPS */
2798
return 0;
2799
}
2800
2801
/*
2802
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
2803
* queue (may be a remote CPU queue).
2804
*/
2805
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2806
unsigned int *qtail)
2807
{
2808
struct softnet_data *sd;
2809
unsigned long flags;
2810
2811
sd = &per_cpu(softnet_data, cpu);
2812
2813
local_irq_save(flags);
2814
2815
rps_lock(sd);
2816
if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2817
if (skb_queue_len(&sd->input_pkt_queue)) {
2818
enqueue:
2819
__skb_queue_tail(&sd->input_pkt_queue, skb);
2820
input_queue_tail_incr_save(sd, qtail);
2821
rps_unlock(sd);
2822
local_irq_restore(flags);
2823
return NET_RX_SUCCESS;
2824
}
2825
2826
		/* Schedule NAPI for the backlog device.
		 * We can use a non-atomic operation since we own the queue lock.
		 */
2829
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2830
if (!rps_ipi_queued(sd))
2831
____napi_schedule(sd, &sd->backlog);
2832
}
2833
goto enqueue;
2834
}
2835
2836
sd->dropped++;
2837
rps_unlock(sd);
2838
2839
local_irq_restore(flags);
2840
2841
atomic_long_inc(&skb->dev->rx_dropped);
2842
kfree_skb(skb);
2843
return NET_RX_DROP;
2844
}
2845
2846
/**
 * netif_rx - post buffer to the network code
 * @skb: buffer to post
 *
 * This function receives a packet from a device driver and queues it for
 * the upper (protocol) levels to process. It always succeeds. The buffer
 * may be dropped during processing for congestion control or by the
 * protocol layers.
 *
 * return values:
 * NET_RX_SUCCESS (no congestion)
 * NET_RX_DROP (packet was dropped)
 *
 */
2860
2861
int netif_rx(struct sk_buff *skb)
2862
{
2863
int ret;
2864
2865
/* if netpoll wants it, pretend we never saw it */
2866
if (netpoll_rx(skb))
2867
return NET_RX_DROP;
2868
2869
if (netdev_tstamp_prequeue)
2870
net_timestamp_check(skb);
2871
2872
trace_netif_rx(skb);
2873
#ifdef CONFIG_RPS
2874
{
2875
struct rps_dev_flow voidflow, *rflow = &voidflow;
2876
int cpu;
2877
2878
preempt_disable();
2879
rcu_read_lock();
2880
2881
cpu = get_rps_cpu(skb->dev, skb, &rflow);
2882
if (cpu < 0)
2883
cpu = smp_processor_id();
2884
2885
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2886
2887
rcu_read_unlock();
2888
preempt_enable();
2889
}
2890
#else
2891
{
2892
unsigned int qtail;
2893
ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2894
put_cpu();
2895
}
2896
#endif
2897
return ret;
2898
}
2899
EXPORT_SYMBOL(netif_rx);
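/*
 * Illustrative sketch (not part of the original file): the classic non-NAPI
 * receive path that feeds netif_rx().  A driver's interrupt handler copies
 * the frame out of the hardware, lets eth_type_trans() set skb->protocol and
 * skb->dev, and queues the buffer on the per-CPU backlog via netif_rx()
 * (or netif_rx_ni() from process context).  The buffer/length pair stands in
 * for the real hardware access and the helper name is hypothetical.
 */
static inline void example_legacy_rx(struct net_device *dev,
				     const void *frame, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), frame, len);
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
}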
2900
2901
int netif_rx_ni(struct sk_buff *skb)
2902
{
2903
int err;
2904
2905
preempt_disable();
2906
err = netif_rx(skb);
2907
if (local_softirq_pending())
2908
do_softirq();
2909
preempt_enable();
2910
2911
return err;
2912
}
2913
EXPORT_SYMBOL(netif_rx_ni);
2914
2915
static void net_tx_action(struct softirq_action *h)
2916
{
2917
struct softnet_data *sd = &__get_cpu_var(softnet_data);
2918
2919
if (sd->completion_queue) {
2920
struct sk_buff *clist;
2921
2922
local_irq_disable();
2923
clist = sd->completion_queue;
2924
sd->completion_queue = NULL;
2925
local_irq_enable();
2926
2927
while (clist) {
2928
struct sk_buff *skb = clist;
2929
clist = clist->next;
2930
2931
WARN_ON(atomic_read(&skb->users));
2932
trace_kfree_skb(skb, net_tx_action);
2933
__kfree_skb(skb);
2934
}
2935
}
2936
2937
if (sd->output_queue) {
2938
struct Qdisc *head;
2939
2940
local_irq_disable();
2941
head = sd->output_queue;
2942
sd->output_queue = NULL;
2943
sd->output_queue_tailp = &sd->output_queue;
2944
local_irq_enable();
2945
2946
while (head) {
2947
struct Qdisc *q = head;
2948
spinlock_t *root_lock;
2949
2950
head = head->next_sched;
2951
2952
root_lock = qdisc_lock(q);
2953
if (spin_trylock(root_lock)) {
2954
smp_mb__before_clear_bit();
2955
clear_bit(__QDISC_STATE_SCHED,
2956
&q->state);
2957
qdisc_run(q);
2958
spin_unlock(root_lock);
2959
} else {
2960
if (!test_bit(__QDISC_STATE_DEACTIVATED,
2961
&q->state)) {
2962
__netif_reschedule(q);
2963
} else {
2964
smp_mb__before_clear_bit();
2965
clear_bit(__QDISC_STATE_SCHED,
2966
&q->state);
2967
}
2968
}
2969
}
2970
}
2971
}
2972
2973
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2974
(defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2975
/* This hook is defined here for ATM LANE */
2976
int (*br_fdb_test_addr_hook)(struct net_device *dev,
2977
unsigned char *addr) __read_mostly;
2978
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2979
#endif
2980
2981
#ifdef CONFIG_NET_CLS_ACT
2982
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for some useless
 * instructions (a compare and two extra stores) whenever it is not
 * enabled but CONFIG_NET_CLS_ACT is.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
2990
static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2991
{
2992
struct net_device *dev = skb->dev;
2993
u32 ttl = G_TC_RTTL(skb->tc_verd);
2994
int result = TC_ACT_OK;
2995
struct Qdisc *q;
2996
2997
if (unlikely(MAX_RED_LOOP < ttl++)) {
2998
if (net_ratelimit())
2999
pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3000
skb->skb_iif, dev->ifindex);
3001
return TC_ACT_SHOT;
3002
}
3003
3004
skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3005
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3006
3007
q = rxq->qdisc;
3008
if (q != &noop_qdisc) {
3009
spin_lock(qdisc_lock(q));
3010
if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3011
result = qdisc_enqueue_root(skb, q);
3012
spin_unlock(qdisc_lock(q));
3013
}
3014
3015
return result;
3016
}
3017
3018
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3019
struct packet_type **pt_prev,
3020
int *ret, struct net_device *orig_dev)
3021
{
3022
struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3023
3024
if (!rxq || rxq->qdisc == &noop_qdisc)
3025
goto out;
3026
3027
if (*pt_prev) {
3028
*ret = deliver_skb(skb, *pt_prev, orig_dev);
3029
*pt_prev = NULL;
3030
}
3031
3032
switch (ing_filter(skb, rxq)) {
3033
case TC_ACT_SHOT:
3034
case TC_ACT_STOLEN:
3035
kfree_skb(skb);
3036
return NULL;
3037
}
3038
3039
out:
3040
skb->tc_verd = 0;
3041
return skb;
3042
}
3043
#endif
3044
3045
/**
 * netdev_rx_handler_register - register receive handler
 * @dev: device to register a handler for
 * @rx_handler: receive handler to register
 * @rx_handler_data: data pointer that is used by rx handler
 *
 * Register a receive handler for a device. This handler will then be
 * called from __netif_receive_skb. A negative errno code is returned
 * on a failure.
 *
 * The caller must hold the rtnl_mutex.
 *
 * For a general description of rx_handler, see enum rx_handler_result.
 */
3059
int netdev_rx_handler_register(struct net_device *dev,
3060
rx_handler_func_t *rx_handler,
3061
void *rx_handler_data)
3062
{
3063
ASSERT_RTNL();
3064
3065
if (dev->rx_handler)
3066
return -EBUSY;
3067
3068
rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3069
rcu_assign_pointer(dev->rx_handler, rx_handler);
3070
3071
return 0;
3072
}
3073
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
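/*
 * Illustrative sketch (not part of the original file): the shape of an
 * rx_handler as invoked from __netif_receive_skb() below.  A handler such as
 * the bridge or macvlan hook receives the skb by reference under
 * rcu_read_lock(), may consume or redirect it, and reports what the caller
 * should do next through the rx_handler_result codes.  The "upper device"
 * stored in rx_handler_data is a hypothetical example of handler state.
 */
static inline rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct net_device *upper;

	/* Per-port state passed to netdev_rx_handler_register(). */
	upper = rcu_dereference(skb->dev->rx_handler_data);
	if (!upper)
		return RX_HANDLER_PASS;

	skb->dev = upper;
	*pskb = skb;
	return RX_HANDLER_ANOTHER;	/* re-run the receive loop on upper */
}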
3074
3075
/**
 * netdev_rx_handler_unregister - unregister receive handler
 * @dev: device to unregister a handler from
 *
 * Unregister a receive handler from a device.
 *
 * The caller must hold the rtnl_mutex.
 */
3083
void netdev_rx_handler_unregister(struct net_device *dev)
3084
{
3085
3086
ASSERT_RTNL();
3087
rcu_assign_pointer(dev->rx_handler, NULL);
3088
rcu_assign_pointer(dev->rx_handler_data, NULL);
3089
}
3090
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3091
3092
static int __netif_receive_skb(struct sk_buff *skb)
3093
{
3094
struct packet_type *ptype, *pt_prev;
3095
rx_handler_func_t *rx_handler;
3096
struct net_device *orig_dev;
3097
struct net_device *null_or_dev;
3098
bool deliver_exact = false;
3099
int ret = NET_RX_DROP;
3100
__be16 type;
3101
3102
if (!netdev_tstamp_prequeue)
3103
net_timestamp_check(skb);
3104
3105
trace_netif_receive_skb(skb);
3106
3107
/* if we've gotten here through NAPI, check netpoll */
3108
if (netpoll_receive_skb(skb))
3109
return NET_RX_DROP;
3110
3111
if (!skb->skb_iif)
3112
skb->skb_iif = skb->dev->ifindex;
3113
orig_dev = skb->dev;
3114
3115
skb_reset_network_header(skb);
3116
skb_reset_transport_header(skb);
3117
skb_reset_mac_len(skb);
3118
3119
pt_prev = NULL;
3120
3121
rcu_read_lock();
3122
3123
another_round:
3124
3125
__this_cpu_inc(softnet_data.processed);
3126
3127
if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3128
skb = vlan_untag(skb);
3129
if (unlikely(!skb))
3130
goto out;
3131
}
3132
3133
#ifdef CONFIG_NET_CLS_ACT
3134
if (skb->tc_verd & TC_NCLS) {
3135
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3136
goto ncls;
3137
}
3138
#endif
3139
3140
list_for_each_entry_rcu(ptype, &ptype_all, list) {
3141
if (!ptype->dev || ptype->dev == skb->dev) {
3142
if (pt_prev)
3143
ret = deliver_skb(skb, pt_prev, orig_dev);
3144
pt_prev = ptype;
3145
}
3146
}
3147
3148
#ifdef CONFIG_NET_CLS_ACT
3149
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3150
if (!skb)
3151
goto out;
3152
ncls:
3153
#endif
3154
3155
rx_handler = rcu_dereference(skb->dev->rx_handler);
3156
if (rx_handler) {
3157
if (pt_prev) {
3158
ret = deliver_skb(skb, pt_prev, orig_dev);
3159
pt_prev = NULL;
3160
}
3161
switch (rx_handler(&skb)) {
3162
case RX_HANDLER_CONSUMED:
3163
goto out;
3164
case RX_HANDLER_ANOTHER:
3165
goto another_round;
3166
case RX_HANDLER_EXACT:
3167
deliver_exact = true;
3168
case RX_HANDLER_PASS:
3169
break;
3170
default:
3171
BUG();
3172
}
3173
}
3174
3175
if (vlan_tx_tag_present(skb)) {
3176
if (pt_prev) {
3177
ret = deliver_skb(skb, pt_prev, orig_dev);
3178
pt_prev = NULL;
3179
}
3180
if (vlan_do_receive(&skb)) {
3181
ret = __netif_receive_skb(skb);
3182
goto out;
3183
} else if (unlikely(!skb))
3184
goto out;
3185
}
3186
3187
/* deliver only exact match when indicated */
3188
null_or_dev = deliver_exact ? skb->dev : NULL;
3189
3190
type = skb->protocol;
3191
list_for_each_entry_rcu(ptype,
3192
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3193
if (ptype->type == type &&
3194
(ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3195
ptype->dev == orig_dev)) {
3196
if (pt_prev)
3197
ret = deliver_skb(skb, pt_prev, orig_dev);
3198
pt_prev = ptype;
3199
}
3200
}
3201
3202
if (pt_prev) {
3203
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3204
} else {
3205
atomic_long_inc(&skb->dev->rx_dropped);
3206
kfree_skb(skb);
3207
		/* Jamal, now you will not be able to escape explaining
		 * to me how you were going to use this. :-)
		 */
3210
ret = NET_RX_DROP;
3211
}
3212
3213
out:
3214
rcu_read_unlock();
3215
return ret;
3216
}
3217
3218
/**
3219
* netif_receive_skb - process receive buffer from network
3220
* @skb: buffer to process
3221
*
3222
* netif_receive_skb() is the main receive data processing function.
3223
* It always succeeds. The buffer may be dropped during processing
3224
* for congestion control or by the protocol layers.
3225
*
3226
* This function may only be called from softirq context and interrupts
3227
* should be enabled.
3228
*
3229
* Return values (usually ignored):
3230
* NET_RX_SUCCESS: no congestion
3231
* NET_RX_DROP: packet was dropped
3232
*/
3233
int netif_receive_skb(struct sk_buff *skb)
3234
{
3235
if (netdev_tstamp_prequeue)
3236
net_timestamp_check(skb);
3237
3238
if (skb_defer_rx_timestamp(skb))
3239
return NET_RX_SUCCESS;
3240
3241
#ifdef CONFIG_RPS
3242
{
3243
struct rps_dev_flow voidflow, *rflow = &voidflow;
3244
int cpu, ret;
3245
3246
rcu_read_lock();
3247
3248
cpu = get_rps_cpu(skb->dev, skb, &rflow);
3249
3250
if (cpu >= 0) {
3251
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3252
rcu_read_unlock();
3253
} else {
3254
rcu_read_unlock();
3255
ret = __netif_receive_skb(skb);
3256
}
3257
3258
return ret;
3259
}
3260
#else
3261
return __netif_receive_skb(skb);
3262
#endif
3263
}
3264
EXPORT_SYMBOL(netif_receive_skb);
3265
3266
/* Network device is going away, flush any packets still pending
3267
* Called with irqs disabled.
3268
*/
3269
static void flush_backlog(void *arg)
3270
{
3271
struct net_device *dev = arg;
3272
struct softnet_data *sd = &__get_cpu_var(softnet_data);
3273
struct sk_buff *skb, *tmp;
3274
3275
rps_lock(sd);
3276
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3277
if (skb->dev == dev) {
3278
__skb_unlink(skb, &sd->input_pkt_queue);
3279
kfree_skb(skb);
3280
input_queue_head_incr(sd);
3281
}
3282
}
3283
rps_unlock(sd);
3284
3285
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3286
if (skb->dev == dev) {
3287
__skb_unlink(skb, &sd->process_queue);
3288
kfree_skb(skb);
3289
input_queue_head_incr(sd);
3290
}
3291
}
3292
}
3293
3294
static int napi_gro_complete(struct sk_buff *skb)
3295
{
3296
struct packet_type *ptype;
3297
__be16 type = skb->protocol;
3298
struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3299
int err = -ENOENT;
3300
3301
if (NAPI_GRO_CB(skb)->count == 1) {
3302
skb_shinfo(skb)->gso_size = 0;
3303
goto out;
3304
}
3305
3306
rcu_read_lock();
3307
list_for_each_entry_rcu(ptype, head, list) {
3308
if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3309
continue;
3310
3311
err = ptype->gro_complete(skb);
3312
break;
3313
}
3314
rcu_read_unlock();
3315
3316
if (err) {
3317
WARN_ON(&ptype->list == head);
3318
kfree_skb(skb);
3319
return NET_RX_SUCCESS;
3320
}
3321
3322
out:
3323
return netif_receive_skb(skb);
3324
}
3325
3326
inline void napi_gro_flush(struct napi_struct *napi)
3327
{
3328
struct sk_buff *skb, *next;
3329
3330
for (skb = napi->gro_list; skb; skb = next) {
3331
next = skb->next;
3332
skb->next = NULL;
3333
napi_gro_complete(skb);
3334
}
3335
3336
napi->gro_count = 0;
3337
napi->gro_list = NULL;
3338
}
3339
EXPORT_SYMBOL(napi_gro_flush);
3340
3341
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3342
{
3343
struct sk_buff **pp = NULL;
3344
struct packet_type *ptype;
3345
__be16 type = skb->protocol;
3346
struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3347
int same_flow;
3348
int mac_len;
3349
enum gro_result ret;
3350
3351
if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3352
goto normal;
3353
3354
if (skb_is_gso(skb) || skb_has_frag_list(skb))
3355
goto normal;
3356
3357
rcu_read_lock();
3358
list_for_each_entry_rcu(ptype, head, list) {
3359
if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3360
continue;
3361
3362
skb_set_network_header(skb, skb_gro_offset(skb));
3363
mac_len = skb->network_header - skb->mac_header;
3364
skb->mac_len = mac_len;
3365
NAPI_GRO_CB(skb)->same_flow = 0;
3366
NAPI_GRO_CB(skb)->flush = 0;
3367
NAPI_GRO_CB(skb)->free = 0;
3368
3369
pp = ptype->gro_receive(&napi->gro_list, skb);
3370
break;
3371
}
3372
rcu_read_unlock();
3373
3374
if (&ptype->list == head)
3375
goto normal;
3376
3377
same_flow = NAPI_GRO_CB(skb)->same_flow;
3378
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3379
3380
if (pp) {
3381
struct sk_buff *nskb = *pp;
3382
3383
*pp = nskb->next;
3384
nskb->next = NULL;
3385
napi_gro_complete(nskb);
3386
napi->gro_count--;
3387
}
3388
3389
if (same_flow)
3390
goto ok;
3391
3392
if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3393
goto normal;
3394
3395
napi->gro_count++;
3396
NAPI_GRO_CB(skb)->count = 1;
3397
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3398
skb->next = napi->gro_list;
3399
napi->gro_list = skb;
3400
ret = GRO_HELD;
3401
3402
pull:
3403
if (skb_headlen(skb) < skb_gro_offset(skb)) {
3404
int grow = skb_gro_offset(skb) - skb_headlen(skb);
3405
3406
BUG_ON(skb->end - skb->tail < grow);
3407
3408
memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3409
3410
skb->tail += grow;
3411
skb->data_len -= grow;
3412
3413
skb_shinfo(skb)->frags[0].page_offset += grow;
3414
skb_shinfo(skb)->frags[0].size -= grow;
3415
3416
if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3417
put_page(skb_shinfo(skb)->frags[0].page);
3418
memmove(skb_shinfo(skb)->frags,
3419
skb_shinfo(skb)->frags + 1,
3420
--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3421
}
3422
}
3423
3424
ok:
3425
return ret;
3426
3427
normal:
3428
ret = GRO_NORMAL;
3429
goto pull;
3430
}
3431
EXPORT_SYMBOL(dev_gro_receive);
3432
3433
static inline gro_result_t
3434
__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3435
{
3436
struct sk_buff *p;
3437
3438
for (p = napi->gro_list; p; p = p->next) {
3439
unsigned long diffs;
3440
3441
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3442
diffs |= p->vlan_tci ^ skb->vlan_tci;
3443
diffs |= compare_ether_header(skb_mac_header(p),
3444
skb_gro_mac_header(skb));
3445
NAPI_GRO_CB(p)->same_flow = !diffs;
3446
NAPI_GRO_CB(p)->flush = 0;
3447
}
3448
3449
return dev_gro_receive(napi, skb);
3450
}
3451
3452
gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3453
{
3454
switch (ret) {
3455
case GRO_NORMAL:
3456
if (netif_receive_skb(skb))
3457
ret = GRO_DROP;
3458
break;
3459
3460
case GRO_DROP:
3461
case GRO_MERGED_FREE:
3462
kfree_skb(skb);
3463
break;
3464
3465
case GRO_HELD:
3466
case GRO_MERGED:
3467
break;
3468
}
3469
3470
return ret;
3471
}
3472
EXPORT_SYMBOL(napi_skb_finish);
3473
3474
void skb_gro_reset_offset(struct sk_buff *skb)
3475
{
3476
NAPI_GRO_CB(skb)->data_offset = 0;
3477
NAPI_GRO_CB(skb)->frag0 = NULL;
3478
NAPI_GRO_CB(skb)->frag0_len = 0;
3479
3480
if (skb->mac_header == skb->tail &&
3481
!PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3482
NAPI_GRO_CB(skb)->frag0 =
3483
page_address(skb_shinfo(skb)->frags[0].page) +
3484
skb_shinfo(skb)->frags[0].page_offset;
3485
NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3486
}
3487
}
3488
EXPORT_SYMBOL(skb_gro_reset_offset);
3489
3490
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3491
{
3492
skb_gro_reset_offset(skb);
3493
3494
return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3495
}
3496
EXPORT_SYMBOL(napi_gro_receive);
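/*
 * Illustrative sketch (not part of the original file): where
 * napi_gro_receive() sits in a driver.  Inside its NAPI poll routine the
 * driver builds an skb for each completed RX descriptor, lets
 * eth_type_trans() fill in skb->protocol, and hands the skb to
 * napi_gro_receive() instead of netif_receive_skb() so that consecutive
 * packets of the same flow can be merged.  Descriptor handling is omitted
 * and the helper name is hypothetical.
 */
static inline void example_deliver_one(struct napi_struct *napi,
				       struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);
}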
3497
3498
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3499
{
3500
__skb_pull(skb, skb_headlen(skb));
3501
skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3502
skb->vlan_tci = 0;
3503
skb->dev = napi->dev;
3504
skb->skb_iif = 0;
3505
3506
napi->skb = skb;
3507
}
3508
3509
struct sk_buff *napi_get_frags(struct napi_struct *napi)
3510
{
3511
struct sk_buff *skb = napi->skb;
3512
3513
if (!skb) {
3514
skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3515
if (skb)
3516
napi->skb = skb;
3517
}
3518
return skb;
3519
}
3520
EXPORT_SYMBOL(napi_get_frags);
3521
3522
gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3523
gro_result_t ret)
3524
{
3525
switch (ret) {
3526
case GRO_NORMAL:
3527
case GRO_HELD:
3528
skb->protocol = eth_type_trans(skb, skb->dev);
3529
3530
if (ret == GRO_HELD)
3531
skb_gro_pull(skb, -ETH_HLEN);
3532
else if (netif_receive_skb(skb))
3533
ret = GRO_DROP;
3534
break;
3535
3536
case GRO_DROP:
3537
case GRO_MERGED_FREE:
3538
napi_reuse_skb(napi, skb);
3539
break;
3540
3541
case GRO_MERGED:
3542
break;
3543
}
3544
3545
return ret;
3546
}
3547
EXPORT_SYMBOL(napi_frags_finish);
3548
3549
struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3550
{
3551
struct sk_buff *skb = napi->skb;
3552
struct ethhdr *eth;
3553
unsigned int hlen;
3554
unsigned int off;
3555
3556
napi->skb = NULL;
3557
3558
skb_reset_mac_header(skb);
3559
skb_gro_reset_offset(skb);
3560
3561
off = skb_gro_offset(skb);
3562
hlen = off + sizeof(*eth);
3563
eth = skb_gro_header_fast(skb, off);
3564
if (skb_gro_header_hard(skb, hlen)) {
3565
eth = skb_gro_header_slow(skb, hlen, off);
3566
if (unlikely(!eth)) {
3567
napi_reuse_skb(napi, skb);
3568
skb = NULL;
3569
goto out;
3570
}
3571
}
3572
3573
skb_gro_pull(skb, sizeof(*eth));
3574
3575
/*
3576
* This works because the only protocols we care about don't require
3577
* special handling. We'll fix it up properly at the end.
3578
*/
3579
skb->protocol = eth->h_proto;
3580
3581
out:
3582
return skb;
3583
}
3584
EXPORT_SYMBOL(napi_frags_skb);
3585
3586
gro_result_t napi_gro_frags(struct napi_struct *napi)
3587
{
3588
struct sk_buff *skb = napi_frags_skb(napi);
3589
3590
if (!skb)
3591
return GRO_DROP;
3592
3593
return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3594
}
3595
EXPORT_SYMBOL(napi_gro_frags);
3596
3597
/*
3598
* net_rps_action sends any pending IPI's for rps.
3599
* Note: called with local irq disabled, but exits with local irq enabled.
3600
*/
3601
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3602
{
3603
#ifdef CONFIG_RPS
3604
struct softnet_data *remsd = sd->rps_ipi_list;
3605
3606
if (remsd) {
3607
sd->rps_ipi_list = NULL;
3608
3609
local_irq_enable();
3610
3611
/* Send pending IPI's to kick RPS processing on remote cpus. */
3612
while (remsd) {
3613
struct softnet_data *next = remsd->rps_ipi_next;
3614
3615
if (cpu_online(remsd->cpu))
3616
__smp_call_function_single(remsd->cpu,
3617
&remsd->csd, 0);
3618
remsd = next;
3619
}
3620
} else
3621
#endif
3622
local_irq_enable();
3623
}
3624
3625
static int process_backlog(struct napi_struct *napi, int quota)
3626
{
3627
int work = 0;
3628
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3629
3630
#ifdef CONFIG_RPS
3631
	/* Check if we have pending IPIs; it's better to send them now
	 * rather than waiting for net_rx_action() to end.
	 */
3634
if (sd->rps_ipi_list) {
3635
local_irq_disable();
3636
net_rps_action_and_irq_enable(sd);
3637
}
3638
#endif
3639
napi->weight = weight_p;
3640
local_irq_disable();
3641
while (work < quota) {
3642
struct sk_buff *skb;
3643
unsigned int qlen;
3644
3645
while ((skb = __skb_dequeue(&sd->process_queue))) {
3646
local_irq_enable();
3647
__netif_receive_skb(skb);
3648
local_irq_disable();
3649
input_queue_head_incr(sd);
3650
if (++work >= quota) {
3651
local_irq_enable();
3652
return work;
3653
}
3654
}
3655
3656
rps_lock(sd);
3657
qlen = skb_queue_len(&sd->input_pkt_queue);
3658
if (qlen)
3659
skb_queue_splice_tail_init(&sd->input_pkt_queue,
3660
&sd->process_queue);
3661
3662
if (qlen < quota - work) {
3663
			/*
			 * Inline a custom version of __napi_complete().
			 * Only the current CPU owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on the backlog, so we can use a plain write instead
			 * of clear_bit() and we don't need an smp_mb() memory
			 * barrier.
			 */
3670
list_del(&napi->poll_list);
3671
napi->state = 0;
3672
3673
quota = work + qlen;
3674
}
3675
rps_unlock(sd);
3676
}
3677
local_irq_enable();
3678
3679
return work;
3680
}
3681
3682
/**
3683
* __napi_schedule - schedule for receive
3684
* @n: entry to schedule
3685
*
3686
* The entry's receive function will be scheduled to run
3687
*/
3688
void __napi_schedule(struct napi_struct *n)
3689
{
3690
unsigned long flags;
3691
3692
local_irq_save(flags);
3693
____napi_schedule(&__get_cpu_var(softnet_data), n);
3694
local_irq_restore(flags);
3695
}
3696
EXPORT_SYMBOL(__napi_schedule);
3697
3698
void __napi_complete(struct napi_struct *n)
3699
{
3700
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3701
BUG_ON(n->gro_list);
3702
3703
list_del(&n->poll_list);
3704
smp_mb__before_clear_bit();
3705
clear_bit(NAPI_STATE_SCHED, &n->state);
3706
}
3707
EXPORT_SYMBOL(__napi_complete);
3708
3709
void napi_complete(struct napi_struct *n)
3710
{
3711
unsigned long flags;
3712
3713
	/*
	 * Don't let napi dequeue from the CPU poll list
	 * just in case it's running on a different CPU.
	 */
3717
if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3718
return;
3719
3720
napi_gro_flush(n);
3721
local_irq_save(flags);
3722
__napi_complete(n);
3723
local_irq_restore(flags);
3724
}
3725
EXPORT_SYMBOL(napi_complete);
3726
3727
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3728
int (*poll)(struct napi_struct *, int), int weight)
3729
{
3730
INIT_LIST_HEAD(&napi->poll_list);
3731
napi->gro_count = 0;
3732
napi->gro_list = NULL;
3733
napi->skb = NULL;
3734
napi->poll = poll;
3735
napi->weight = weight;
3736
list_add(&napi->dev_list, &dev->napi_list);
3737
napi->dev = dev;
3738
#ifdef CONFIG_NETPOLL
3739
spin_lock_init(&napi->poll_lock);
3740
napi->poll_owner = -1;
3741
#endif
3742
set_bit(NAPI_STATE_SCHED, &napi->state);
3743
}
3744
EXPORT_SYMBOL(netif_napi_add);
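/*
 * Illustrative sketch (not part of the original file): the registration and
 * poll pattern netif_napi_add() expects from drivers.  The poll callback may
 * process at most "budget" packets, and only when it did less work than
 * allowed may it call napi_complete() and re-enable its device's RX
 * interrupts.  example_priv, the commented-out RX-ring cleanup and the
 * interrupt toggling are hypothetical driver pieces, not kernel API.
 */
struct example_priv {
	struct napi_struct napi;
	struct net_device *dev;
};

static inline int example_poll(struct napi_struct *napi, int budget)
{
	int work_done = 0;

	/* work_done = example_clean_rx_ring(napi, budget); delivering each
	 * skb via napi_gro_receive() or netif_receive_skb(). */

	if (work_done < budget) {
		napi_complete(napi);
		/* re-enable RX interrupts on the device here */
	}
	return work_done;
}

static inline void example_setup_napi(struct example_priv *priv)
{
	netif_napi_add(priv->dev, &priv->napi, example_poll, 64);
	napi_enable(&priv->napi);
}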
3745
3746
void netif_napi_del(struct napi_struct *napi)
3747
{
3748
struct sk_buff *skb, *next;
3749
3750
list_del_init(&napi->dev_list);
3751
napi_free_frags(napi);
3752
3753
for (skb = napi->gro_list; skb; skb = next) {
3754
next = skb->next;
3755
skb->next = NULL;
3756
kfree_skb(skb);
3757
}
3758
3759
napi->gro_list = NULL;
3760
napi->gro_count = 0;
3761
}
3762
EXPORT_SYMBOL(netif_napi_del);
3763
3764
static void net_rx_action(struct softirq_action *h)
3765
{
3766
struct softnet_data *sd = &__get_cpu_var(softnet_data);
3767
unsigned long time_limit = jiffies + 2;
3768
int budget = netdev_budget;
3769
void *have;
3770
3771
local_irq_disable();
3772
3773
while (!list_empty(&sd->poll_list)) {
3774
struct napi_struct *n;
3775
int work, weight;
3776
3777
		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which will allow
		 * an average latency of 1.5/HZ.
		 */
3781
if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3782
goto softnet_break;
3783
3784
local_irq_enable();
3785
3786
/* Even though interrupts have been re-enabled, this
3787
* access is safe because interrupts can only add new
3788
* entries to the tail of this list, and only ->poll()
3789
* calls can remove this head entry from the list.
3790
*/
3791
n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3792
3793
have = netpoll_poll_lock(n);
3794
3795
weight = n->weight;
3796
3797
/* This NAPI_STATE_SCHED test is for avoiding a race
3798
* with netpoll's poll_napi(). Only the entity which
3799
* obtains the lock and sees NAPI_STATE_SCHED set will
3800
* actually make the ->poll() call. Therefore we avoid
3801
* accidentally calling ->poll() when NAPI is not scheduled.
3802
*/
3803
work = 0;
3804
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3805
work = n->poll(n, weight);
3806
trace_napi_poll(n);
3807
}
3808
3809
WARN_ON_ONCE(work > weight);
3810
3811
budget -= work;
3812
3813
local_irq_disable();
3814
3815
/* Drivers must not modify the NAPI state if they
3816
* consume the entire weight. In such cases this code
3817
* still "owns" the NAPI instance and therefore can
3818
* move the instance around on the list at-will.
3819
*/
3820
if (unlikely(work == weight)) {
3821
if (unlikely(napi_disable_pending(n))) {
3822
local_irq_enable();
3823
napi_complete(n);
3824
local_irq_disable();
3825
} else
3826
list_move_tail(&n->poll_list, &sd->poll_list);
3827
}
3828
3829
netpoll_poll_unlock(have);
3830
}
3831
out:
3832
net_rps_action_and_irq_enable(sd);
3833
3834
#ifdef CONFIG_NET_DMA
3835
/*
3836
* There may not be any more sk_buffs coming right now, so push
3837
* any pending DMA copies to hardware
3838
*/
3839
dma_issue_pending_all();
3840
#endif
3841
3842
return;
3843
3844
softnet_break:
3845
sd->time_squeeze++;
3846
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3847
goto out;
3848
}
3849
3850
static gifconf_func_t *gifconf_list[NPROTO];
3851
3852
/**
3853
* register_gifconf - register a SIOCGIF handler
3854
* @family: Address family
3855
* @gifconf: Function handler
3856
*
3857
* Register protocol dependent address dumping routines. The handler
3858
* that is passed must not be freed or reused until it has been replaced
3859
* by another handler.
3860
*/
3861
int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3862
{
3863
if (family >= NPROTO)
3864
return -EINVAL;
3865
gifconf_list[family] = gifconf;
3866
return 0;
3867
}
3868
EXPORT_SYMBOL(register_gifconf);
3869
3870
3871
/*
3872
* Map an interface index to its name (SIOCGIFNAME)
3873
*/
3874
3875
/*
3876
* We need this ioctl for efficient implementation of the
3877
* if_indextoname() function required by the IPv6 API. Without
3878
* it, we would have to search all the interfaces to find a
3879
* match. --pb
3880
*/
3881
3882
static int dev_ifname(struct net *net, struct ifreq __user *arg)
3883
{
3884
struct net_device *dev;
3885
struct ifreq ifr;
3886
3887
/*
3888
* Fetch the caller's info block.
3889
*/
3890
3891
if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3892
return -EFAULT;
3893
3894
rcu_read_lock();
3895
dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3896
if (!dev) {
3897
rcu_read_unlock();
3898
return -ENODEV;
3899
}
3900
3901
strcpy(ifr.ifr_name, dev->name);
3902
rcu_read_unlock();
3903
3904
if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3905
return -EFAULT;
3906
return 0;
3907
}
3908
3909
/*
3910
* Perform a SIOCGIFCONF call. This structure will change
3911
* size eventually, and there is nothing I can do about it.
3912
* Thus we will need a 'compatibility mode'.
3913
*/
3914
3915
static int dev_ifconf(struct net *net, char __user *arg)
3916
{
3917
struct ifconf ifc;
3918
struct net_device *dev;
3919
char __user *pos;
3920
int len;
3921
int total;
3922
int i;
3923
3924
/*
3925
* Fetch the caller's info block.
3926
*/
3927
3928
if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3929
return -EFAULT;
3930
3931
pos = ifc.ifc_buf;
3932
len = ifc.ifc_len;
3933
3934
/*
3935
* Loop over the interfaces, and write an info block for each.
3936
*/
3937
3938
total = 0;
3939
for_each_netdev(net, dev) {
3940
for (i = 0; i < NPROTO; i++) {
3941
if (gifconf_list[i]) {
3942
int done;
3943
if (!pos)
3944
done = gifconf_list[i](dev, NULL, 0);
3945
else
3946
done = gifconf_list[i](dev, pos + total,
3947
len - total);
3948
if (done < 0)
3949
return -EFAULT;
3950
total += done;
3951
}
3952
}
3953
}
3954
3955
/*
3956
* All done. Write the updated control block back to the caller.
3957
*/
3958
ifc.ifc_len = total;
3959
3960
/*
3961
* Both BSD and Solaris return 0 here, so we do too.
3962
*/
3963
return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3964
}
3965
3966
#ifdef CONFIG_PROC_FS
3967
/*
3968
* This is invoked by the /proc filesystem handler to display a device
3969
* in detail.
3970
*/
3971
void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3972
__acquires(RCU)
3973
{
3974
struct net *net = seq_file_net(seq);
3975
loff_t off;
3976
struct net_device *dev;
3977
3978
rcu_read_lock();
3979
if (!*pos)
3980
return SEQ_START_TOKEN;
3981
3982
off = 1;
3983
for_each_netdev_rcu(net, dev)
3984
if (off++ == *pos)
3985
return dev;
3986
3987
return NULL;
3988
}
3989
3990
void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3991
{
3992
struct net_device *dev = v;
3993
3994
if (v == SEQ_START_TOKEN)
3995
dev = first_net_device_rcu(seq_file_net(seq));
3996
else
3997
dev = next_net_device_rcu(dev);
3998
3999
++*pos;
4000
return dev;
4001
}
4002
4003
void dev_seq_stop(struct seq_file *seq, void *v)
4004
__releases(RCU)
4005
{
4006
rcu_read_unlock();
4007
}
4008
4009
static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4010
{
4011
struct rtnl_link_stats64 temp;
4012
const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4013
4014
seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4015
"%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4016
dev->name, stats->rx_bytes, stats->rx_packets,
4017
stats->rx_errors,
4018
stats->rx_dropped + stats->rx_missed_errors,
4019
stats->rx_fifo_errors,
4020
stats->rx_length_errors + stats->rx_over_errors +
4021
stats->rx_crc_errors + stats->rx_frame_errors,
4022
stats->rx_compressed, stats->multicast,
4023
stats->tx_bytes, stats->tx_packets,
4024
stats->tx_errors, stats->tx_dropped,
4025
stats->tx_fifo_errors, stats->collisions,
4026
stats->tx_carrier_errors +
4027
stats->tx_aborted_errors +
4028
stats->tx_window_errors +
4029
stats->tx_heartbeat_errors,
4030
stats->tx_compressed);
4031
}
4032
4033
/*
4034
* Called from the PROCfs module. This now uses the new arbitrary sized
4035
* /proc/net interface to create /proc/net/dev
4036
*/
4037
static int dev_seq_show(struct seq_file *seq, void *v)
4038
{
4039
if (v == SEQ_START_TOKEN)
4040
seq_puts(seq, "Inter-| Receive "
4041
" | Transmit\n"
4042
" face |bytes packets errs drop fifo frame "
4043
"compressed multicast|bytes packets errs "
4044
"drop fifo colls carrier compressed\n");
4045
else
4046
dev_seq_printf_stats(seq, v);
4047
return 0;
4048
}
4049
4050
static struct softnet_data *softnet_get_online(loff_t *pos)
4051
{
4052
struct softnet_data *sd = NULL;
4053
4054
while (*pos < nr_cpu_ids)
4055
if (cpu_online(*pos)) {
4056
sd = &per_cpu(softnet_data, *pos);
4057
break;
4058
} else
4059
++*pos;
4060
return sd;
4061
}
4062
4063
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4064
{
4065
return softnet_get_online(pos);
4066
}
4067
4068
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4069
{
4070
++*pos;
4071
return softnet_get_online(pos);
4072
}
4073
4074
static void softnet_seq_stop(struct seq_file *seq, void *v)
4075
{
4076
}
4077
4078
static int softnet_seq_show(struct seq_file *seq, void *v)
4079
{
4080
struct softnet_data *sd = v;
4081
4082
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4083
sd->processed, sd->dropped, sd->time_squeeze, 0,
4084
0, 0, 0, 0, /* was fastroute */
4085
sd->cpu_collision, sd->received_rps);
4086
return 0;
4087
}
4088
4089
static const struct seq_operations dev_seq_ops = {
4090
.start = dev_seq_start,
4091
.next = dev_seq_next,
4092
.stop = dev_seq_stop,
4093
.show = dev_seq_show,
4094
};
4095
4096
static int dev_seq_open(struct inode *inode, struct file *file)
4097
{
4098
return seq_open_net(inode, file, &dev_seq_ops,
4099
sizeof(struct seq_net_private));
4100
}
4101
4102
static const struct file_operations dev_seq_fops = {
4103
.owner = THIS_MODULE,
4104
.open = dev_seq_open,
4105
.read = seq_read,
4106
.llseek = seq_lseek,
4107
.release = seq_release_net,
4108
};
4109
4110
static const struct seq_operations softnet_seq_ops = {
4111
.start = softnet_seq_start,
4112
.next = softnet_seq_next,
4113
.stop = softnet_seq_stop,
4114
.show = softnet_seq_show,
4115
};
4116
4117
static int softnet_seq_open(struct inode *inode, struct file *file)
4118
{
4119
return seq_open(file, &softnet_seq_ops);
4120
}
4121
4122
static const struct file_operations softnet_seq_fops = {
4123
.owner = THIS_MODULE,
4124
.open = softnet_seq_open,
4125
.read = seq_read,
4126
.llseek = seq_lseek,
4127
.release = seq_release,
4128
};
4129
4130
static void *ptype_get_idx(loff_t pos)
4131
{
4132
struct packet_type *pt = NULL;
4133
loff_t i = 0;
4134
int t;
4135
4136
list_for_each_entry_rcu(pt, &ptype_all, list) {
4137
if (i == pos)
4138
return pt;
4139
++i;
4140
}
4141
4142
for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4143
list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4144
if (i == pos)
4145
return pt;
4146
++i;
4147
}
4148
}
4149
return NULL;
4150
}
4151
4152
static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4153
__acquires(RCU)
4154
{
4155
rcu_read_lock();
4156
return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4157
}
4158
4159
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4160
{
4161
struct packet_type *pt;
4162
struct list_head *nxt;
4163
int hash;
4164
4165
++*pos;
4166
if (v == SEQ_START_TOKEN)
4167
return ptype_get_idx(0);
4168
4169
pt = v;
4170
nxt = pt->list.next;
4171
if (pt->type == htons(ETH_P_ALL)) {
4172
if (nxt != &ptype_all)
4173
goto found;
4174
hash = 0;
4175
nxt = ptype_base[0].next;
4176
} else
4177
hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4178
4179
while (nxt == &ptype_base[hash]) {
4180
if (++hash >= PTYPE_HASH_SIZE)
4181
return NULL;
4182
nxt = ptype_base[hash].next;
4183
}
4184
found:
4185
return list_entry(nxt, struct packet_type, list);
4186
}
4187
4188
static void ptype_seq_stop(struct seq_file *seq, void *v)
4189
__releases(RCU)
4190
{
4191
rcu_read_unlock();
4192
}
4193
4194
static int ptype_seq_show(struct seq_file *seq, void *v)
4195
{
4196
struct packet_type *pt = v;
4197
4198
if (v == SEQ_START_TOKEN)
4199
seq_puts(seq, "Type Device Function\n");
4200
else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4201
if (pt->type == htons(ETH_P_ALL))
4202
seq_puts(seq, "ALL ");
4203
else
4204
seq_printf(seq, "%04x", ntohs(pt->type));
4205
4206
seq_printf(seq, " %-8s %pF\n",
4207
pt->dev ? pt->dev->name : "", pt->func);
4208
}
4209
4210
return 0;
4211
}
4212
4213
static const struct seq_operations ptype_seq_ops = {
4214
.start = ptype_seq_start,
4215
.next = ptype_seq_next,
4216
.stop = ptype_seq_stop,
4217
.show = ptype_seq_show,
4218
};
4219
4220
static int ptype_seq_open(struct inode *inode, struct file *file)
4221
{
4222
return seq_open_net(inode, file, &ptype_seq_ops,
4223
sizeof(struct seq_net_private));
4224
}
4225
4226
static const struct file_operations ptype_seq_fops = {
4227
.owner = THIS_MODULE,
4228
.open = ptype_seq_open,
4229
.read = seq_read,
4230
.llseek = seq_lseek,
4231
.release = seq_release_net,
4232
};
4233
4234
4235
static int __net_init dev_proc_net_init(struct net *net)
4236
{
4237
int rc = -ENOMEM;
4238
4239
if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4240
goto out;
4241
if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4242
goto out_dev;
4243
if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4244
goto out_softnet;
4245
4246
if (wext_proc_init(net))
4247
goto out_ptype;
4248
rc = 0;
4249
out:
4250
return rc;
4251
out_ptype:
4252
proc_net_remove(net, "ptype");
4253
out_softnet:
4254
proc_net_remove(net, "softnet_stat");
4255
out_dev:
4256
proc_net_remove(net, "dev");
4257
goto out;
4258
}
4259
4260
static void __net_exit dev_proc_net_exit(struct net *net)
4261
{
4262
wext_proc_exit(net);
4263
4264
proc_net_remove(net, "ptype");
4265
proc_net_remove(net, "softnet_stat");
4266
proc_net_remove(net, "dev");
4267
}
4268
4269
static struct pernet_operations __net_initdata dev_proc_ops = {
4270
.init = dev_proc_net_init,
4271
.exit = dev_proc_net_exit,
4272
};
4273
4274
static int __init dev_proc_init(void)
4275
{
4276
return register_pernet_subsys(&dev_proc_ops);
4277
}
4278
#else
4279
#define dev_proc_init() 0
4280
#endif /* CONFIG_PROC_FS */
4281
4282
4283
/**
4284
* netdev_set_master - set up master pointer
4285
* @slave: slave device
4286
* @master: new master device
4287
*
4288
* Changes the master device of the slave. Pass %NULL to break the
4289
* bonding. The caller must hold the RTNL semaphore. On a failure
4290
* a negative errno code is returned. On success the reference counts
4291
* are adjusted and the function returns zero.
4292
*/
4293
int netdev_set_master(struct net_device *slave, struct net_device *master)
4294
{
4295
struct net_device *old = slave->master;
4296
4297
ASSERT_RTNL();
4298
4299
if (master) {
4300
if (old)
4301
return -EBUSY;
4302
dev_hold(master);
4303
}
4304
4305
slave->master = master;
4306
4307
if (old)
4308
dev_put(old);
4309
return 0;
4310
}
4311
EXPORT_SYMBOL(netdev_set_master);
4312
4313
/**
4314
* netdev_set_bond_master - set up bonding master/slave pair
4315
* @slave: slave device
4316
* @master: new master device
4317
*
4318
* Changes the master device of the slave. Pass %NULL to break the
4319
* bonding. The caller must hold the RTNL semaphore. On a failure
4320
* a negative errno code is returned. On success %RTM_NEWLINK is sent
4321
* to the routing socket and the function returns zero.
4322
*/
4323
int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4324
{
4325
int err;
4326
4327
ASSERT_RTNL();
4328
4329
err = netdev_set_master(slave, master);
4330
if (err)
4331
return err;
4332
if (master)
4333
slave->flags |= IFF_SLAVE;
4334
else
4335
slave->flags &= ~IFF_SLAVE;
4336
4337
rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4338
return 0;
4339
}
4340
EXPORT_SYMBOL(netdev_set_bond_master);
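/*
 * Example (illustrative sketch, not part of dev.c): how a bonding-style
 * driver might use netdev_set_bond_master() above.  The function names and
 * the surrounding driver context are assumptions; only the RTNL requirement
 * and the calls themselves mirror the API documented above.
 */
#if 0
static int example_enslave(struct net_device *bond_dev,
                           struct net_device *slave_dev)
{
        int err;

        ASSERT_RTNL();          /* caller must hold the RTNL semaphore */

        err = netdev_set_bond_master(slave_dev, bond_dev);
        if (err)
                return err;     /* e.g. -EBUSY if the slave already has a master */

        /* ... driver-specific programming of the slave would go here ... */
        return 0;
}

static void example_release(struct net_device *slave_dev)
{
        ASSERT_RTNL();
        netdev_set_bond_master(slave_dev, NULL);        /* break the bond */
}
#endif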
4341
4342
static void dev_change_rx_flags(struct net_device *dev, int flags)
4343
{
4344
const struct net_device_ops *ops = dev->netdev_ops;
4345
4346
if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4347
ops->ndo_change_rx_flags(dev, flags);
4348
}
4349
4350
static int __dev_set_promiscuity(struct net_device *dev, int inc)
4351
{
4352
unsigned short old_flags = dev->flags;
4353
uid_t uid;
4354
gid_t gid;
4355
4356
ASSERT_RTNL();
4357
4358
dev->flags |= IFF_PROMISC;
4359
dev->promiscuity += inc;
4360
if (dev->promiscuity == 0) {
4361
/*
4362
* Avoid overflow.
4363
* If inc causes overflow, untouch promisc and return error.
4364
*/
4365
if (inc < 0)
4366
dev->flags &= ~IFF_PROMISC;
4367
else {
4368
dev->promiscuity -= inc;
4369
printk(KERN_WARNING "%s: promiscuity touches roof, "
4370
"set promiscuity failed, promiscuity feature "
4371
"of device might be broken.\n", dev->name);
4372
return -EOVERFLOW;
4373
}
4374
}
4375
if (dev->flags != old_flags) {
4376
printk(KERN_INFO "device %s %s promiscuous mode\n",
4377
dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4378
"left");
4379
if (audit_enabled) {
4380
current_uid_gid(&uid, &gid);
4381
audit_log(current->audit_context, GFP_ATOMIC,
4382
AUDIT_ANOM_PROMISCUOUS,
4383
"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4384
dev->name, (dev->flags & IFF_PROMISC),
4385
(old_flags & IFF_PROMISC),
4386
audit_get_loginuid(current),
4387
uid, gid,
4388
audit_get_sessionid(current));
4389
}
4390
4391
dev_change_rx_flags(dev, IFF_PROMISC);
4392
}
4393
return 0;
4394
}
4395
4396
/**
4397
* dev_set_promiscuity - update promiscuity count on a device
4398
* @dev: device
4399
* @inc: modifier
4400
*
4401
* Add or remove promiscuity from a device. While the count in the device
4402
* remains above zero the interface remains promiscuous. Once it hits zero
4403
* the device reverts back to normal filtering operation. A negative inc
4404
* value is used to drop promiscuity on the device.
4405
* Return 0 if successful or a negative errno code on error.
4406
*/
4407
int dev_set_promiscuity(struct net_device *dev, int inc)
4408
{
4409
unsigned short old_flags = dev->flags;
4410
int err;
4411
4412
err = __dev_set_promiscuity(dev, inc);
4413
if (err < 0)
4414
return err;
4415
if (dev->flags != old_flags)
4416
dev_set_rx_mode(dev);
4417
return err;
4418
}
4419
EXPORT_SYMBOL(dev_set_promiscuity);
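/*
 * Example (illustrative sketch, not part of dev.c): the refcounted use of
 * dev_set_promiscuity() described above, as a packet-capture style module
 * might do it.  The function names are assumptions; the +1/-1 pairing and
 * the RTNL requirement are the points being illustrated.
 */
#if 0
static int example_capture_start(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);      /* bump the promiscuity count */
        rtnl_unlock();
        return err;
}

static void example_capture_stop(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);   /* drop our reference; the device leaves
                                         * promiscuous mode when the count hits zero */
        rtnl_unlock();
}
#endif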
4420
4421
/**
4422
* dev_set_allmulti - update allmulti count on a device
4423
* @dev: device
4424
* @inc: modifier
4425
*
4426
* Add or remove reception of all multicast frames to a device. While the
4427
* count in the device remains above zero the interface remains listening
4428
* to all interfaces. Once it hits zero the device reverts back to normal
4429
* filtering operation. A negative @inc value is used to drop the counter
4430
* when releasing a resource needing all multicasts.
4431
* Return 0 if successful or a negative errno code on error.
4432
*/
4433
4434
int dev_set_allmulti(struct net_device *dev, int inc)
4435
{
4436
unsigned short old_flags = dev->flags;
4437
4438
ASSERT_RTNL();
4439
4440
dev->flags |= IFF_ALLMULTI;
4441
dev->allmulti += inc;
4442
if (dev->allmulti == 0) {
4443
/*
4444
* Avoid overflow.
4445
* If inc causes overflow, untouch allmulti and return error.
4446
*/
4447
if (inc < 0)
4448
dev->flags &= ~IFF_ALLMULTI;
4449
else {
4450
dev->allmulti -= inc;
4451
printk(KERN_WARNING "%s: allmulti touches roof, "
4452
"set allmulti failed, allmulti feature of "
4453
"device might be broken.\n", dev->name);
4454
return -EOVERFLOW;
4455
}
4456
}
4457
if (dev->flags ^ old_flags) {
4458
dev_change_rx_flags(dev, IFF_ALLMULTI);
4459
dev_set_rx_mode(dev);
4460
}
4461
return 0;
4462
}
4463
EXPORT_SYMBOL(dev_set_allmulti);
4464
4465
/*
4466
* Upload unicast and multicast address lists to device and
4467
* configure RX filtering. When the device doesn't support unicast
4468
* filtering it is put in promiscuous mode while unicast addresses
4469
* are present.
4470
*/
4471
void __dev_set_rx_mode(struct net_device *dev)
4472
{
4473
const struct net_device_ops *ops = dev->netdev_ops;
4474
4475
/* dev_open will call this function so the list will stay sane. */
4476
if (!(dev->flags&IFF_UP))
4477
return;
4478
4479
if (!netif_device_present(dev))
4480
return;
4481
4482
if (ops->ndo_set_rx_mode)
4483
ops->ndo_set_rx_mode(dev);
4484
else {
4485
/* Unicast addresses changes may only happen under the rtnl,
4486
* therefore calling __dev_set_promiscuity here is safe.
4487
*/
4488
if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4489
__dev_set_promiscuity(dev, 1);
4490
dev->uc_promisc = 1;
4491
} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4492
__dev_set_promiscuity(dev, -1);
4493
dev->uc_promisc = 0;
4494
}
4495
4496
if (ops->ndo_set_multicast_list)
4497
ops->ndo_set_multicast_list(dev);
4498
}
4499
}
4500
4501
void dev_set_rx_mode(struct net_device *dev)
4502
{
4503
netif_addr_lock_bh(dev);
4504
__dev_set_rx_mode(dev);
4505
netif_addr_unlock_bh(dev);
4506
}
4507
4508
/**
4509
* dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4510
* @dev: device
4511
* @cmd: memory area for ethtool_ops::get_settings() result
4512
*
4513
* The cmd arg is initialized properly (cleared and
4514
* ethtool_cmd::cmd field set to ETHTOOL_GSET).
4515
*
4516
* Return device's ethtool_ops::get_settings() result value or
4517
* -EOPNOTSUPP when device doesn't expose
4518
* ethtool_ops::get_settings() operation.
4519
*/
4520
int dev_ethtool_get_settings(struct net_device *dev,
4521
struct ethtool_cmd *cmd)
4522
{
4523
if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4524
return -EOPNOTSUPP;
4525
4526
memset(cmd, 0, sizeof(struct ethtool_cmd));
4527
cmd->cmd = ETHTOOL_GSET;
4528
return dev->ethtool_ops->get_settings(dev, cmd);
4529
}
4530
EXPORT_SYMBOL(dev_ethtool_get_settings);
4531
4532
/**
4533
* dev_get_flags - get flags reported to userspace
4534
* @dev: device
4535
*
4536
* Get the combination of flag bits exported through APIs to userspace.
4537
*/
4538
unsigned dev_get_flags(const struct net_device *dev)
4539
{
4540
unsigned flags;
4541
4542
flags = (dev->flags & ~(IFF_PROMISC |
4543
IFF_ALLMULTI |
4544
IFF_RUNNING |
4545
IFF_LOWER_UP |
4546
IFF_DORMANT)) |
4547
(dev->gflags & (IFF_PROMISC |
4548
IFF_ALLMULTI));
4549
4550
if (netif_running(dev)) {
4551
if (netif_oper_up(dev))
4552
flags |= IFF_RUNNING;
4553
if (netif_carrier_ok(dev))
4554
flags |= IFF_LOWER_UP;
4555
if (netif_dormant(dev))
4556
flags |= IFF_DORMANT;
4557
}
4558
4559
return flags;
4560
}
4561
EXPORT_SYMBOL(dev_get_flags);
4562
4563
int __dev_change_flags(struct net_device *dev, unsigned int flags)
4564
{
4565
int old_flags = dev->flags;
4566
int ret;
4567
4568
ASSERT_RTNL();
4569
4570
/*
4571
* Set the flags on our device.
4572
*/
4573
4574
dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4575
IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4576
IFF_AUTOMEDIA)) |
4577
(dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4578
IFF_ALLMULTI));
4579
4580
/*
4581
* Load in the correct multicast list now the flags have changed.
4582
*/
4583
4584
if ((old_flags ^ flags) & IFF_MULTICAST)
4585
dev_change_rx_flags(dev, IFF_MULTICAST);
4586
4587
dev_set_rx_mode(dev);
4588
4589
/*
4590
* Have we downed the interface? We handle IFF_UP ourselves
4591
* according to user attempts to set it, rather than blindly
4592
* setting it.
4593
*/
4594
4595
ret = 0;
4596
if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4597
ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4598
4599
if (!ret)
4600
dev_set_rx_mode(dev);
4601
}
4602
4603
if ((flags ^ dev->gflags) & IFF_PROMISC) {
4604
int inc = (flags & IFF_PROMISC) ? 1 : -1;
4605
4606
dev->gflags ^= IFF_PROMISC;
4607
dev_set_promiscuity(dev, inc);
4608
}
4609
4610
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4611
is important. Some (broken) drivers set IFF_PROMISC, when
4612
IFF_ALLMULTI is requested, without asking us and without reporting.
4613
*/
4614
if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4615
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4616
4617
dev->gflags ^= IFF_ALLMULTI;
4618
dev_set_allmulti(dev, inc);
4619
}
4620
4621
return ret;
4622
}
4623
4624
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4625
{
4626
unsigned int changes = dev->flags ^ old_flags;
4627
4628
if (changes & IFF_UP) {
4629
if (dev->flags & IFF_UP)
4630
call_netdevice_notifiers(NETDEV_UP, dev);
4631
else
4632
call_netdevice_notifiers(NETDEV_DOWN, dev);
4633
}
4634
4635
if (dev->flags & IFF_UP &&
4636
(changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4637
call_netdevice_notifiers(NETDEV_CHANGE, dev);
4638
}
4639
4640
/**
4641
* dev_change_flags - change device settings
4642
* @dev: device
4643
* @flags: device state flags
4644
*
4645
* Change settings on a device based on state flags. The flags are
4646
* in the userspace exported format.
4647
*/
4648
int dev_change_flags(struct net_device *dev, unsigned flags)
4649
{
4650
int ret, changes;
4651
int old_flags = dev->flags;
4652
4653
ret = __dev_change_flags(dev, flags);
4654
if (ret < 0)
4655
return ret;
4656
4657
changes = old_flags ^ dev->flags;
4658
if (changes)
4659
rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4660
4661
__dev_notify_flags(dev, old_flags);
4662
return ret;
4663
}
4664
EXPORT_SYMBOL(dev_change_flags);
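/*
 * Example (illustrative sketch, not part of dev.c): bringing an interface
 * administratively up with dev_change_flags(), which takes flags in the
 * userspace-exported format.  Looking the device up by name and the helper
 * name are assumptions made for the example only.
 */
#if 0
static int example_bring_up(struct net *net, const char *name)
{
        struct net_device *dev;
        int err = -ENODEV;

        rtnl_lock();
        dev = __dev_get_by_name(net, name);     /* e.g. "eth0" */
        if (dev)
                err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
        rtnl_unlock();
        return err;
}
#endif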
4665
4666
/**
4667
* dev_set_mtu - Change maximum transfer unit
4668
* @dev: device
4669
* @new_mtu: new transfer unit
4670
*
4671
* Change the maximum transfer size of the network device.
4672
*/
4673
int dev_set_mtu(struct net_device *dev, int new_mtu)
4674
{
4675
const struct net_device_ops *ops = dev->netdev_ops;
4676
int err;
4677
4678
if (new_mtu == dev->mtu)
4679
return 0;
4680
4681
/* MTU must be positive. */
4682
if (new_mtu < 0)
4683
return -EINVAL;
4684
4685
if (!netif_device_present(dev))
4686
return -ENODEV;
4687
4688
err = 0;
4689
if (ops->ndo_change_mtu)
4690
err = ops->ndo_change_mtu(dev, new_mtu);
4691
else
4692
dev->mtu = new_mtu;
4693
4694
if (!err && dev->flags & IFF_UP)
4695
call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4696
return err;
4697
}
4698
EXPORT_SYMBOL(dev_set_mtu);
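/*
 * Example (illustrative sketch, not part of dev.c): changing the MTU from
 * kernel code via dev_set_mtu() above.  The jumbo-frame value 9000 and the
 * helper name are assumptions; the driver may still reject the value through
 * its ndo_change_mtu() callback.
 */
#if 0
static int example_enable_jumbo(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, 9000);   /* NETDEV_CHANGEMTU is sent on success */
        rtnl_unlock();
        return err;
}
#endif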
4699
4700
/**
4701
* dev_set_group - Change group this device belongs to
4702
* @dev: device
4703
* @new_group: group this device should belong to
4704
*/
4705
void dev_set_group(struct net_device *dev, int new_group)
4706
{
4707
dev->group = new_group;
4708
}
4709
EXPORT_SYMBOL(dev_set_group);
4710
4711
/**
4712
* dev_set_mac_address - Change Media Access Control Address
4713
* @dev: device
4714
* @sa: new address
4715
*
4716
* Change the hardware (MAC) address of the device
4717
*/
4718
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4719
{
4720
const struct net_device_ops *ops = dev->netdev_ops;
4721
int err;
4722
4723
if (!ops->ndo_set_mac_address)
4724
return -EOPNOTSUPP;
4725
if (sa->sa_family != dev->type)
4726
return -EINVAL;
4727
if (!netif_device_present(dev))
4728
return -ENODEV;
4729
err = ops->ndo_set_mac_address(dev, sa);
4730
if (!err)
4731
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4732
return err;
4733
}
4734
EXPORT_SYMBOL(dev_set_mac_address);
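/*
 * Example (illustrative sketch, not part of dev.c): setting a MAC address
 * through dev_set_mac_address() above.  The address bytes are an arbitrary
 * locally administered example; sa_family must match dev->type or the call
 * returns -EINVAL, as the code above shows.
 */
#if 0
static int example_set_mac(struct net_device *dev)
{
        static const u8 addr[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
        struct sockaddr sa;
        int err;

        sa.sa_family = dev->type;               /* must equal dev->type */
        memcpy(sa.sa_data, addr, ETH_ALEN);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);    /* NETDEV_CHANGEADDR on success */
        rtnl_unlock();
        return err;
}
#endif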
4735
4736
/*
4737
* Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4738
*/
4739
static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4740
{
4741
int err;
4742
struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4743
4744
if (!dev)
4745
return -ENODEV;
4746
4747
switch (cmd) {
4748
case SIOCGIFFLAGS: /* Get interface flags */
4749
ifr->ifr_flags = (short) dev_get_flags(dev);
4750
return 0;
4751
4752
case SIOCGIFMETRIC: /* Get the metric on the interface
4753
(currently unused) */
4754
ifr->ifr_metric = 0;
4755
return 0;
4756
4757
case SIOCGIFMTU: /* Get the MTU of a device */
4758
ifr->ifr_mtu = dev->mtu;
4759
return 0;
4760
4761
case SIOCGIFHWADDR:
4762
if (!dev->addr_len)
4763
memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4764
else
4765
memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4766
min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4767
ifr->ifr_hwaddr.sa_family = dev->type;
4768
return 0;
4769
4770
case SIOCGIFSLAVE:
4771
err = -EINVAL;
4772
break;
4773
4774
case SIOCGIFMAP:
4775
ifr->ifr_map.mem_start = dev->mem_start;
4776
ifr->ifr_map.mem_end = dev->mem_end;
4777
ifr->ifr_map.base_addr = dev->base_addr;
4778
ifr->ifr_map.irq = dev->irq;
4779
ifr->ifr_map.dma = dev->dma;
4780
ifr->ifr_map.port = dev->if_port;
4781
return 0;
4782
4783
case SIOCGIFINDEX:
4784
ifr->ifr_ifindex = dev->ifindex;
4785
return 0;
4786
4787
case SIOCGIFTXQLEN:
4788
ifr->ifr_qlen = dev->tx_queue_len;
4789
return 0;
4790
4791
default:
4792
/* dev_ioctl() should ensure this case
4793
* is never reached
4794
*/
4795
WARN_ON(1);
4796
err = -ENOTTY;
4797
break;
4798
4799
}
4800
return err;
4801
}
4802
4803
/*
4804
* Perform the SIOCxIFxxx calls, inside rtnl_lock()
4805
*/
4806
static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4807
{
4808
int err;
4809
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4810
const struct net_device_ops *ops;
4811
4812
if (!dev)
4813
return -ENODEV;
4814
4815
ops = dev->netdev_ops;
4816
4817
switch (cmd) {
4818
case SIOCSIFFLAGS: /* Set interface flags */
4819
return dev_change_flags(dev, ifr->ifr_flags);
4820
4821
case SIOCSIFMETRIC: /* Set the metric on the interface
4822
(currently unused) */
4823
return -EOPNOTSUPP;
4824
4825
case SIOCSIFMTU: /* Set the MTU of a device */
4826
return dev_set_mtu(dev, ifr->ifr_mtu);
4827
4828
case SIOCSIFHWADDR:
4829
return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4830
4831
case SIOCSIFHWBROADCAST:
4832
if (ifr->ifr_hwaddr.sa_family != dev->type)
4833
return -EINVAL;
4834
memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4835
min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4836
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4837
return 0;
4838
4839
case SIOCSIFMAP:
4840
if (ops->ndo_set_config) {
4841
if (!netif_device_present(dev))
4842
return -ENODEV;
4843
return ops->ndo_set_config(dev, &ifr->ifr_map);
4844
}
4845
return -EOPNOTSUPP;
4846
4847
case SIOCADDMULTI:
4848
if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4849
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4850
return -EINVAL;
4851
if (!netif_device_present(dev))
4852
return -ENODEV;
4853
return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4854
4855
case SIOCDELMULTI:
4856
if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4857
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4858
return -EINVAL;
4859
if (!netif_device_present(dev))
4860
return -ENODEV;
4861
return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4862
4863
case SIOCSIFTXQLEN:
4864
if (ifr->ifr_qlen < 0)
4865
return -EINVAL;
4866
dev->tx_queue_len = ifr->ifr_qlen;
4867
return 0;
4868
4869
case SIOCSIFNAME:
4870
ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4871
return dev_change_name(dev, ifr->ifr_newname);
4872
4873
/*
4874
* Unknown or private ioctl
4875
*/
4876
default:
4877
if ((cmd >= SIOCDEVPRIVATE &&
4878
cmd <= SIOCDEVPRIVATE + 15) ||
4879
cmd == SIOCBONDENSLAVE ||
4880
cmd == SIOCBONDRELEASE ||
4881
cmd == SIOCBONDSETHWADDR ||
4882
cmd == SIOCBONDSLAVEINFOQUERY ||
4883
cmd == SIOCBONDINFOQUERY ||
4884
cmd == SIOCBONDCHANGEACTIVE ||
4885
cmd == SIOCGMIIPHY ||
4886
cmd == SIOCGMIIREG ||
4887
cmd == SIOCSMIIREG ||
4888
cmd == SIOCBRADDIF ||
4889
cmd == SIOCBRDELIF ||
4890
cmd == SIOCSHWTSTAMP ||
4891
cmd == SIOCWANDEV) {
4892
err = -EOPNOTSUPP;
4893
if (ops->ndo_do_ioctl) {
4894
if (netif_device_present(dev))
4895
err = ops->ndo_do_ioctl(dev, ifr, cmd);
4896
else
4897
err = -ENODEV;
4898
}
4899
} else
4900
err = -EINVAL;
4901
4902
}
4903
return err;
4904
}
4905
4906
/*
4907
* This function handles all "interface"-type I/O control requests. The actual
4908
* 'doing' part of this is dev_ifsioc above.
4909
*/
4910
4911
/**
4912
* dev_ioctl - network device ioctl
4913
* @net: the applicable net namespace
4914
* @cmd: command to issue
4915
* @arg: pointer to a struct ifreq in user space
4916
*
4917
* Issue ioctl functions to devices. This is normally called by the
4918
* user space syscall interfaces but can sometimes be useful for
4919
* other purposes. The return value is the return from the syscall if
4920
* positive or a negative errno code on error.
4921
*/
4922
4923
int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4924
{
4925
struct ifreq ifr;
4926
int ret;
4927
char *colon;
4928
4929
/* One special case: SIOCGIFCONF takes ifconf argument
4930
and requires a shared lock, because it sleeps while writing
4931
to user space.
4932
*/
4933
4934
if (cmd == SIOCGIFCONF) {
4935
rtnl_lock();
4936
ret = dev_ifconf(net, (char __user *) arg);
4937
rtnl_unlock();
4938
return ret;
4939
}
4940
if (cmd == SIOCGIFNAME)
4941
return dev_ifname(net, (struct ifreq __user *)arg);
4942
4943
if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4944
return -EFAULT;
4945
4946
ifr.ifr_name[IFNAMSIZ-1] = 0;
4947
4948
colon = strchr(ifr.ifr_name, ':');
4949
if (colon)
4950
*colon = 0;
4951
4952
/*
4953
* See which interface the caller is talking about.
4954
*/
4955
4956
switch (cmd) {
4957
/*
4958
* These ioctl calls:
4959
* - can be done by all.
4960
* - atomic and do not require locking.
4961
* - return a value
4962
*/
4963
case SIOCGIFFLAGS:
4964
case SIOCGIFMETRIC:
4965
case SIOCGIFMTU:
4966
case SIOCGIFHWADDR:
4967
case SIOCGIFSLAVE:
4968
case SIOCGIFMAP:
4969
case SIOCGIFINDEX:
4970
case SIOCGIFTXQLEN:
4971
dev_load(net, ifr.ifr_name);
4972
rcu_read_lock();
4973
ret = dev_ifsioc_locked(net, &ifr, cmd);
4974
rcu_read_unlock();
4975
if (!ret) {
4976
if (colon)
4977
*colon = ':';
4978
if (copy_to_user(arg, &ifr,
4979
sizeof(struct ifreq)))
4980
ret = -EFAULT;
4981
}
4982
return ret;
4983
4984
case SIOCETHTOOL:
4985
dev_load(net, ifr.ifr_name);
4986
rtnl_lock();
4987
ret = dev_ethtool(net, &ifr);
4988
rtnl_unlock();
4989
if (!ret) {
4990
if (colon)
4991
*colon = ':';
4992
if (copy_to_user(arg, &ifr,
4993
sizeof(struct ifreq)))
4994
ret = -EFAULT;
4995
}
4996
return ret;
4997
4998
/*
4999
* These ioctl calls:
5000
* - require superuser power.
5001
* - require strict serialization.
5002
* - return a value
5003
*/
5004
case SIOCGMIIPHY:
5005
case SIOCGMIIREG:
5006
case SIOCSIFNAME:
5007
if (!capable(CAP_NET_ADMIN))
5008
return -EPERM;
5009
dev_load(net, ifr.ifr_name);
5010
rtnl_lock();
5011
ret = dev_ifsioc(net, &ifr, cmd);
5012
rtnl_unlock();
5013
if (!ret) {
5014
if (colon)
5015
*colon = ':';
5016
if (copy_to_user(arg, &ifr,
5017
sizeof(struct ifreq)))
5018
ret = -EFAULT;
5019
}
5020
return ret;
5021
5022
/*
5023
* These ioctl calls:
5024
* - require superuser power.
5025
* - require strict serialization.
5026
* - do not return a value
5027
*/
5028
case SIOCSIFFLAGS:
5029
case SIOCSIFMETRIC:
5030
case SIOCSIFMTU:
5031
case SIOCSIFMAP:
5032
case SIOCSIFHWADDR:
5033
case SIOCSIFSLAVE:
5034
case SIOCADDMULTI:
5035
case SIOCDELMULTI:
5036
case SIOCSIFHWBROADCAST:
5037
case SIOCSIFTXQLEN:
5038
case SIOCSMIIREG:
5039
case SIOCBONDENSLAVE:
5040
case SIOCBONDRELEASE:
5041
case SIOCBONDSETHWADDR:
5042
case SIOCBONDCHANGEACTIVE:
5043
case SIOCBRADDIF:
5044
case SIOCBRDELIF:
5045
case SIOCSHWTSTAMP:
5046
if (!capable(CAP_NET_ADMIN))
5047
return -EPERM;
5048
/* fall through */
5049
case SIOCBONDSLAVEINFOQUERY:
5050
case SIOCBONDINFOQUERY:
5051
dev_load(net, ifr.ifr_name);
5052
rtnl_lock();
5053
ret = dev_ifsioc(net, &ifr, cmd);
5054
rtnl_unlock();
5055
return ret;
5056
5057
case SIOCGIFMEM:
5058
/* Get the per device memory space. We can add this but
5059
* currently do not support it */
5060
case SIOCSIFMEM:
5061
/* Set the per device memory buffer space.
5062
* Not applicable in our case */
5063
case SIOCSIFLINK:
5064
return -ENOTTY;
5065
5066
/*
5067
* Unknown or private ioctl.
5068
*/
5069
default:
5070
if (cmd == SIOCWANDEV ||
5071
(cmd >= SIOCDEVPRIVATE &&
5072
cmd <= SIOCDEVPRIVATE + 15)) {
5073
dev_load(net, ifr.ifr_name);
5074
rtnl_lock();
5075
ret = dev_ifsioc(net, &ifr, cmd);
5076
rtnl_unlock();
5077
if (!ret && copy_to_user(arg, &ifr,
5078
sizeof(struct ifreq)))
5079
ret = -EFAULT;
5080
return ret;
5081
}
5082
/* Take care of Wireless Extensions */
5083
if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5084
return wext_handle_ioctl(net, &ifr, cmd, arg);
5085
return -ENOTTY;
5086
}
5087
}
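/*
 * Example (illustrative sketch, not part of dev.c): a userspace use of the
 * SIOCGIFMTU path handled by dev_ioctl()/dev_ifsioc_locked() above.  The
 * interface name "eth0" is an assumption for the example.
 */
#if 0   /* userspace example, kept out of the kernel build */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int main(void)
{
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);    /* assumed device name */

        if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)   /* read path: no CAP_NET_ADMIN needed */
                printf("%s mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);

        close(fd);
        return 0;
}
#endif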
5088
5089
5090
/**
5091
* dev_new_index - allocate an ifindex
5092
* @net: the applicable net namespace
5093
*
5094
* Returns a suitable unique value for a new device interface
5095
* number. The caller must hold the rtnl semaphore or the
5096
* dev_base_lock to be sure it remains unique.
5097
*/
5098
static int dev_new_index(struct net *net)
5099
{
5100
static int ifindex;
5101
for (;;) {
5102
if (++ifindex <= 0)
5103
ifindex = 1;
5104
if (!__dev_get_by_index(net, ifindex))
5105
return ifindex;
5106
}
5107
}
5108
5109
/* Delayed registration/unregisteration */
5110
static LIST_HEAD(net_todo_list);
5111
5112
static void net_set_todo(struct net_device *dev)
5113
{
5114
list_add_tail(&dev->todo_list, &net_todo_list);
5115
}
5116
5117
static void rollback_registered_many(struct list_head *head)
5118
{
5119
struct net_device *dev, *tmp;
5120
5121
BUG_ON(dev_boot_phase);
5122
ASSERT_RTNL();
5123
5124
list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5125
/* Some devices call without registering
5126
* for initialization unwind. Remove those
5127
* devices and proceed with the remaining.
5128
*/
5129
if (dev->reg_state == NETREG_UNINITIALIZED) {
5130
pr_debug("unregister_netdevice: device %s/%p never "
5131
"was registered\n", dev->name, dev);
5132
5133
WARN_ON(1);
5134
list_del(&dev->unreg_list);
5135
continue;
5136
}
5137
dev->dismantle = true;
5138
BUG_ON(dev->reg_state != NETREG_REGISTERED);
5139
}
5140
5141
/* If device is running, close it first. */
5142
dev_close_many(head);
5143
5144
list_for_each_entry(dev, head, unreg_list) {
5145
/* And unlink it from device chain. */
5146
unlist_netdevice(dev);
5147
5148
dev->reg_state = NETREG_UNREGISTERING;
5149
}
5150
5151
synchronize_net();
5152
5153
list_for_each_entry(dev, head, unreg_list) {
5154
/* Shutdown queueing discipline. */
5155
dev_shutdown(dev);
5156
5157
5158
/* Notify protocols, that we are about to destroy
5159
this device. They should clean all the things.
5160
*/
5161
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5162
5163
if (!dev->rtnl_link_ops ||
5164
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5165
rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5166
5167
/*
5168
* Flush the unicast and multicast chains
5169
*/
5170
dev_uc_flush(dev);
5171
dev_mc_flush(dev);
5172
5173
if (dev->netdev_ops->ndo_uninit)
5174
dev->netdev_ops->ndo_uninit(dev);
5175
5176
/* Notifier chain MUST detach us from master device. */
5177
WARN_ON(dev->master);
5178
5179
/* Remove entries from kobject tree */
5180
netdev_unregister_kobject(dev);
5181
}
5182
5183
/* Process any work delayed until the end of the batch */
5184
dev = list_first_entry(head, struct net_device, unreg_list);
5185
call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5186
5187
rcu_barrier();
5188
5189
list_for_each_entry(dev, head, unreg_list)
5190
dev_put(dev);
5191
}
5192
5193
static void rollback_registered(struct net_device *dev)
5194
{
5195
LIST_HEAD(single);
5196
5197
list_add(&dev->unreg_list, &single);
5198
rollback_registered_many(&single);
5199
list_del(&single);
5200
}
5201
5202
u32 netdev_fix_features(struct net_device *dev, u32 features)
5203
{
5204
/* Fix illegal checksum combinations */
5205
if ((features & NETIF_F_HW_CSUM) &&
5206
(features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5207
netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5208
features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5209
}
5210
5211
if ((features & NETIF_F_NO_CSUM) &&
5212
(features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5213
netdev_warn(dev, "mixed no checksumming and other settings.\n");
5214
features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5215
}
5216
5217
/* Fix illegal SG+CSUM combinations. */
5218
if ((features & NETIF_F_SG) &&
5219
!(features & NETIF_F_ALL_CSUM)) {
5220
netdev_dbg(dev,
5221
"Dropping NETIF_F_SG since no checksum feature.\n");
5222
features &= ~NETIF_F_SG;
5223
}
5224
5225
/* TSO requires that SG is present as well. */
5226
if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5227
netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5228
features &= ~NETIF_F_ALL_TSO;
5229
}
5230
5231
/* TSO ECN requires that TSO is present as well. */
5232
if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5233
features &= ~NETIF_F_TSO_ECN;
5234
5235
/* Software GSO depends on SG. */
5236
if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5237
netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5238
features &= ~NETIF_F_GSO;
5239
}
5240
5241
/* UFO needs SG and checksumming */
5242
if (features & NETIF_F_UFO) {
5243
/* maybe split UFO into V4 and V6? */
5244
if (!((features & NETIF_F_GEN_CSUM) ||
5245
(features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5246
== (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5247
netdev_dbg(dev,
5248
"Dropping NETIF_F_UFO since no checksum offload features.\n");
5249
features &= ~NETIF_F_UFO;
5250
}
5251
5252
if (!(features & NETIF_F_SG)) {
5253
netdev_dbg(dev,
5254
"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5255
features &= ~NETIF_F_UFO;
5256
}
5257
}
5258
5259
return features;
5260
}
5261
EXPORT_SYMBOL(netdev_fix_features);
5262
5263
int __netdev_update_features(struct net_device *dev)
5264
{
5265
u32 features;
5266
int err = 0;
5267
5268
ASSERT_RTNL();
5269
5270
features = netdev_get_wanted_features(dev);
5271
5272
if (dev->netdev_ops->ndo_fix_features)
5273
features = dev->netdev_ops->ndo_fix_features(dev, features);
5274
5275
/* driver might be less strict about feature dependencies */
5276
features = netdev_fix_features(dev, features);
5277
5278
if (dev->features == features)
5279
return 0;
5280
5281
netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5282
dev->features, features);
5283
5284
if (dev->netdev_ops->ndo_set_features)
5285
err = dev->netdev_ops->ndo_set_features(dev, features);
5286
5287
if (unlikely(err < 0)) {
5288
netdev_err(dev,
5289
"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5290
err, features, dev->features);
5291
return -1;
5292
}
5293
5294
if (!err)
5295
dev->features = features;
5296
5297
return 1;
5298
}
5299
5300
/**
5301
* netdev_update_features - recalculate device features
5302
* @dev: the device to check
5303
*
5304
* Recalculate dev->features set and send notifications if it
5305
* has changed. Should be called after driver or hardware dependent
5306
* conditions might have changed that influence the features.
5307
*/
5308
void netdev_update_features(struct net_device *dev)
5309
{
5310
if (__netdev_update_features(dev))
5311
netdev_features_change(dev);
5312
}
5313
EXPORT_SYMBOL(netdev_update_features);
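/*
 * Example (illustrative sketch, not part of dev.c): when a driver would call
 * netdev_update_features() above -- after a hardware-dependent condition
 * changes (here, a hypothetical VLAN offload toggle), so that dev->features
 * is recomputed and NETDEV_FEAT_CHANGE is sent if anything changed.  The
 * private structure and helper name are assumptions.
 */
#if 0
struct example_priv {
        bool vlan_offload;      /* consulted by the driver's ndo_fix_features() */
};

static void example_vlan_offload_changed(struct net_device *dev, bool enabled)
{
        struct example_priv *priv = netdev_priv(dev);

        priv->vlan_offload = enabled;

        rtnl_lock();
        netdev_update_features(dev);    /* re-run the fix/set feature sequence */
        rtnl_unlock();
}
#endif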
5314
5315
/**
5316
* netdev_change_features - recalculate device features
5317
* @dev: the device to check
5318
*
5319
* Recalculate dev->features set and send notifications even
5320
* if they have not changed. Should be called instead of
5321
* netdev_update_features() if also dev->vlan_features might
5322
* have changed to allow the changes to be propagated to stacked
5323
* VLAN devices.
5324
*/
5325
void netdev_change_features(struct net_device *dev)
5326
{
5327
__netdev_update_features(dev);
5328
netdev_features_change(dev);
5329
}
5330
EXPORT_SYMBOL(netdev_change_features);
5331
5332
/**
5333
* netif_stacked_transfer_operstate - transfer operstate
5334
* @rootdev: the root or lower level device to transfer state from
5335
* @dev: the device to transfer operstate to
5336
*
5337
* Transfer operational state from root to device. This is normally
5338
* called when a stacking relationship exists between the root
5339
* device and the device (a leaf device).
5340
*/
5341
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5342
struct net_device *dev)
5343
{
5344
if (rootdev->operstate == IF_OPER_DORMANT)
5345
netif_dormant_on(dev);
5346
else
5347
netif_dormant_off(dev);
5348
5349
if (netif_carrier_ok(rootdev)) {
5350
if (!netif_carrier_ok(dev))
5351
netif_carrier_on(dev);
5352
} else {
5353
if (netif_carrier_ok(dev))
5354
netif_carrier_off(dev);
5355
}
5356
}
5357
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5358
5359
#ifdef CONFIG_RPS
5360
static int netif_alloc_rx_queues(struct net_device *dev)
5361
{
5362
unsigned int i, count = dev->num_rx_queues;
5363
struct netdev_rx_queue *rx;
5364
5365
BUG_ON(count < 1);
5366
5367
rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5368
if (!rx) {
5369
pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5370
return -ENOMEM;
5371
}
5372
dev->_rx = rx;
5373
5374
for (i = 0; i < count; i++)
5375
rx[i].dev = dev;
5376
return 0;
5377
}
5378
#endif
5379
5380
static void netdev_init_one_queue(struct net_device *dev,
5381
struct netdev_queue *queue, void *_unused)
5382
{
5383
/* Initialize queue lock */
5384
spin_lock_init(&queue->_xmit_lock);
5385
netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5386
queue->xmit_lock_owner = -1;
5387
netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5388
queue->dev = dev;
5389
}
5390
5391
static int netif_alloc_netdev_queues(struct net_device *dev)
5392
{
5393
unsigned int count = dev->num_tx_queues;
5394
struct netdev_queue *tx;
5395
5396
BUG_ON(count < 1);
5397
5398
tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5399
if (!tx) {
5400
pr_err("netdev: Unable to allocate %u tx queues.\n",
5401
count);
5402
return -ENOMEM;
5403
}
5404
dev->_tx = tx;
5405
5406
netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5407
spin_lock_init(&dev->tx_global_lock);
5408
5409
return 0;
5410
}
5411
5412
/**
5413
* register_netdevice - register a network device
5414
* @dev: device to register
5415
*
5416
* Take a completed network device structure and add it to the kernel
5417
* interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5418
* chain. 0 is returned on success. A negative errno code is returned
5419
* on a failure to set up the device, or if the name is a duplicate.
5420
*
5421
* Callers must hold the rtnl semaphore. You may want
5422
* register_netdev() instead of this.
5423
*
5424
* BUGS:
5425
* The locking appears insufficient to guarantee two parallel registers
5426
* will not get the same name.
5427
*/
5428
5429
int register_netdevice(struct net_device *dev)
5430
{
5431
int ret;
5432
struct net *net = dev_net(dev);
5433
5434
BUG_ON(dev_boot_phase);
5435
ASSERT_RTNL();
5436
5437
might_sleep();
5438
5439
/* When net_device's are persistent, this will be fatal. */
5440
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5441
BUG_ON(!net);
5442
5443
spin_lock_init(&dev->addr_list_lock);
5444
netdev_set_addr_lockdep_class(dev);
5445
5446
dev->iflink = -1;
5447
5448
ret = dev_get_valid_name(dev, dev->name);
5449
if (ret < 0)
5450
goto out;
5451
5452
/* Init, if this function is available */
5453
if (dev->netdev_ops->ndo_init) {
5454
ret = dev->netdev_ops->ndo_init(dev);
5455
if (ret) {
5456
if (ret > 0)
5457
ret = -EIO;
5458
goto out;
5459
}
5460
}
5461
5462
dev->ifindex = dev_new_index(net);
5463
if (dev->iflink == -1)
5464
dev->iflink = dev->ifindex;
5465
5466
/* Transfer changeable features to wanted_features and enable
5467
* software offloads (GSO and GRO).
5468
*/
5469
dev->hw_features |= NETIF_F_SOFT_FEATURES;
5470
dev->features |= NETIF_F_SOFT_FEATURES;
5471
dev->wanted_features = dev->features & dev->hw_features;
5472
5473
/* Turn on no cache copy if HW is doing checksum */
5474
dev->hw_features |= NETIF_F_NOCACHE_COPY;
5475
if ((dev->features & NETIF_F_ALL_CSUM) &&
5476
!(dev->features & NETIF_F_NO_CSUM)) {
5477
dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5478
dev->features |= NETIF_F_NOCACHE_COPY;
5479
}
5480
5481
/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5482
* vlan_dev_init() will do the dev->features check, so these features
5483
* are enabled only if supported by underlying device.
5484
*/
5485
dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5486
5487
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5488
ret = notifier_to_errno(ret);
5489
if (ret)
5490
goto err_uninit;
5491
5492
ret = netdev_register_kobject(dev);
5493
if (ret)
5494
goto err_uninit;
5495
dev->reg_state = NETREG_REGISTERED;
5496
5497
__netdev_update_features(dev);
5498
5499
/*
5500
* Default initial state at registry is that the
5501
* device is present.
5502
*/
5503
5504
set_bit(__LINK_STATE_PRESENT, &dev->state);
5505
5506
dev_init_scheduler(dev);
5507
dev_hold(dev);
5508
list_netdevice(dev);
5509
5510
/* Notify protocols, that a new device appeared. */
5511
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5512
ret = notifier_to_errno(ret);
5513
if (ret) {
5514
rollback_registered(dev);
5515
dev->reg_state = NETREG_UNREGISTERED;
5516
}
5517
/*
5518
* Prevent userspace races by waiting until the network
5519
* device is fully setup before sending notifications.
5520
*/
5521
if (!dev->rtnl_link_ops ||
5522
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5523
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5524
5525
out:
5526
return ret;
5527
5528
err_uninit:
5529
if (dev->netdev_ops->ndo_uninit)
5530
dev->netdev_ops->ndo_uninit(dev);
5531
goto out;
5532
}
5533
EXPORT_SYMBOL(register_netdevice);
5534
5535
/**
5536
* init_dummy_netdev - init a dummy network device for NAPI
5537
* @dev: device to init
5538
*
5539
* This takes a network device structure and initializes the minimum
5540
* amount of fields so it can be used to schedule NAPI polls without
5541
* registering a full blown interface. This is to be used by drivers
5542
* that need to tie several hardware interfaces to a single NAPI
5543
* poll scheduler due to HW limitations.
5544
*/
5545
int init_dummy_netdev(struct net_device *dev)
5546
{
5547
/* Clear everything. Note we don't initialize spinlocks
5548
* as they aren't supposed to be taken by any of the
5549
* NAPI code and this dummy netdev is supposed to be
5550
* only ever used for NAPI polls
5551
*/
5552
memset(dev, 0, sizeof(struct net_device));
5553
5554
/* make sure we BUG if trying to hit standard
5555
* register/unregister code path
5556
*/
5557
dev->reg_state = NETREG_DUMMY;
5558
5559
/* NAPI wants this */
5560
INIT_LIST_HEAD(&dev->napi_list);
5561
5562
/* a dummy interface is started by default */
5563
set_bit(__LINK_STATE_PRESENT, &dev->state);
5564
set_bit(__LINK_STATE_START, &dev->state);
5565
5566
/* Note : We don't allocate pcpu_refcnt for dummy devices,
5567
* because users of this 'device' don't need to change
5568
* its refcount.
5569
*/
5570
5571
return 0;
5572
}
5573
EXPORT_SYMBOL_GPL(init_dummy_netdev);
5574
5575
5576
/**
5577
* register_netdev - register a network device
5578
* @dev: device to register
5579
*
5580
* Take a completed network device structure and add it to the kernel
5581
* interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5582
* chain. 0 is returned on success. A negative errno code is returned
5583
* on a failure to set up the device, or if the name is a duplicate.
5584
*
5585
* This is a wrapper around register_netdevice that takes the rtnl semaphore
5586
* and expands the device name if you passed a format string to
5587
* alloc_netdev.
5588
*/
5589
int register_netdev(struct net_device *dev)
5590
{
5591
int err;
5592
5593
rtnl_lock();
5594
err = register_netdevice(dev);
5595
rtnl_unlock();
5596
return err;
5597
}
5598
EXPORT_SYMBOL(register_netdev);
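/*
 * Example (illustrative sketch, not part of dev.c): the usual driver-side
 * lifecycle around register_netdev()/unregister_netdev() above -- allocate
 * with alloc_etherdev(), register, and tear down in the reverse order.  The
 * probe/remove names and the private structure are assumptions kept minimal.
 */
#if 0
struct example_priv {
        int dummy;
};

static int example_probe(void)
{
        struct net_device *dev;
        int err;

        dev = alloc_etherdev(sizeof(struct example_priv));
        if (!dev)
                return -ENOMEM;

        /* ... set dev->netdev_ops, MAC address, features, etc. ... */

        err = register_netdev(dev);     /* takes the RTNL, expands "eth%d" */
        if (err) {
                free_netdev(dev);       /* safe: reg_state is still UNINITIALIZED */
                return err;
        }
        return 0;
}

static void example_remove(struct net_device *dev)
{
        unregister_netdev(dev);         /* returns after all references are gone */
        free_netdev(dev);
}
#endif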
5599
5600
int netdev_refcnt_read(const struct net_device *dev)
5601
{
5602
int i, refcnt = 0;
5603
5604
for_each_possible_cpu(i)
5605
refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5606
return refcnt;
5607
}
5608
EXPORT_SYMBOL(netdev_refcnt_read);
5609
5610
/*
5611
* netdev_wait_allrefs - wait until all references are gone.
5612
*
5613
* This is called when unregistering network devices.
5614
*
5615
* Any protocol or device that holds a reference should register
5616
* for netdevice notification, and cleanup and put back the
5617
* reference if they receive an UNREGISTER event.
5618
* We can get stuck here if buggy protocols don't correctly
5619
* call dev_put.
5620
*/
5621
static void netdev_wait_allrefs(struct net_device *dev)
5622
{
5623
unsigned long rebroadcast_time, warning_time;
5624
int refcnt;
5625
5626
linkwatch_forget_dev(dev);
5627
5628
rebroadcast_time = warning_time = jiffies;
5629
refcnt = netdev_refcnt_read(dev);
5630
5631
while (refcnt != 0) {
5632
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5633
rtnl_lock();
5634
5635
/* Rebroadcast unregister notification */
5636
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5637
/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5638
* should have already handled it the first time */
5639
5640
if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5641
&dev->state)) {
5642
/* We must not have linkwatch events
5643
* pending on unregister. If this
5644
* happens, we simply run the queue
5645
* unscheduled, resulting in a noop
5646
* for this device.
5647
*/
5648
linkwatch_run_queue();
5649
}
5650
5651
__rtnl_unlock();
5652
5653
rebroadcast_time = jiffies;
5654
}
5655
5656
msleep(250);
5657
5658
refcnt = netdev_refcnt_read(dev);
5659
5660
if (time_after(jiffies, warning_time + 10 * HZ)) {
5661
printk(KERN_EMERG "unregister_netdevice: "
5662
"waiting for %s to become free. Usage "
5663
"count = %d\n",
5664
dev->name, refcnt);
5665
warning_time = jiffies;
5666
}
5667
}
5668
}
5669
5670
/* The sequence is:
5671
*
5672
* rtnl_lock();
5673
* ...
5674
* register_netdevice(x1);
5675
* register_netdevice(x2);
5676
* ...
5677
* unregister_netdevice(y1);
5678
* unregister_netdevice(y2);
5679
* ...
5680
* rtnl_unlock();
5681
* free_netdev(y1);
5682
* free_netdev(y2);
5683
*
5684
* We are invoked by rtnl_unlock().
5685
* This allows us to deal with problems:
5686
* 1) We can delete sysfs objects which invoke hotplug
5687
* without deadlocking with linkwatch via keventd.
5688
* 2) Since we run with the RTNL semaphore not held, we can sleep
5689
* safely in order to wait for the netdev refcnt to drop to zero.
5690
*
5691
* We must not return until all unregister events added during
5692
* the interval the lock was held have been completed.
5693
*/
5694
void netdev_run_todo(void)
5695
{
5696
struct list_head list;
5697
5698
/* Snapshot list, allow later requests */
5699
list_replace_init(&net_todo_list, &list);
5700
5701
__rtnl_unlock();
5702
5703
while (!list_empty(&list)) {
5704
struct net_device *dev
5705
= list_first_entry(&list, struct net_device, todo_list);
5706
list_del(&dev->todo_list);
5707
5708
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5709
printk(KERN_ERR "network todo '%s' but state %d\n",
5710
dev->name, dev->reg_state);
5711
dump_stack();
5712
continue;
5713
}
5714
5715
dev->reg_state = NETREG_UNREGISTERED;
5716
5717
on_each_cpu(flush_backlog, dev, 1);
5718
5719
netdev_wait_allrefs(dev);
5720
5721
/* paranoia */
5722
BUG_ON(netdev_refcnt_read(dev));
5723
WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5724
WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5725
WARN_ON(dev->dn_ptr);
5726
5727
if (dev->destructor)
5728
dev->destructor(dev);
5729
5730
/* Free network device */
5731
kobject_put(&dev->dev.kobj);
5732
}
5733
}
5734
5735
/* Convert net_device_stats to rtnl_link_stats64. They have the same
5736
* fields in the same order, with only the type differing.
5737
*/
5738
static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5739
const struct net_device_stats *netdev_stats)
5740
{
5741
#if BITS_PER_LONG == 64
5742
BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5743
memcpy(stats64, netdev_stats, sizeof(*stats64));
5744
#else
5745
size_t i, n = sizeof(*stats64) / sizeof(u64);
5746
const unsigned long *src = (const unsigned long *)netdev_stats;
5747
u64 *dst = (u64 *)stats64;
5748
5749
BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5750
sizeof(*stats64) / sizeof(u64));
5751
for (i = 0; i < n; i++)
5752
dst[i] = src[i];
5753
#endif
5754
}
5755
5756
/**
5757
* dev_get_stats - get network device statistics
5758
* @dev: device to get statistics from
5759
* @storage: place to store stats
5760
*
5761
* Get network statistics from device. Return @storage.
5762
* The device driver may provide its own method by setting
5763
* dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5764
* otherwise the internal statistics structure is used.
5765
*/
5766
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5767
struct rtnl_link_stats64 *storage)
5768
{
5769
const struct net_device_ops *ops = dev->netdev_ops;
5770
5771
if (ops->ndo_get_stats64) {
5772
memset(storage, 0, sizeof(*storage));
5773
ops->ndo_get_stats64(dev, storage);
5774
} else if (ops->ndo_get_stats) {
5775
netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5776
} else {
5777
netdev_stats_to_stats64(storage, &dev->stats);
5778
}
5779
storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5780
return storage;
5781
}
5782
EXPORT_SYMBOL(dev_get_stats);
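/*
 * Example (illustrative sketch, not part of dev.c): the driver side of
 * dev_get_stats() above -- an ndo_get_stats64() implementation that fills
 * the caller-provided storage.  The private counters and structure layout
 * are assumptions; the signature matches the ops call made by dev_get_stats().
 */
#if 0
struct example_priv {
        u64 rx_packets;
        u64 rx_bytes;
        u64 tx_packets;
        u64 tx_bytes;
};

static struct rtnl_link_stats64 *example_get_stats64(struct net_device *dev,
                                                     struct rtnl_link_stats64 *storage)
{
        struct example_priv *priv = netdev_priv(dev);

        /* storage was zeroed by dev_get_stats() before this call */
        storage->rx_packets = priv->rx_packets;
        storage->rx_bytes   = priv->rx_bytes;
        storage->tx_packets = priv->tx_packets;
        storage->tx_bytes   = priv->tx_bytes;

        return storage;
}
#endif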
5783
5784
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5785
{
5786
struct netdev_queue *queue = dev_ingress_queue(dev);
5787
5788
#ifdef CONFIG_NET_CLS_ACT
5789
if (queue)
5790
return queue;
5791
queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5792
if (!queue)
5793
return NULL;
5794
netdev_init_one_queue(dev, queue, NULL);
5795
queue->qdisc = &noop_qdisc;
5796
queue->qdisc_sleeping = &noop_qdisc;
5797
rcu_assign_pointer(dev->ingress_queue, queue);
5798
#endif
5799
return queue;
5800
}
5801
5802
/**
5803
* alloc_netdev_mqs - allocate network device
5804
* @sizeof_priv: size of private data to allocate space for
5805
* @name: device name format string
5806
* @setup: callback to initialize device
5807
* @txqs: the number of TX subqueues to allocate
5808
* @rxqs: the number of RX subqueues to allocate
5809
*
5810
* Allocates a struct net_device with private data area for driver use
5811
* and performs basic initialization. Also allocates subqueue structs
5812
* for each queue on the device.
5813
*/
5814
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5815
void (*setup)(struct net_device *),
5816
unsigned int txqs, unsigned int rxqs)
5817
{
5818
struct net_device *dev;
5819
size_t alloc_size;
5820
struct net_device *p;
5821
5822
BUG_ON(strlen(name) >= sizeof(dev->name));
5823
5824
if (txqs < 1) {
5825
pr_err("alloc_netdev: Unable to allocate device "
5826
"with zero queues.\n");
5827
return NULL;
5828
}
5829
5830
#ifdef CONFIG_RPS
5831
if (rxqs < 1) {
5832
pr_err("alloc_netdev: Unable to allocate device "
5833
"with zero RX queues.\n");
5834
return NULL;
5835
}
5836
#endif
5837
5838
alloc_size = sizeof(struct net_device);
5839
if (sizeof_priv) {
5840
/* ensure 32-byte alignment of private area */
5841
alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5842
alloc_size += sizeof_priv;
5843
}
5844
/* ensure 32-byte alignment of whole construct */
5845
alloc_size += NETDEV_ALIGN - 1;
5846
5847
p = kzalloc(alloc_size, GFP_KERNEL);
5848
if (!p) {
5849
printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5850
return NULL;
5851
}
5852
5853
dev = PTR_ALIGN(p, NETDEV_ALIGN);
5854
dev->padded = (char *)dev - (char *)p;
5855
5856
dev->pcpu_refcnt = alloc_percpu(int);
5857
if (!dev->pcpu_refcnt)
5858
goto free_p;
5859
5860
if (dev_addr_init(dev))
5861
goto free_pcpu;
5862
5863
dev_mc_init(dev);
5864
dev_uc_init(dev);
5865
5866
dev_net_set(dev, &init_net);
5867
5868
dev->gso_max_size = GSO_MAX_SIZE;
5869
5870
INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5871
dev->ethtool_ntuple_list.count = 0;
5872
INIT_LIST_HEAD(&dev->napi_list);
5873
INIT_LIST_HEAD(&dev->unreg_list);
5874
INIT_LIST_HEAD(&dev->link_watch_list);
5875
dev->priv_flags = IFF_XMIT_DST_RELEASE;
5876
setup(dev);
5877
5878
dev->num_tx_queues = txqs;
5879
dev->real_num_tx_queues = txqs;
5880
if (netif_alloc_netdev_queues(dev))
5881
goto free_all;
5882
5883
#ifdef CONFIG_RPS
5884
dev->num_rx_queues = rxqs;
5885
dev->real_num_rx_queues = rxqs;
5886
if (netif_alloc_rx_queues(dev))
5887
goto free_all;
5888
#endif
5889
5890
strcpy(dev->name, name);
5891
dev->group = INIT_NETDEV_GROUP;
5892
return dev;
5893
5894
free_all:
5895
free_netdev(dev);
5896
return NULL;
5897
5898
free_pcpu:
5899
free_percpu(dev->pcpu_refcnt);
5900
kfree(dev->_tx);
5901
#ifdef CONFIG_RPS
5902
kfree(dev->_rx);
5903
#endif
5904
5905
free_p:
5906
kfree(p);
5907
return NULL;
5908
}
5909
EXPORT_SYMBOL(alloc_netdev_mqs);
5910
5911
/**
5912
* free_netdev - free network device
5913
* @dev: device
5914
*
5915
* This function does the last stage of destroying an allocated device
5916
* interface. The reference to the device object is released.
5917
* If this is the last reference then it will be freed.
5918
*/
5919
void free_netdev(struct net_device *dev)
5920
{
5921
struct napi_struct *p, *n;
5922
5923
release_net(dev_net(dev));
5924
5925
kfree(dev->_tx);
5926
#ifdef CONFIG_RPS
5927
kfree(dev->_rx);
5928
#endif
5929
5930
kfree(rcu_dereference_raw(dev->ingress_queue));
5931
5932
/* Flush device addresses */
5933
dev_addr_flush(dev);
5934
5935
/* Clear ethtool n-tuple list */
5936
ethtool_ntuple_flush(dev);
5937
5938
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5939
netif_napi_del(p);
5940
5941
free_percpu(dev->pcpu_refcnt);
5942
dev->pcpu_refcnt = NULL;
5943
5944
/* Compatibility with error handling in drivers */
5945
if (dev->reg_state == NETREG_UNINITIALIZED) {
5946
kfree((char *)dev - dev->padded);
5947
return;
5948
}
5949
5950
BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5951
dev->reg_state = NETREG_RELEASED;
5952
5953
/* will free via device release */
5954
put_device(&dev->dev);
5955
}
5956
EXPORT_SYMBOL(free_netdev);
5957
5958
/**
5959
* synchronize_net - Synchronize with packet receive processing
5960
*
5961
* Wait for packets currently being received to be done.
5962
* Does not block later packets from starting.
5963
*/
5964
void synchronize_net(void)
5965
{
5966
might_sleep();
5967
if (rtnl_is_locked())
5968
synchronize_rcu_expedited();
5969
else
5970
synchronize_rcu();
5971
}
5972
EXPORT_SYMBOL(synchronize_net);
5973
5974
/**
5975
* unregister_netdevice_queue - remove device from the kernel
5976
* @dev: device
5977
* @head: list
5978
*
5979
* This function shuts down a device interface and removes it
5980
* from the kernel tables.
5981
* If head is not NULL, the device is queued to be unregistered later.
5982
*
5983
* Callers must hold the rtnl semaphore. You may want
5984
* unregister_netdev() instead of this.
5985
*/
5986
5987
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5988
{
5989
ASSERT_RTNL();
5990
5991
if (head) {
5992
list_move_tail(&dev->unreg_list, head);
5993
} else {
5994
rollback_registered(dev);
5995
/* Finish processing unregister after unlock */
5996
net_set_todo(dev);
5997
}
5998
}
5999
EXPORT_SYMBOL(unregister_netdevice_queue);
6000
6001
/**
6002
* unregister_netdevice_many - unregister many devices
6003
* @head: list of devices
6004
*/
6005
void unregister_netdevice_many(struct list_head *head)
6006
{
6007
struct net_device *dev;
6008
6009
if (!list_empty(head)) {
6010
rollback_registered_many(head);
6011
list_for_each_entry(dev, head, unreg_list)
6012
net_set_todo(dev);
6013
}
6014
}
6015
EXPORT_SYMBOL(unregister_netdevice_many);
6016
6017
/**
6018
* unregister_netdev - remove device from the kernel
6019
* @dev: device
6020
*
6021
* This function shuts down a device interface and removes it
6022
* from the kernel tables.
6023
*
6024
* This is just a wrapper for unregister_netdevice that takes
6025
* the rtnl semaphore. In general you want to use this and not
6026
* unregister_netdevice.
6027
*/
6028
void unregister_netdev(struct net_device *dev)
6029
{
6030
rtnl_lock();
6031
unregister_netdevice(dev);
6032
rtnl_unlock();
6033
}
6034
EXPORT_SYMBOL(unregister_netdev);
6035
6036
/**
6037
* dev_change_net_namespace - move device to a different network namespace
6038
* @dev: device
6039
* @net: network namespace
6040
* @pat: If not NULL name pattern to try if the current device name
6041
* is already taken in the destination network namespace.
6042
*
6043
* This function shuts down a device interface and moves it
6044
* to a new network namespace. On success 0 is returned, on
6045
* a failure a negative errno code is returned.
6046
*
6047
* Callers must hold the rtnl semaphore.
6048
*/
6049
6050
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6051
{
6052
int err;
6053
6054
ASSERT_RTNL();
6055
6056
/* Don't allow namespace local devices to be moved. */
6057
err = -EINVAL;
6058
if (dev->features & NETIF_F_NETNS_LOCAL)
6059
goto out;
6060
6061
/* Ensure the device has been registrered */
6062
err = -EINVAL;
6063
if (dev->reg_state != NETREG_REGISTERED)
6064
goto out;
6065
6066
/* Get out if there is nothing todo */
6067
err = 0;
6068
if (net_eq(dev_net(dev), net))
6069
goto out;
6070
6071
/* Pick the destination device name, and ensure
6072
* we can use it in the destination network namespace.
6073
*/
6074
err = -EEXIST;
6075
if (__dev_get_by_name(net, dev->name)) {
6076
/* We get here if we can't use the current device name */
6077
if (!pat)
6078
goto out;
6079
if (dev_get_valid_name(dev, pat) < 0)
6080
goto out;
6081
}
6082
6083
/*
6084
* And now a mini version of register_netdevice unregister_netdevice.
6085
*/
6086
6087
/* If device is running close it first. */
6088
dev_close(dev);
6089
6090
/* And unlink it from device chain */
6091
err = -ENODEV;
6092
unlist_netdevice(dev);
6093
6094
synchronize_net();
6095
6096
/* Shutdown queueing discipline. */
6097
dev_shutdown(dev);
6098
6099
/* Notify protocols, that we are about to destroy
6100
this device. They should clean all the things.
6101
6102
Note that dev->reg_state stays at NETREG_REGISTERED.
6103
This is wanted because this way 8021q and macvlan know
6104
the device is just moving and can keep their slaves up.
6105
*/
6106
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6107
call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6108
6109
/*
6110
* Flush the unicast and multicast chains
6111
*/
6112
dev_uc_flush(dev);
6113
dev_mc_flush(dev);
6114
6115
/* Actually switch the network namespace */
6116
dev_net_set(dev, net);
6117
6118
/* If there is an ifindex conflict assign a new one */
6119
if (__dev_get_by_index(net, dev->ifindex)) {
6120
int iflink = (dev->iflink == dev->ifindex);
6121
dev->ifindex = dev_new_index(net);
6122
if (iflink)
6123
dev->iflink = dev->ifindex;
6124
}
6125
6126
/* Fixup kobjects */
6127
err = device_rename(&dev->dev, dev->name);
6128
WARN_ON(err);
6129
6130
/* Add the device back in the hashes */
6131
list_netdevice(dev);
6132
6133
/* Notify protocols, that a new device appeared. */
6134
call_netdevice_notifiers(NETDEV_REGISTER, dev);
6135
6136
/*
6137
* Prevent userspace races by waiting until the network
6138
* device is fully setup before sending notifications.
6139
*/
6140
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6141
6142
synchronize_net();
6143
err = 0;
6144
out:
6145
return err;
6146
}
6147
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
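
/*
 * Editor's illustrative sketch (not part of the upstream file): moving a
 * device into another namespace under RTNL, supplying a fallback name
 * pattern in case the current name is already taken there.  example_move()
 * is a hypothetical helper and "eth%d" is just one possible pattern.
 */
#if 0
static int example_move(struct net_device *dev, struct net *target)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target, "eth%d");
	rtnl_unlock();
	return err;
}
#endif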

static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU. */
	if (!list_empty(&oldsd->poll_list)) {
		list_splice_init(&oldsd->poll_list, &sd->poll_list);
		raise_softirq_irqoff(NET_RX_SOFTIRQ);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}


/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
u32 netdev_increment_features(u32 all, u32 one, u32 mask)
{
	if (mask & NETIF_F_GEN_CSUM)
		mask |= NETIF_F_ALL_CSUM;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If device needs checksumming, downgrade to it. */
	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
		all &= ~NETIF_F_NO_CSUM;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_GEN_CSUM)
		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
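
/*
 * Editor's illustrative sketch (not part of the upstream file): how a
 * master driver (bonding/bridge style) folds each slave's feature set into
 * its own with netdev_increment_features().  Starting from and masking with
 * NETIF_F_ONE_FOR_ALL is a simplification; real callers use their own mask.
 */
#if 0
static u32 example_compute_features(struct net_device *slaves[], int n)
{
	u32 features = NETIF_F_ONE_FOR_ALL;
	int i;

	for (i = 0; i < n; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     NETIF_F_ONE_FOR_ALL);
	return features;
}
#endif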

static struct hlist_head *netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}
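
/*
 * Editor's illustrative sketch (not part of the upstream file): this helper
 * exists for diagnostics, e.g. naming the offending driver in a transmit
 * watchdog style message.  example_warn_stall() is a hypothetical user.
 */
#if 0
static void example_warn_stall(struct net_device *dev)
{
	printk(KERN_WARNING "%s (%s): transmit queue appears stuck\n",
	       dev->name, netdev_drivername(dev));
}
#endif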

static int __netdev_printk(const char *level, const struct net_device *dev,
			   struct va_format *vaf)
{
	int r;

	if (dev && dev->dev.parent)
		r = dev_printk(level, dev->dev.parent, "%s: %pV",
			       netdev_name(dev), vaf);
	else if (dev)
		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
	else
		r = printk("%s(NULL net_device): %pV", level, vaf);

	return r;
}

int netdev_printk(const char *level, const struct net_device *dev,
		  const char *format, ...)
{
	struct va_format vaf;
	va_list args;
	int r;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	r = __netdev_printk(level, dev, &vaf);
	va_end(args);

	return r;
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)			\
int func(const struct net_device *dev, const char *fmt, ...)		\
{									\
	int r;								\
	struct va_format vaf;						\
	va_list args;							\
									\
	va_start(args, fmt);						\
									\
	vaf.fmt = fmt;							\
	vaf.va = &args;							\
									\
	r = __netdev_printk(level, dev, &vaf);				\
	va_end(args);							\
									\
	return r;							\
}									\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
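
/*
 * Editor's illustrative sketch (not part of the upstream file): drivers are
 * expected to use the per-level wrappers generated above instead of raw
 * printk(), so every message is prefixed with driver, bus and interface
 * name.  example_report_link() is a hypothetical user.
 */
#if 0
static void example_report_link(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}
#endif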

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
				__func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	list_del(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		memset(sd, 0, sizeof(*sd));
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		sd->completion_queue = NULL;
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->csd.flags = 0;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
		sd->backlog.gro_list = NULL;
		sd->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices, ensuring the loopback device
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);

static int __init initialize_hashrnd(void)
{
	get_random_bytes(&hashrnd, sizeof(hashrnd));
	return 0;
}

late_initcall_sync(initialize_hashrnd);
