GitHub Repository: torvalds/linux
Path: blob/master/net/packet/af_packet.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <[email protected]>
 *		Alan Cox, <[email protected]>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *		Copyright (C) 2011, <[email protected]>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/ethtool.h>
#include <linux/uio.h>
#include <linux/filter.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>
#include <linux/netfilter_netdev.h>

#include "internal.h"

/*
   Assumptions:
   - If the device has no dev->header_ops->create, there is no LL header
     visible above the device. In this case, its hard_header_len should be 0.
     The device may prepend its own header internally. In this case, its
     needed_headroom should be set to the space needed for it to add its
     internal header.
     For example, a WiFi driver pretending to be an Ethernet driver should
     set its hard_header_len to be the Ethernet header length, and set its
     needed_headroom to be (the real WiFi header length - the fake Ethernet
     header length).
   - The packet socket receives packets with the ll header already pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> data

Outgoing, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

Incoming, dev_has_header(dev) == false
   mac_header -> data
     However drivers often make it point to the ll header.
     This is incorrect because the ll header should be invisible to us.
   data       -> data

Outgoing, dev_has_header(dev) == false
   mac_header -> data. ll header is invisible to us.
   data       -> data

Summary:
   If dev_has_header(dev) == false we are unable to restore the ll header,
   because it is invisible to us.


On transmit:
------------

dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

dev_has_header(dev) == false (ll header is invisible to us)
   mac_header -> data
   data       -> data

   We should set the network_header on output to the correct position;
   the packet classifier depends on it.
 */

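/*
 * Illustrative userspace sketch (editor's addition, not part of the original
 * source): the SOCK_RAW/SOCK_DGRAM distinction described above is what a
 * caller sees when opening a packet socket. With SOCK_RAW the link-layer
 * header is part of the data; with SOCK_DGRAM it is stripped and reported
 * via struct sockaddr_ll instead.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <net/ethernet.h>
 *	#include <arpa/inet.h>
 *
 *	int raw_fd   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgram_fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	// raw_fd:   reads return frames starting at the link-layer header
 *	// dgram_fd: reads return frames starting at the network header;
 *	//           the link-layer address arrives in struct sockaddr_ll
 */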
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};
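/*
 * Illustrative userspace sketch (editor's addition): packet_mreq_max is the
 * kernel-side superset of the UAPI struct packet_mreq, allowing hardware
 * addresses longer than 8 bytes. From userspace a membership is still
 * requested with struct packet_mreq, e.g. to enable promiscuous mode on an
 * interface (names here are only examples):
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */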

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
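/*
 * Worked example (editor's addition): with V3_ALIGNMENT == 8, a block whose
 * tp_sizeof_priv is 13 lays out as
 *	BLK_HDR_LEN       = ALIGN(sizeof(struct tpacket_block_desc), 8)
 *	BLK_PLUS_PRIV(13) = BLK_HDR_LEN + ALIGN(13, 8) = BLK_HDR_LEN + 16
 * so the first frame starts 16 bytes after the aligned block descriptor,
 * which is the value prb_open_block() stores via BLOCK_O2FP().
 */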
190
191
struct packet_sock;
192
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
193
struct packet_type *pt, struct net_device *orig_dev);
194
195
static void *packet_previous_frame(struct packet_sock *po,
196
struct packet_ring_buffer *rb,
197
int status);
198
static void packet_increment_head(struct packet_ring_buffer *buff);
199
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
200
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
201
struct packet_sock *);
202
static void prb_retire_current_block(struct tpacket_kbdq_core *,
203
struct packet_sock *, unsigned int status);
204
static int prb_queue_frozen(struct tpacket_kbdq_core *);
205
static void prb_open_block(struct tpacket_kbdq_core *,
206
struct tpacket_block_desc *);
207
static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
208
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
209
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
210
struct tpacket3_hdr *);
211
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
212
struct tpacket3_hdr *);
213
static void packet_flush_mclist(struct sock *sk);
214
static u16 packet_pick_tx_queue(struct sk_buff *skb);
215
216
struct packet_skb_cb {
217
union {
218
struct sockaddr_pkt pkt;
219
union {
220
/* Trick: alias skb original length with
221
* ll.sll_family and ll.protocol in order
222
* to save room.
223
*/
224
unsigned int origlen;
225
struct sockaddr_ll ll;
226
};
227
} sa;
228
};
229
230
#define vio_le() virtio_legacy_is_little_endian()
231
232
#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
233
234
#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
235
#define GET_PBLOCK_DESC(x, bid) \
236
((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
237
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
238
((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
239
#define GET_NEXT_PRB_BLK_NUM(x) \
240
(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
241
((x)->kactive_blk_num+1) : 0)
242
243
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
244
static void __fanout_link(struct sock *sk, struct packet_sock *po);
245
246
#ifdef CONFIG_NETFILTER_EGRESS
247
static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb)
248
{
249
struct sk_buff *next, *head = NULL, *tail;
250
int rc;
251
252
rcu_read_lock();
253
for (; skb != NULL; skb = next) {
254
next = skb->next;
255
skb_mark_not_on_list(skb);
256
257
if (!nf_hook_egress(skb, &rc, skb->dev))
258
continue;
259
260
if (!head)
261
head = skb;
262
else
263
tail->next = skb;
264
265
tail = skb;
266
}
267
rcu_read_unlock();
268
269
return head;
270
}
271
#endif
272
273
static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb)
274
{
275
if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS))
276
return dev_queue_xmit(skb);
277
278
#ifdef CONFIG_NETFILTER_EGRESS
279
if (nf_hook_egress_active()) {
280
skb = nf_hook_direct_egress(skb);
281
if (!skb)
282
return NET_XMIT_DROP;
283
}
284
#endif
285
return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
286
}
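/*
 * Illustrative userspace sketch (editor's addition): the qdisc-bypass branch
 * above is selected per socket with the PACKET_QDISC_BYPASS option, which
 * makes transmitted frames skip the traffic-control layer and go straight to
 * the driver:
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 */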
287
288
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
289
{
290
struct net_device *dev;
291
292
rcu_read_lock();
293
dev = rcu_dereference(po->cached_dev);
294
dev_hold(dev);
295
rcu_read_unlock();
296
297
return dev;
298
}
299
300
static void packet_cached_dev_assign(struct packet_sock *po,
301
struct net_device *dev)
302
{
303
rcu_assign_pointer(po->cached_dev, dev);
304
}
305
306
static void packet_cached_dev_reset(struct packet_sock *po)
307
{
308
RCU_INIT_POINTER(po->cached_dev, NULL);
309
}
310
311
static u16 packet_pick_tx_queue(struct sk_buff *skb)
312
{
313
struct net_device *dev = skb->dev;
314
const struct net_device_ops *ops = dev->netdev_ops;
315
int cpu = raw_smp_processor_id();
316
u16 queue_index;
317
318
#ifdef CONFIG_XPS
319
skb->sender_cpu = cpu + 1;
320
#endif
321
skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
322
if (ops->ndo_select_queue) {
323
queue_index = ops->ndo_select_queue(dev, skb, NULL);
324
queue_index = netdev_cap_txqueue(dev, queue_index);
325
} else {
326
queue_index = netdev_pick_tx(dev, skb, NULL);
327
}
328
329
return queue_index;
330
}
331
332
/* __register_prot_hook must be invoked through register_prot_hook
333
* or from a context in which asynchronous accesses to the packet
334
* socket is not possible (packet_create()).
335
*/
336
static void __register_prot_hook(struct sock *sk)
337
{
338
struct packet_sock *po = pkt_sk(sk);
339
340
if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
341
if (po->fanout)
342
__fanout_link(sk, po);
343
else
344
dev_add_pack(&po->prot_hook);
345
346
sock_hold(sk);
347
packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1);
348
}
349
}
350
351
static void register_prot_hook(struct sock *sk)
352
{
353
lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
354
__register_prot_hook(sk);
355
}
356
357
/* If the sync parameter is true, we will temporarily drop
358
* the po->bind_lock and do a synchronize_net to make sure no
359
* asynchronous packet processing paths still refer to the elements
360
* of po->prot_hook. If the sync parameter is false, it is the
361
* callers responsibility to take care of this.
362
*/
363
static void __unregister_prot_hook(struct sock *sk, bool sync)
364
{
365
struct packet_sock *po = pkt_sk(sk);
366
367
lockdep_assert_held_once(&po->bind_lock);
368
369
packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0);
370
371
if (po->fanout)
372
__fanout_unlink(sk, po);
373
else
374
__dev_remove_pack(&po->prot_hook);
375
376
__sock_put(sk);
377
378
if (sync) {
379
spin_unlock(&po->bind_lock);
380
synchronize_net();
381
spin_lock(&po->bind_lock);
382
}
383
}
384
385
static void unregister_prot_hook(struct sock *sk, bool sync)
386
{
387
struct packet_sock *po = pkt_sk(sk);
388
389
if (packet_sock_flag(po, PACKET_SOCK_RUNNING))
390
__unregister_prot_hook(sk, sync);
391
}
392
393
static inline struct page * __pure pgv_to_page(void *addr)
394
{
395
if (is_vmalloc_addr(addr))
396
return vmalloc_to_page(addr);
397
return virt_to_page(addr);
398
}
399
400
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
401
{
402
union tpacket_uhdr h;
403
404
/* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */
405
406
h.raw = frame;
407
switch (po->tp_version) {
408
case TPACKET_V1:
409
WRITE_ONCE(h.h1->tp_status, status);
410
flush_dcache_page(pgv_to_page(&h.h1->tp_status));
411
break;
412
case TPACKET_V2:
413
WRITE_ONCE(h.h2->tp_status, status);
414
flush_dcache_page(pgv_to_page(&h.h2->tp_status));
415
break;
416
case TPACKET_V3:
417
WRITE_ONCE(h.h3->tp_status, status);
418
flush_dcache_page(pgv_to_page(&h.h3->tp_status));
419
break;
420
default:
421
WARN(1, "TPACKET version not supported.\n");
422
BUG();
423
}
424
425
smp_wmb();
426
}
427
428
static int __packet_get_status(const struct packet_sock *po, void *frame)
429
{
430
union tpacket_uhdr h;
431
432
smp_rmb();
433
434
/* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */
435
436
h.raw = frame;
437
switch (po->tp_version) {
438
case TPACKET_V1:
439
flush_dcache_page(pgv_to_page(&h.h1->tp_status));
440
return READ_ONCE(h.h1->tp_status);
441
case TPACKET_V2:
442
flush_dcache_page(pgv_to_page(&h.h2->tp_status));
443
return READ_ONCE(h.h2->tp_status);
444
case TPACKET_V3:
445
flush_dcache_page(pgv_to_page(&h.h3->tp_status));
446
return READ_ONCE(h.h3->tp_status);
447
default:
448
WARN(1, "TPACKET version not supported.\n");
449
BUG();
450
return 0;
451
}
452
}
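/*
 * Illustrative userspace sketch (editor's addition): __packet_set_status()
 * and __packet_get_status() are the kernel half of the tp_status handshake.
 * A typical V2 RX-ring consumer polls the same field in the mmap()ed ring
 * and hands frames back by clearing it (userspace must pair this with its
 * own memory barriers, omitted here for brevity):
 *
 *	struct tpacket2_hdr *hdr = frame;	// frame in the mmap()ed ring
 *	if (hdr->tp_status & TP_STATUS_USER) {
 *		// ... consume the frame ...
 *		hdr->tp_status = TP_STATUS_KERNEL;	// return it to the kernel
 *	}
 */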
453
454
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
455
unsigned int flags)
456
{
457
struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
458
459
if (shhwtstamps &&
460
(flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
461
ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
462
return TP_STATUS_TS_RAW_HARDWARE;
463
464
if ((flags & SOF_TIMESTAMPING_SOFTWARE) &&
465
ktime_to_timespec64_cond(skb_tstamp(skb), ts))
466
return TP_STATUS_TS_SOFTWARE;
467
468
return 0;
469
}
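/*
 * Illustrative userspace sketch (editor's addition): the flags consulted by
 * tpacket_get_timestamp() come from the PACKET_TIMESTAMP socket option, e.g.
 * to request raw hardware timestamps in the ring:
 *
 *	int req = SOF_TIMESTAMPING_RAW_HARDWARE;
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 */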
470
471
static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
472
struct sk_buff *skb)
473
{
474
union tpacket_uhdr h;
475
struct timespec64 ts;
476
__u32 ts_status;
477
478
if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp))))
479
return 0;
480
481
h.raw = frame;
482
/*
483
* versions 1 through 3 overflow the timestamps in y2106, since they
484
* all store the seconds in a 32-bit unsigned integer.
485
* If we create a version 4, that should have a 64-bit timestamp,
486
* either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
487
* nanoseconds.
488
*/
489
switch (po->tp_version) {
490
case TPACKET_V1:
491
h.h1->tp_sec = ts.tv_sec;
492
h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
493
break;
494
case TPACKET_V2:
495
h.h2->tp_sec = ts.tv_sec;
496
h.h2->tp_nsec = ts.tv_nsec;
497
break;
498
case TPACKET_V3:
499
h.h3->tp_sec = ts.tv_sec;
500
h.h3->tp_nsec = ts.tv_nsec;
501
break;
502
default:
503
WARN(1, "TPACKET version not supported.\n");
504
BUG();
505
}
506
507
/* one flush is safe, as both fields always lie on the same cacheline */
508
flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
509
smp_wmb();
510
511
return ts_status;
512
}
513
514
static void *packet_lookup_frame(const struct packet_sock *po,
515
const struct packet_ring_buffer *rb,
516
unsigned int position,
517
int status)
518
{
519
unsigned int pg_vec_pos, frame_offset;
520
union tpacket_uhdr h;
521
522
pg_vec_pos = position / rb->frames_per_block;
523
frame_offset = position % rb->frames_per_block;
524
525
h.raw = rb->pg_vec[pg_vec_pos].buffer +
526
(frame_offset * rb->frame_size);
527
528
if (status != __packet_get_status(po, h.raw))
529
return NULL;
530
531
return h.raw;
532
}
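/*
 * Worked example (editor's addition): with tp_block_size 4096 and
 * tp_frame_size 2048 (so frames_per_block == 2), looking up position 5 gives
 *	pg_vec_pos   = 5 / 2 = 2	(third block)
 *	frame_offset = 5 % 2 = 1	(second frame in that block)
 * i.e. h.raw = rb->pg_vec[2].buffer + 1 * 2048.
 */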
533
534
static void *packet_current_frame(struct packet_sock *po,
535
struct packet_ring_buffer *rb,
536
int status)
537
{
538
return packet_lookup_frame(po, rb, rb->head, status);
539
}
540
541
static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev)
542
{
543
struct vlan_hdr vhdr, *vh;
544
unsigned int header_len;
545
546
if (!dev)
547
return 0;
548
549
/* In the SOCK_DGRAM scenario, skb data starts at the network
550
* protocol, which is after the VLAN headers. The outer VLAN
551
* header is at the hard_header_len offset in non-variable
552
* length link layer headers. If it's a VLAN device, the
553
* min_header_len should be used to exclude the VLAN header
554
* size.
555
*/
556
if (dev->min_header_len == dev->hard_header_len)
557
header_len = dev->hard_header_len;
558
else if (is_vlan_dev(dev))
559
header_len = dev->min_header_len;
560
else
561
return 0;
562
563
vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len,
564
sizeof(vhdr), &vhdr);
565
if (unlikely(!vh))
566
return 0;
567
568
return ntohs(vh->h_vlan_TCI);
569
}
570
571
static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
572
{
573
__be16 proto = skb->protocol;
574
575
if (unlikely(eth_type_vlan(proto)))
576
proto = vlan_get_protocol_offset_inline(skb, proto,
577
skb_mac_offset(skb),
578
NULL);
579
580
return proto;
581
}
582
583
static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
584
struct sk_buff_head *rb_queue)
585
{
586
struct tpacket_kbdq_core *pkc;
587
588
pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
589
hrtimer_cancel(&pkc->retire_blk_timer);
590
}
591
592
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
593
int blk_size_in_bytes)
594
{
595
struct net_device *dev;
596
unsigned int mbits, div;
597
struct ethtool_link_ksettings ecmd;
598
int err;
599
600
rtnl_lock();
601
dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
602
if (unlikely(!dev)) {
603
rtnl_unlock();
604
return DEFAULT_PRB_RETIRE_TOV;
605
}
606
err = __ethtool_get_link_ksettings(dev, &ecmd);
607
rtnl_unlock();
608
if (err)
609
return DEFAULT_PRB_RETIRE_TOV;
610
611
/* If the link speed is so slow you don't really
612
* need to worry about perf anyways
613
*/
614
if (ecmd.base.speed < SPEED_1000 ||
615
ecmd.base.speed == SPEED_UNKNOWN)
616
return DEFAULT_PRB_RETIRE_TOV;
617
618
div = ecmd.base.speed / 1000;
619
mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
620
621
if (div)
622
mbits /= div;
623
624
if (div)
625
return mbits + 1;
626
return mbits;
627
}
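/*
 * Worked example (editor's addition): for a 1 MiB block on a 10 Gbit/s link,
 *	mbits = (1048576 * 8) / (1024 * 1024) = 8
 *	div   = 10000 / 1000               = 10
 *	mbits / div                        = 0, returned as 0 + 1 = 1 ms
 * while the same block on a 1 Gbit/s link yields 8 / 1 + 1 = 9 ms, close to
 * the "~8 ms to fill a block" estimate in the comment further below.
 */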
628
629
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
630
union tpacket_req_u *req_u)
631
{
632
p1->feature_req_word = req_u->req3.tp_feature_req_word;
633
}
634
635
static void init_prb_bdqc(struct packet_sock *po,
636
struct packet_ring_buffer *rb,
637
struct pgv *pg_vec,
638
union tpacket_req_u *req_u)
639
{
640
struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
641
struct tpacket_block_desc *pbd;
642
643
memset(p1, 0x0, sizeof(*p1));
644
645
p1->knxt_seq_num = 1;
646
p1->pkbdq = pg_vec;
647
pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
648
p1->pkblk_start = pg_vec[0].buffer;
649
p1->kblk_size = req_u->req3.tp_block_size;
650
p1->knum_blocks = req_u->req3.tp_block_nr;
651
p1->hdrlen = po->tp_hdrlen;
652
p1->version = po->tp_version;
653
po->stats.stats3.tp_freeze_q_cnt = 0;
654
if (req_u->req3.tp_retire_blk_tov)
655
p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
656
else
657
p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
658
req_u->req3.tp_block_size));
659
p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
660
rwlock_init(&p1->blk_fill_in_prog_lock);
661
662
p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
663
prb_init_ft_ops(p1, req_u);
664
hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
665
CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
666
hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
667
HRTIMER_MODE_REL_SOFT);
668
prb_open_block(p1, pbd);
669
}
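/*
 * Illustrative userspace sketch (editor's addition): the fields consumed by
 * init_prb_bdqc() come from the req3 variant of PACKET_RX_RING, after the
 * socket has been switched to TPACKET_V3 (values below are examples only):
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size     = 1 << 20,	// 1 MiB blocks
 *		.tp_block_nr       = 8,
 *		.tp_frame_size     = 2048,
 *		.tp_frame_nr       = (1 << 20) / 2048 * 8,
 *		.tp_retire_blk_tov = 60,	// ms; 0 lets the kernel derive it
 *		.tp_sizeof_priv    = 0,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */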
670
671
/*
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 */
682
static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
683
{
684
struct packet_sock *po =
685
timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
686
struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
687
unsigned int frozen;
688
struct tpacket_block_desc *pbd;
689
690
spin_lock(&po->sk.sk_receive_queue.lock);
691
692
frozen = prb_queue_frozen(pkc);
693
pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
694
695
	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *	lock(); increment BLOCK_NUM_PKTS; unlock()
	 *	copy_bits() is in progress ...
	 *	timer fires on other cpu:
	 *	we can't retire the current block because copy_bits
	 *	is in progress.
	 */
704
if (BLOCK_NUM_PKTS(pbd)) {
705
/* Waiting for skb_copy_bits to finish... */
706
write_lock(&pkc->blk_fill_in_prog_lock);
707
write_unlock(&pkc->blk_fill_in_prog_lock);
708
}
709
710
if (!frozen) {
711
if (BLOCK_NUM_PKTS(pbd)) {
712
/* Not an empty block. Need to retire the block. */
713
prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
714
prb_dispatch_next_block(pkc, po);
715
}
716
} else {
717
/* Case 1. Queue was frozen because user-space was
718
* lagging behind.
719
*/
720
if (!prb_curr_blk_in_use(pbd)) {
721
			/* Case 2. Queue was frozen, user-space caught up,
			 * now the link went idle && the timer fired.
			 * We don't have a block to close. So we open this
			 * block and restart the timer.
			 * Opening a block thaws the queue and restarts the timer.
			 * Thawing/timer-refresh is a side effect.
			 */
728
prb_open_block(pkc, pbd);
729
}
730
}
731
732
hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
733
spin_unlock(&po->sk.sk_receive_queue.lock);
734
return HRTIMER_RESTART;
735
}
736
737
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
738
struct tpacket_block_desc *pbd1, __u32 status)
739
{
740
/* Flush everything minus the block header */
741
742
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
743
u8 *start, *end;
744
745
start = (u8 *)pbd1;
746
747
/* Skip the block header(we know header WILL fit in 4K) */
748
start += PAGE_SIZE;
749
750
end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
751
for (; start < end; start += PAGE_SIZE)
752
flush_dcache_page(pgv_to_page(start));
753
754
smp_wmb();
755
#endif
756
757
/* Now update the block status. */
758
759
BLOCK_STATUS(pbd1) = status;
760
761
/* Flush the block header */
762
763
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
764
start = (u8 *)pbd1;
765
flush_dcache_page(pgv_to_page(start));
766
767
smp_wmb();
768
#endif
769
}
770
771
/*
 * Side effects:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DON'T refresh the timer on purpose,
 * because almost always the next block will be opened.
 */
780
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
781
struct tpacket_block_desc *pbd1,
782
struct packet_sock *po, unsigned int stat)
783
{
784
__u32 status = TP_STATUS_USER | stat;
785
786
struct tpacket3_hdr *last_pkt;
787
struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
788
struct sock *sk = &po->sk;
789
790
if (atomic_read(&po->tp_drops))
791
status |= TP_STATUS_LOSING;
792
793
last_pkt = (struct tpacket3_hdr *)pkc1->prev;
794
last_pkt->tp_next_offset = 0;
795
796
/* Get the ts of the last pkt */
797
if (BLOCK_NUM_PKTS(pbd1)) {
798
h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
799
h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
800
} else {
801
/* Ok, we tmo'd - so get the current time.
802
*
803
* It shouldn't really happen as we don't close empty
804
* blocks. See prb_retire_rx_blk_timer_expired().
805
*/
806
struct timespec64 ts;
807
ktime_get_real_ts64(&ts);
808
h1->ts_last_pkt.ts_sec = ts.tv_sec;
809
h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
810
}
811
812
smp_wmb();
813
814
/* Flush the block */
815
prb_flush_block(pkc1, pbd1, status);
816
817
sk->sk_data_ready(sk);
818
819
pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
820
}
821
822
static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
823
{
824
pkc->reset_pending_on_curr_blk = 0;
825
}
826
827
/*
 * prb_open_block is called by tpacket_rcv or the timer callback.
 *
 * Reasons why we do NOT update the hrtimer in prb_open_block:
 * 1) It would add complexity to distinguish between the two caller scenarios.
 * 2) hrtimer_cancel and hrtimer_start would need to be called to update the
 *    TMO of an already enqueued hrtimer, leading to complex shutdown logic.
 *
 * One side effect of not updating the hrtimer when called by tpacket_rcv is
 * that a newly opened block triggered by tpacket_rcv may be retired earlier
 * than expected. On the other hand, if the timeout were updated in
 * prb_open_block, frequent packet reception leading to prb_open_block being
 * called could cause the hrtimer to be removed and re-enqueued repeatedly.
 */
841
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
842
struct tpacket_block_desc *pbd1)
843
{
844
struct timespec64 ts;
845
struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
846
847
smp_rmb();
848
849
/* We could have just memset this but we will lose the
850
* flexibility of making the priv area sticky
851
*/
852
853
BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
854
BLOCK_NUM_PKTS(pbd1) = 0;
855
BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
856
857
ktime_get_real_ts64(&ts);
858
859
h1->ts_first_pkt.ts_sec = ts.tv_sec;
860
h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
861
862
pkc1->pkblk_start = (char *)pbd1;
863
pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
864
865
BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
866
BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
867
868
pbd1->version = pkc1->version;
869
pkc1->prev = pkc1->nxt_offset;
870
pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
871
872
prb_thaw_queue(pkc1);
873
874
smp_wmb();
875
}
876
877
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *       prb_retire_current_block()
 *       prb_dispatch_next_block()
 *         |->(BLOCK_STATUS == USER) evaluates to true
 * 5.1) Since block-0 is currently in use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
900
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
901
struct packet_sock *po)
902
{
903
pkc->reset_pending_on_curr_blk = 1;
904
po->stats.stats3.tp_freeze_q_cnt++;
905
}
906
907
#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
908
909
/*
910
* If the next block is free then we will dispatch it
911
* and return a good offset.
912
* Else, we will freeze the queue.
913
* So, caller must check the return value.
914
*/
915
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
916
struct packet_sock *po)
917
{
918
struct tpacket_block_desc *pbd;
919
920
smp_rmb();
921
922
/* 1. Get current block num */
923
pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
924
925
/* 2. If this block is currently in_use then freeze the queue */
926
if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
927
prb_freeze_queue(pkc, po);
928
return NULL;
929
}
930
931
/*
932
* 3.
933
* open this block and return the offset where the first packet
934
* needs to get stored.
935
*/
936
prb_open_block(pkc, pbd);
937
return (void *)pkc->nxt_offset;
938
}
939
940
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
941
struct packet_sock *po, unsigned int status)
942
{
943
struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
944
945
/* retire/close the current block */
946
if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
947
/*
948
* Plug the case where copy_bits() is in progress on
949
* cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
950
* have space to copy the pkt in the current block and
951
* called prb_retire_current_block()
952
*
953
* We don't need to worry about the TMO case because
954
* the timer-handler already handled this case.
955
*/
956
if (!(status & TP_STATUS_BLK_TMO)) {
957
/* Waiting for skb_copy_bits to finish... */
958
write_lock(&pkc->blk_fill_in_prog_lock);
959
write_unlock(&pkc->blk_fill_in_prog_lock);
960
}
961
prb_close_block(pkc, pbd, po, status);
962
return;
963
}
964
}
965
966
static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
967
{
968
return TP_STATUS_USER & BLOCK_STATUS(pbd);
969
}
970
971
static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
972
{
973
return pkc->reset_pending_on_curr_blk;
974
}
975
976
static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
977
__releases(&pkc->blk_fill_in_prog_lock)
978
{
979
struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
980
981
read_unlock(&pkc->blk_fill_in_prog_lock);
982
}
983
984
static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
985
struct tpacket3_hdr *ppd)
986
{
987
ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
988
}
989
990
static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
991
struct tpacket3_hdr *ppd)
992
{
993
ppd->hv1.tp_rxhash = 0;
994
}
995
996
static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
997
struct tpacket3_hdr *ppd)
998
{
999
struct packet_sock *po = container_of(pkc, struct packet_sock, rx_ring.prb_bdqc);
1000
1001
if (skb_vlan_tag_present(pkc->skb)) {
1002
ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
1003
ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1004
ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
1005
} else if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) {
1006
ppd->hv1.tp_vlan_tci = vlan_get_tci(pkc->skb, pkc->skb->dev);
1007
ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol);
1008
ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
1009
} else {
1010
ppd->hv1.tp_vlan_tci = 0;
1011
ppd->hv1.tp_vlan_tpid = 0;
1012
ppd->tp_status = TP_STATUS_AVAILABLE;
1013
}
1014
}
1015
1016
static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
1017
struct tpacket3_hdr *ppd)
1018
{
1019
ppd->hv1.tp_padding = 0;
1020
prb_fill_vlan_info(pkc, ppd);
1021
1022
if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1023
prb_fill_rxhash(pkc, ppd);
1024
else
1025
prb_clear_rxhash(pkc, ppd);
1026
}
1027
1028
static void prb_fill_curr_block(char *curr,
1029
struct tpacket_kbdq_core *pkc,
1030
struct tpacket_block_desc *pbd,
1031
unsigned int len)
1032
__acquires(&pkc->blk_fill_in_prog_lock)
1033
{
1034
struct tpacket3_hdr *ppd;
1035
1036
ppd = (struct tpacket3_hdr *)curr;
1037
ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1038
pkc->prev = curr;
1039
pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1040
BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1041
BLOCK_NUM_PKTS(pbd) += 1;
1042
read_lock(&pkc->blk_fill_in_prog_lock);
1043
prb_run_all_ft_ops(pkc, ppd);
1044
}
1045
1046
/* Assumes caller has the sk->rx_queue.lock */
1047
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1048
struct sk_buff *skb,
1049
unsigned int len
1050
)
1051
{
1052
struct tpacket_kbdq_core *pkc;
1053
struct tpacket_block_desc *pbd;
1054
char *curr, *end;
1055
1056
pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1057
pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1058
1059
/* Queue is frozen when user space is lagging behind */
1060
if (prb_queue_frozen(pkc)) {
1061
/*
1062
* Check if that last block which caused the queue to freeze,
1063
* is still in_use by user-space.
1064
*/
1065
if (prb_curr_blk_in_use(pbd)) {
1066
/* Can't record this packet */
1067
return NULL;
1068
} else {
1069
/*
1070
* Ok, the block was released by user-space.
1071
* Now let's open that block.
1072
* opening a block also thaws the queue.
1073
* Thawing is a side effect.
1074
*/
1075
prb_open_block(pkc, pbd);
1076
}
1077
}
1078
1079
smp_mb();
1080
curr = pkc->nxt_offset;
1081
pkc->skb = skb;
1082
end = (char *)pbd + pkc->kblk_size;
1083
1084
/* first try the current block */
1085
if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1086
prb_fill_curr_block(curr, pkc, pbd, len);
1087
return (void *)curr;
1088
}
1089
1090
/* Ok, close the current block */
1091
prb_retire_current_block(pkc, po, 0);
1092
1093
/* Now, try to dispatch the next block */
1094
curr = (char *)prb_dispatch_next_block(pkc, po);
1095
if (curr) {
1096
pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1097
prb_fill_curr_block(curr, pkc, pbd, len);
1098
return (void *)curr;
1099
}
1100
1101
	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
1105
return NULL;
1106
}
1107
1108
static void *packet_current_rx_frame(struct packet_sock *po,
1109
struct sk_buff *skb,
1110
int status, unsigned int len)
1111
{
1112
char *curr = NULL;
1113
switch (po->tp_version) {
1114
case TPACKET_V1:
1115
case TPACKET_V2:
1116
curr = packet_lookup_frame(po, &po->rx_ring,
1117
po->rx_ring.head, status);
1118
return curr;
1119
case TPACKET_V3:
1120
return __packet_lookup_frame_in_block(po, skb, len);
1121
default:
1122
WARN(1, "TPACKET version not supported\n");
1123
BUG();
1124
return NULL;
1125
}
1126
}
1127
1128
static void *prb_lookup_block(const struct packet_sock *po,
1129
const struct packet_ring_buffer *rb,
1130
unsigned int idx,
1131
int status)
1132
{
1133
struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1134
struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1135
1136
if (status != BLOCK_STATUS(pbd))
1137
return NULL;
1138
return pbd;
1139
}
1140
1141
static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1142
{
1143
unsigned int prev;
1144
if (rb->prb_bdqc.kactive_blk_num)
1145
prev = rb->prb_bdqc.kactive_blk_num-1;
1146
else
1147
prev = rb->prb_bdqc.knum_blocks-1;
1148
return prev;
1149
}
1150
1151
/* Assumes caller has held the rx_queue.lock */
1152
static void *__prb_previous_block(struct packet_sock *po,
1153
struct packet_ring_buffer *rb,
1154
int status)
1155
{
1156
unsigned int previous = prb_previous_blk_num(rb);
1157
return prb_lookup_block(po, rb, previous, status);
1158
}
1159
1160
static void *packet_previous_rx_frame(struct packet_sock *po,
1161
struct packet_ring_buffer *rb,
1162
int status)
1163
{
1164
if (po->tp_version <= TPACKET_V2)
1165
return packet_previous_frame(po, rb, status);
1166
1167
return __prb_previous_block(po, rb, status);
1168
}
1169
1170
static void packet_increment_rx_head(struct packet_sock *po,
1171
struct packet_ring_buffer *rb)
1172
{
1173
switch (po->tp_version) {
1174
case TPACKET_V1:
1175
case TPACKET_V2:
1176
return packet_increment_head(rb);
1177
case TPACKET_V3:
1178
default:
1179
WARN(1, "TPACKET version not supported.\n");
1180
BUG();
1181
return;
1182
}
1183
}
1184
1185
static void *packet_previous_frame(struct packet_sock *po,
1186
struct packet_ring_buffer *rb,
1187
int status)
1188
{
1189
unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1190
return packet_lookup_frame(po, rb, previous, status);
1191
}
1192
1193
static void packet_increment_head(struct packet_ring_buffer *buff)
1194
{
1195
buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1196
}
1197
1198
static void packet_inc_pending(struct packet_ring_buffer *rb)
1199
{
1200
this_cpu_inc(*rb->pending_refcnt);
1201
}
1202
1203
static void packet_dec_pending(struct packet_ring_buffer *rb)
1204
{
1205
this_cpu_dec(*rb->pending_refcnt);
1206
}
1207
1208
static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1209
{
1210
unsigned int refcnt = 0;
1211
int cpu;
1212
1213
/* We don't use pending refcount in rx_ring. */
1214
if (rb->pending_refcnt == NULL)
1215
return 0;
1216
1217
for_each_possible_cpu(cpu)
1218
refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1219
1220
return refcnt;
1221
}
1222
1223
static int packet_alloc_pending(struct packet_sock *po)
1224
{
1225
po->rx_ring.pending_refcnt = NULL;
1226
1227
po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1228
if (unlikely(po->tx_ring.pending_refcnt == NULL))
1229
return -ENOBUFS;
1230
1231
return 0;
1232
}
1233
1234
static void packet_free_pending(struct packet_sock *po)
1235
{
1236
free_percpu(po->tx_ring.pending_refcnt);
1237
}
1238
1239
#define ROOM_POW_OFF 2
1240
#define ROOM_NONE 0x0
1241
#define ROOM_LOW 0x1
1242
#define ROOM_NORMAL 0x2
1243
1244
static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
1245
{
1246
int idx, len;
1247
1248
len = READ_ONCE(po->rx_ring.frame_max) + 1;
1249
idx = READ_ONCE(po->rx_ring.head);
1250
if (pow_off)
1251
idx += len >> pow_off;
1252
if (idx >= len)
1253
idx -= len;
1254
return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1255
}
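/*
 * Worked example (editor's addition): with 256 frames in the RX ring
 * (frame_max == 255, so len == 256) and ROOM_POW_OFF == 2, the probe above
 * looks len >> 2 == 64 frames ahead of the current head. Only if that slot
 * is still TP_STATUS_KERNEL does the caller report ROOM_NORMAL; otherwise a
 * pow_off of 0 re-checks the head itself to distinguish ROOM_LOW from
 * ROOM_NONE (see __packet_rcv_has_room() below).
 */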
1256
1257
static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
1258
{
1259
int idx, len;
1260
1261
len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1262
idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
1263
if (pow_off)
1264
idx += len >> pow_off;
1265
if (idx >= len)
1266
idx -= len;
1267
return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1268
}
1269
1270
static int __packet_rcv_has_room(const struct packet_sock *po,
1271
const struct sk_buff *skb)
1272
{
1273
const struct sock *sk = &po->sk;
1274
int ret = ROOM_NONE;
1275
1276
if (po->prot_hook.func != tpacket_rcv) {
1277
int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1278
int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1279
- (skb ? skb->truesize : 0);
1280
1281
if (avail > (rcvbuf >> ROOM_POW_OFF))
1282
return ROOM_NORMAL;
1283
else if (avail > 0)
1284
return ROOM_LOW;
1285
else
1286
return ROOM_NONE;
1287
}
1288
1289
if (po->tp_version == TPACKET_V3) {
1290
if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1291
ret = ROOM_NORMAL;
1292
else if (__tpacket_v3_has_room(po, 0))
1293
ret = ROOM_LOW;
1294
} else {
1295
if (__tpacket_has_room(po, ROOM_POW_OFF))
1296
ret = ROOM_NORMAL;
1297
else if (__tpacket_has_room(po, 0))
1298
ret = ROOM_LOW;
1299
}
1300
1301
return ret;
1302
}
1303
1304
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1305
{
1306
bool pressure;
1307
int ret;
1308
1309
ret = __packet_rcv_has_room(po, skb);
1310
pressure = ret != ROOM_NORMAL;
1311
1312
if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure)
1313
packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure);
1314
1315
return ret;
1316
}
1317
1318
static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1319
{
1320
if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) &&
1321
__packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1322
packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false);
1323
}
1324
1325
static void packet_sock_destruct(struct sock *sk)
1326
{
1327
skb_queue_purge(&sk->sk_error_queue);
1328
1329
WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1330
WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1331
1332
if (!sock_flag(sk, SOCK_DEAD)) {
1333
pr_err("Attempt to release alive packet socket: %p\n", sk);
1334
return;
1335
}
1336
}
1337
1338
static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1339
{
1340
u32 *history = po->rollover->history;
1341
u32 victim, rxhash;
1342
int i, count = 0;
1343
1344
rxhash = skb_get_hash(skb);
1345
for (i = 0; i < ROLLOVER_HLEN; i++)
1346
if (READ_ONCE(history[i]) == rxhash)
1347
count++;
1348
1349
victim = get_random_u32_below(ROLLOVER_HLEN);
1350
1351
/* Avoid dirtying the cache line if possible */
1352
if (READ_ONCE(history[victim]) != rxhash)
1353
WRITE_ONCE(history[victim], rxhash);
1354
1355
return count > (ROLLOVER_HLEN >> 1);
1356
}
1357
1358
static unsigned int fanout_demux_hash(struct packet_fanout *f,
1359
struct sk_buff *skb,
1360
unsigned int num)
1361
{
1362
return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1363
}
1364
1365
static unsigned int fanout_demux_lb(struct packet_fanout *f,
1366
struct sk_buff *skb,
1367
unsigned int num)
1368
{
1369
unsigned int val = atomic_inc_return(&f->rr_cur);
1370
1371
return val % num;
1372
}
1373
1374
static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1375
struct sk_buff *skb,
1376
unsigned int num)
1377
{
1378
return smp_processor_id() % num;
1379
}
1380
1381
static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1382
struct sk_buff *skb,
1383
unsigned int num)
1384
{
1385
return get_random_u32_below(num);
1386
}
1387
1388
static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1389
struct sk_buff *skb,
1390
unsigned int idx, bool try_self,
1391
unsigned int num)
1392
{
1393
struct packet_sock *po, *po_next, *po_skip = NULL;
1394
unsigned int i, j, room = ROOM_NONE;
1395
1396
po = pkt_sk(rcu_dereference(f->arr[idx]));
1397
1398
if (try_self) {
1399
room = packet_rcv_has_room(po, skb);
1400
if (room == ROOM_NORMAL ||
1401
(room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1402
return idx;
1403
po_skip = po;
1404
}
1405
1406
i = j = min_t(int, po->rollover->sock, num - 1);
1407
do {
1408
po_next = pkt_sk(rcu_dereference(f->arr[i]));
1409
if (po_next != po_skip &&
1410
!packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) &&
1411
packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1412
if (i != j)
1413
po->rollover->sock = i;
1414
atomic_long_inc(&po->rollover->num);
1415
if (room == ROOM_LOW)
1416
atomic_long_inc(&po->rollover->num_huge);
1417
return i;
1418
}
1419
1420
if (++i == num)
1421
i = 0;
1422
} while (i != j);
1423
1424
atomic_long_inc(&po->rollover->num_failed);
1425
return idx;
1426
}
1427
1428
static unsigned int fanout_demux_qm(struct packet_fanout *f,
1429
struct sk_buff *skb,
1430
unsigned int num)
1431
{
1432
return skb_get_queue_mapping(skb) % num;
1433
}
1434
1435
static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1436
struct sk_buff *skb,
1437
unsigned int num)
1438
{
1439
struct bpf_prog *prog;
1440
unsigned int ret = 0;
1441
1442
rcu_read_lock();
1443
prog = rcu_dereference(f->bpf_prog);
1444
if (prog)
1445
ret = bpf_prog_run_clear_cb(prog, skb) % num;
1446
rcu_read_unlock();
1447
1448
return ret;
1449
}
1450
1451
static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1452
{
1453
return f->flags & (flag >> 8);
1454
}
1455
1456
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1457
struct packet_type *pt, struct net_device *orig_dev)
1458
{
1459
struct packet_fanout *f = pt->af_packet_priv;
1460
unsigned int num = READ_ONCE(f->num_members);
1461
struct net *net = read_pnet(&f->net);
1462
struct packet_sock *po;
1463
unsigned int idx;
1464
1465
if (!net_eq(dev_net(dev), net) || !num) {
1466
kfree_skb(skb);
1467
return 0;
1468
}
1469
1470
if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1471
skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1472
if (!skb)
1473
return 0;
1474
}
1475
switch (f->type) {
1476
case PACKET_FANOUT_HASH:
1477
default:
1478
idx = fanout_demux_hash(f, skb, num);
1479
break;
1480
case PACKET_FANOUT_LB:
1481
idx = fanout_demux_lb(f, skb, num);
1482
break;
1483
case PACKET_FANOUT_CPU:
1484
idx = fanout_demux_cpu(f, skb, num);
1485
break;
1486
case PACKET_FANOUT_RND:
1487
idx = fanout_demux_rnd(f, skb, num);
1488
break;
1489
case PACKET_FANOUT_QM:
1490
idx = fanout_demux_qm(f, skb, num);
1491
break;
1492
case PACKET_FANOUT_ROLLOVER:
1493
idx = fanout_demux_rollover(f, skb, 0, false, num);
1494
break;
1495
case PACKET_FANOUT_CBPF:
1496
case PACKET_FANOUT_EBPF:
1497
idx = fanout_demux_bpf(f, skb, num);
1498
break;
1499
}
1500
1501
if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1502
idx = fanout_demux_rollover(f, skb, idx, true, num);
1503
1504
po = pkt_sk(rcu_dereference(f->arr[idx]));
1505
return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1506
}
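/*
 * Illustrative userspace sketch (editor's addition): packet_rcv_fanout() is
 * reached once member sockets join a group with the PACKET_FANOUT option,
 * whose argument packs the group id in the low 16 bits and the mode/flags in
 * the high 16 bits (group id 42 is an arbitrary example):
 *
 *	int fanout_arg = 42 | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
 *		   &fanout_arg, sizeof(fanout_arg));
 */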
1507
1508
DEFINE_MUTEX(fanout_mutex);
1509
EXPORT_SYMBOL_GPL(fanout_mutex);
1510
static LIST_HEAD(fanout_list);
1511
static u16 fanout_next_id;
1512
1513
static void __fanout_link(struct sock *sk, struct packet_sock *po)
1514
{
1515
struct packet_fanout *f = po->fanout;
1516
1517
spin_lock(&f->lock);
1518
rcu_assign_pointer(f->arr[f->num_members], sk);
1519
smp_wmb();
1520
f->num_members++;
1521
if (f->num_members == 1)
1522
dev_add_pack(&f->prot_hook);
1523
spin_unlock(&f->lock);
1524
}
1525
1526
static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1527
{
1528
struct packet_fanout *f = po->fanout;
1529
int i;
1530
1531
spin_lock(&f->lock);
1532
for (i = 0; i < f->num_members; i++) {
1533
if (rcu_dereference_protected(f->arr[i],
1534
lockdep_is_held(&f->lock)) == sk)
1535
break;
1536
}
1537
BUG_ON(i >= f->num_members);
1538
rcu_assign_pointer(f->arr[i],
1539
rcu_dereference_protected(f->arr[f->num_members - 1],
1540
lockdep_is_held(&f->lock)));
1541
f->num_members--;
1542
if (f->num_members == 0)
1543
__dev_remove_pack(&f->prot_hook);
1544
spin_unlock(&f->lock);
1545
}
1546
1547
static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1548
{
1549
if (sk->sk_family != PF_PACKET)
1550
return false;
1551
1552
return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1553
}
1554
1555
static void fanout_init_data(struct packet_fanout *f)
1556
{
1557
switch (f->type) {
1558
case PACKET_FANOUT_LB:
1559
atomic_set(&f->rr_cur, 0);
1560
break;
1561
case PACKET_FANOUT_CBPF:
1562
case PACKET_FANOUT_EBPF:
1563
RCU_INIT_POINTER(f->bpf_prog, NULL);
1564
break;
1565
}
1566
}
1567
1568
static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1569
{
1570
struct bpf_prog *old;
1571
1572
spin_lock(&f->lock);
1573
old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1574
rcu_assign_pointer(f->bpf_prog, new);
1575
spin_unlock(&f->lock);
1576
1577
if (old) {
1578
synchronize_net();
1579
bpf_prog_destroy(old);
1580
}
1581
}
1582
1583
static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
1584
unsigned int len)
1585
{
1586
struct bpf_prog *new;
1587
struct sock_fprog fprog;
1588
int ret;
1589
1590
if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1591
return -EPERM;
1592
1593
ret = copy_bpf_fprog_from_user(&fprog, data, len);
1594
if (ret)
1595
return ret;
1596
1597
ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1598
if (ret)
1599
return ret;
1600
1601
__fanout_set_data_bpf(po->fanout, new);
1602
return 0;
1603
}
1604
1605
static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
1606
unsigned int len)
1607
{
1608
struct bpf_prog *new;
1609
u32 fd;
1610
1611
if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1612
return -EPERM;
1613
if (len != sizeof(fd))
1614
return -EINVAL;
1615
if (copy_from_sockptr(&fd, data, len))
1616
return -EFAULT;
1617
1618
new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
1619
if (IS_ERR(new))
1620
return PTR_ERR(new);
1621
1622
__fanout_set_data_bpf(po->fanout, new);
1623
return 0;
1624
}
1625
1626
static int fanout_set_data(struct packet_sock *po, sockptr_t data,
1627
unsigned int len)
1628
{
1629
switch (po->fanout->type) {
1630
case PACKET_FANOUT_CBPF:
1631
return fanout_set_data_cbpf(po, data, len);
1632
case PACKET_FANOUT_EBPF:
1633
return fanout_set_data_ebpf(po, data, len);
1634
default:
1635
return -EINVAL;
1636
}
1637
}
1638
1639
static void fanout_release_data(struct packet_fanout *f)
1640
{
1641
switch (f->type) {
1642
case PACKET_FANOUT_CBPF:
1643
case PACKET_FANOUT_EBPF:
1644
__fanout_set_data_bpf(f, NULL);
1645
}
1646
}
1647
1648
static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1649
{
1650
struct packet_fanout *f;
1651
1652
list_for_each_entry(f, &fanout_list, list) {
1653
if (f->id == candidate_id &&
1654
read_pnet(&f->net) == sock_net(sk)) {
1655
return false;
1656
}
1657
}
1658
return true;
1659
}
1660
1661
static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1662
{
1663
u16 id = fanout_next_id;
1664
1665
do {
1666
if (__fanout_id_is_free(sk, id)) {
1667
*new_id = id;
1668
fanout_next_id = id + 1;
1669
return true;
1670
}
1671
1672
id++;
1673
} while (id != fanout_next_id);
1674
1675
return false;
1676
}
1677
1678
static int fanout_add(struct sock *sk, struct fanout_args *args)
1679
{
1680
struct packet_rollover *rollover = NULL;
1681
struct packet_sock *po = pkt_sk(sk);
1682
u16 type_flags = args->type_flags;
1683
struct packet_fanout *f, *match;
1684
u8 type = type_flags & 0xff;
1685
u8 flags = type_flags >> 8;
1686
u16 id = args->id;
1687
int err;
1688
1689
switch (type) {
1690
case PACKET_FANOUT_ROLLOVER:
1691
if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1692
return -EINVAL;
1693
break;
1694
case PACKET_FANOUT_HASH:
1695
case PACKET_FANOUT_LB:
1696
case PACKET_FANOUT_CPU:
1697
case PACKET_FANOUT_RND:
1698
case PACKET_FANOUT_QM:
1699
case PACKET_FANOUT_CBPF:
1700
case PACKET_FANOUT_EBPF:
1701
break;
1702
default:
1703
return -EINVAL;
1704
}
1705
1706
mutex_lock(&fanout_mutex);
1707
1708
err = -EALREADY;
1709
if (po->fanout)
1710
goto out;
1711
1712
if (type == PACKET_FANOUT_ROLLOVER ||
1713
(type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1714
err = -ENOMEM;
1715
rollover = kzalloc_obj(*rollover);
1716
if (!rollover)
1717
goto out;
1718
atomic_long_set(&rollover->num, 0);
1719
atomic_long_set(&rollover->num_huge, 0);
1720
atomic_long_set(&rollover->num_failed, 0);
1721
}
1722
1723
if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1724
if (id != 0) {
1725
err = -EINVAL;
1726
goto out;
1727
}
1728
if (!fanout_find_new_id(sk, &id)) {
1729
err = -ENOMEM;
1730
goto out;
1731
}
1732
/* ephemeral flag for the first socket in the group: drop it */
1733
flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1734
}
1735
1736
match = NULL;
1737
list_for_each_entry(f, &fanout_list, list) {
1738
if (f->id == id &&
1739
read_pnet(&f->net) == sock_net(sk)) {
1740
match = f;
1741
break;
1742
}
1743
}
1744
err = -EINVAL;
1745
if (match) {
1746
if (match->flags != flags)
1747
goto out;
1748
if (args->max_num_members &&
1749
args->max_num_members != match->max_num_members)
1750
goto out;
1751
} else {
1752
if (args->max_num_members > PACKET_FANOUT_MAX)
1753
goto out;
1754
if (!args->max_num_members)
1755
/* legacy PACKET_FANOUT_MAX */
1756
args->max_num_members = 256;
1757
err = -ENOMEM;
1758
match = kvzalloc_flex(*match, arr, args->max_num_members);
1759
if (!match)
1760
goto out;
1761
write_pnet(&match->net, sock_net(sk));
1762
match->id = id;
1763
match->type = type;
1764
match->flags = flags;
1765
INIT_LIST_HEAD(&match->list);
1766
spin_lock_init(&match->lock);
1767
refcount_set(&match->sk_ref, 0);
1768
fanout_init_data(match);
1769
match->prot_hook.type = po->prot_hook.type;
1770
match->prot_hook.dev = po->prot_hook.dev;
1771
match->prot_hook.func = packet_rcv_fanout;
1772
match->prot_hook.af_packet_priv = match;
1773
match->prot_hook.af_packet_net = read_pnet(&match->net);
1774
match->prot_hook.id_match = match_fanout_group;
1775
match->max_num_members = args->max_num_members;
1776
match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING;
1777
list_add(&match->list, &fanout_list);
1778
}
1779
err = -EINVAL;
1780
1781
spin_lock(&po->bind_lock);
1782
if (po->num &&
1783
match->type == type &&
1784
match->prot_hook.type == po->prot_hook.type &&
1785
match->prot_hook.dev == po->prot_hook.dev) {
1786
err = -ENOSPC;
1787
if (refcount_read(&match->sk_ref) < match->max_num_members) {
1788
/* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
1789
WRITE_ONCE(po->fanout, match);
1790
1791
po->rollover = rollover;
1792
rollover = NULL;
1793
refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
1794
if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
1795
__dev_remove_pack(&po->prot_hook);
1796
__fanout_link(sk, po);
1797
}
1798
err = 0;
1799
}
1800
}
1801
spin_unlock(&po->bind_lock);
1802
1803
if (err && !refcount_read(&match->sk_ref)) {
1804
list_del(&match->list);
1805
kvfree(match);
1806
}
1807
1808
out:
1809
kfree(rollover);
1810
mutex_unlock(&fanout_mutex);
1811
return err;
1812
}
1813
1814
/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1815
* pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1816
* It is the responsibility of the caller to call fanout_release_data() and
1817
* free the returned packet_fanout (after synchronize_net())
1818
*/
1819
static struct packet_fanout *fanout_release(struct sock *sk)
1820
{
1821
struct packet_sock *po = pkt_sk(sk);
1822
struct packet_fanout *f;
1823
1824
mutex_lock(&fanout_mutex);
1825
f = po->fanout;
1826
if (f) {
1827
po->fanout = NULL;
1828
1829
if (refcount_dec_and_test(&f->sk_ref))
1830
list_del(&f->list);
1831
else
1832
f = NULL;
1833
}
1834
mutex_unlock(&fanout_mutex);
1835
1836
return f;
1837
}
1838
1839
static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1840
struct sk_buff *skb)
1841
{
1842
/* Earlier code assumed this would be a VLAN pkt, double-check
1843
* this now that we have the actual packet in hand. We can only
1844
* do this check on Ethernet devices.
1845
*/
1846
if (unlikely(dev->type != ARPHRD_ETHER))
1847
return false;
1848
1849
skb_reset_mac_header(skb);
1850
return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1851
}
1852
1853
static const struct proto_ops packet_ops;
1854
1855
static const struct proto_ops packet_ops_spkt;
1856
1857
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1858
struct packet_type *pt, struct net_device *orig_dev)
1859
{
1860
struct sock *sk;
1861
struct sockaddr_pkt *spkt;
1862
1863
/*
1864
* When we registered the protocol we saved the socket in the data
1865
* field for just this event.
1866
*/
1867
1868
sk = pt->af_packet_priv;
1869
1870
/*
1871
* Yank back the headers [hope the device set this
1872
* right or kerboom...]
1873
*
1874
* Incoming packets have ll header pulled,
1875
* push it back.
1876
*
1877
* For outgoing ones skb->data == skb_mac_header(skb)
1878
* so that this procedure is noop.
1879
*/
1880
1881
if (skb->pkt_type == PACKET_LOOPBACK)
1882
goto out;
1883
1884
if (!net_eq(dev_net(dev), sock_net(sk)))
1885
goto out;
1886
1887
skb = skb_share_check(skb, GFP_ATOMIC);
1888
if (skb == NULL)
1889
goto oom;
1890
1891
/* drop any routing info */
1892
skb_dst_drop(skb);
1893
1894
/* drop conntrack reference */
1895
nf_reset_ct(skb);
1896
1897
spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1898
1899
skb_push(skb, skb->data - skb_mac_header(skb));
1900
1901
/*
1902
* The SOCK_PACKET socket receives _all_ frames.
1903
*/
1904
1905
spkt->spkt_family = dev->type;
1906
strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1907
spkt->spkt_protocol = skb->protocol;
1908
1909
/*
1910
* Charge the memory to the socket. This is done specifically
1911
* to prevent sockets using all the memory up.
1912
*/
1913
1914
if (sock_queue_rcv_skb(sk, skb) == 0)
1915
return 0;
1916
1917
out:
1918
kfree_skb(skb);
1919
oom:
1920
return 0;
1921
}
1922
1923
static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1924
{
1925
int depth;
1926
1927
if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1928
sock->type == SOCK_RAW) {
1929
skb_reset_mac_header(skb);
1930
skb->protocol = dev_parse_header_protocol(skb);
1931
}
1932
1933
/* Move network header to the right position for VLAN tagged packets */
1934
if (likely(skb->dev->type == ARPHRD_ETHER) &&
1935
eth_type_vlan(skb->protocol) &&
1936
vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
1937
skb_set_network_header(skb, depth);
1938
1939
skb_probe_transport_header(skb);
1940
}
1941
1942
/*
1943
* Output a raw packet to a device layer. This bypasses all the other
1944
* protocol layers and you must therefore supply it with a complete frame
1945
*/
1946
1947
static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1948
size_t len)
1949
{
1950
struct sock *sk = sock->sk;
1951
DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1952
struct sk_buff *skb = NULL;
1953
struct net_device *dev;
1954
struct sockcm_cookie sockc;
1955
__be16 proto = 0;
1956
int err;
1957
int extra_len = 0;
1958
1959
/*
1960
* Get and verify the address.
1961
*/
1962
1963
if (saddr) {
1964
if (msg->msg_namelen < sizeof(struct sockaddr))
1965
return -EINVAL;
1966
if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1967
proto = saddr->spkt_protocol;
1968
} else
1969
return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1970
1971
/*
1972
* Find the device first so we can size-check against it
1973
*/
1974
1975
saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1976
retry:
1977
rcu_read_lock();
1978
dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1979
err = -ENODEV;
1980
if (dev == NULL)
1981
goto out_unlock;
1982
1983
err = -ENETDOWN;
1984
if (!(dev->flags & IFF_UP))
1985
goto out_unlock;
1986
1987
/*
1988
* You may not queue a frame bigger than the mtu. This is the lowest level
1989
* raw protocol and you must do your own fragmentation at this level.
1990
*/
1991
1992
if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1993
if (!netif_supports_nofcs(dev)) {
1994
err = -EPROTONOSUPPORT;
1995
goto out_unlock;
1996
}
1997
extra_len = 4; /* We're doing our own CRC */
1998
}
1999
2000
err = -EMSGSIZE;
2001
if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
2002
goto out_unlock;
2003
2004
if (!skb) {
2005
size_t reserved = LL_RESERVED_SPACE(dev);
2006
int tlen = dev->needed_tailroom;
2007
unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
2008
2009
rcu_read_unlock();
2010
skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
2011
if (skb == NULL)
2012
return -ENOBUFS;
2013
/* FIXME: Save some space for broken drivers that write a hard
2014
* header at transmission time by themselves. PPP is the notable
2015
* one here. This should really be fixed at the driver level.
2016
*/
2017
skb_reserve(skb, reserved);
2018
skb_reset_network_header(skb);
2019
2020
/* Try to align data part correctly */
2021
if (hhlen) {
2022
skb->data -= hhlen;
2023
skb->tail -= hhlen;
2024
if (len < hhlen)
2025
skb_reset_network_header(skb);
2026
}
2027
err = memcpy_from_msg(skb_put(skb, len), msg, len);
2028
if (err)
2029
goto out_free;
2030
goto retry;
2031
}
2032
2033
if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
2034
err = -EINVAL;
2035
goto out_unlock;
2036
}
2037
if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
2038
!packet_extra_vlan_len_allowed(dev, skb)) {
2039
err = -EMSGSIZE;
2040
goto out_unlock;
2041
}
2042
2043
sockcm_init(&sockc, sk);
2044
if (msg->msg_controllen) {
2045
err = sock_cmsg_send(sk, msg, &sockc);
2046
if (unlikely(err))
2047
goto out_unlock;
2048
}
2049
2050
skb->protocol = proto;
2051
skb->dev = dev;
2052
skb->priority = sockc.priority;
2053
skb->mark = sockc.mark;
2054
skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid);
2055
skb_setup_tx_timestamp(skb, &sockc);
2056
2057
if (unlikely(extra_len == 4))
2058
skb->no_fcs = 1;
2059
2060
packet_parse_headers(skb, sock);
2061
2062
dev_queue_xmit(skb);
2063
rcu_read_unlock();
2064
return len;
2065
2066
out_unlock:
2067
rcu_read_unlock();
2068
out_free:
2069
kfree_skb(skb);
2070
return err;
2071
}
2072
2073
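/* Run the socket's attached BPF filter (if any) over the skb and
 * return how many bytes of the packet to keep; 0 means drop it.
 */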
static unsigned int run_filter(struct sk_buff *skb,
2074
const struct sock *sk,
2075
unsigned int res)
2076
{
2077
struct sk_filter *filter;
2078
2079
rcu_read_lock();
2080
filter = rcu_dereference(sk->sk_filter);
2081
if (filter != NULL)
2082
res = bpf_prog_run_clear_cb(filter->prog, skb);
2083
rcu_read_unlock();
2084
2085
return res;
2086
}
2087
2088
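/* Copy a virtio_net_hdr describing the skb's offload state (GSO,
 * checksum) to the start of the user buffer and shrink *len by the
 * configured header size.
 */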
static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2089
size_t *len, int vnet_hdr_sz)
2090
{
2091
struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
2092
2093
if (*len < vnet_hdr_sz)
2094
return -EINVAL;
2095
*len -= vnet_hdr_sz;
2096
2097
if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
2098
return -EINVAL;
2099
2100
return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
2101
}
2102
2103
/*
2104
* This function performs lazy skb cloning in the hope that most packets
2105
* are discarded by BPF.
2106
*
2107
* Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
2108
* and skb->cb are mangled. It works because (and for as long as) packets
2109
* arriving here are owned by the current CPU. Output packets are cloned
2110
* by dev_queue_xmit_nit(), and input packets are processed by net_bh
2111
* sequentially, so if we return the skb to its original state on exit,
2112
* we will not harm anyone.
2113
*/
2114
2115
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2116
struct packet_type *pt, struct net_device *orig_dev)
2117
{
2118
enum skb_drop_reason drop_reason = SKB_CONSUMED;
2119
struct sock *sk = NULL;
2120
struct sockaddr_ll *sll;
2121
struct packet_sock *po;
2122
u8 *skb_head = skb->data;
2123
int skb_len = skb->len;
2124
unsigned int snaplen, res;
2125
2126
if (skb->pkt_type == PACKET_LOOPBACK)
2127
goto drop;
2128
2129
sk = pt->af_packet_priv;
2130
po = pkt_sk(sk);
2131
2132
if (!net_eq(dev_net(dev), sock_net(sk)))
2133
goto drop;
2134
2135
skb->dev = dev;
2136
2137
if (dev_has_header(dev)) {
2138
/* The device has an explicit notion of ll header,
2139
* exported to higher levels.
2140
*
2141
* Otherwise, the device hides details of its frame
2142
* structure, so that the corresponding packet header is
2143
* never delivered to the user.
2144
*/
2145
if (sk->sk_type != SOCK_DGRAM)
2146
skb_push(skb, skb->data - skb_mac_header(skb));
2147
else if (skb->pkt_type == PACKET_OUTGOING) {
2148
/* Special case: outgoing packets have ll header at head */
2149
skb_pull(skb, skb_network_offset(skb));
2150
}
2151
}
2152
2153
snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);
2154
2155
res = run_filter(skb, sk, snaplen);
2156
if (!res)
2157
goto drop_n_restore;
2158
if (snaplen > res)
2159
snaplen = res;
2160
2161
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2162
goto drop_n_acct;
2163
2164
if (skb_shared(skb)) {
2165
struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2166
if (nskb == NULL)
2167
goto drop_n_acct;
2168
2169
if (skb_head != skb->data) {
2170
skb->data = skb_head;
2171
skb->len = skb_len;
2172
}
2173
consume_skb(skb);
2174
skb = nskb;
2175
}
2176
2177
sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2178
2179
sll = &PACKET_SKB_CB(skb)->sa.ll;
2180
sll->sll_hatype = dev->type;
2181
sll->sll_pkttype = skb->pkt_type;
2182
if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
2183
sll->sll_ifindex = orig_dev->ifindex;
2184
else
2185
sll->sll_ifindex = dev->ifindex;
2186
2187
sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2188
2189
/* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2190
* Use their space for storing the original skb length.
2191
*/
2192
PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2193
2194
if (pskb_trim(skb, snaplen))
2195
goto drop_n_acct;
2196
2197
skb_set_owner_r(skb, sk);
2198
skb->dev = NULL;
2199
skb_dst_drop(skb);
2200
2201
/* drop conntrack reference */
2202
nf_reset_ct(skb);
2203
2204
spin_lock(&sk->sk_receive_queue.lock);
2205
po->stats.stats1.tp_packets++;
2206
sock_skb_set_dropcount(sk, skb);
2207
skb_clear_delivery_time(skb);
2208
__skb_queue_tail(&sk->sk_receive_queue, skb);
2209
spin_unlock(&sk->sk_receive_queue.lock);
2210
sk->sk_data_ready(sk);
2211
return 0;
2212
2213
drop_n_acct:
2214
atomic_inc(&po->tp_drops);
2215
sk_drops_inc(sk);
2216
drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;
2217
2218
drop_n_restore:
2219
if (skb_head != skb->data && skb_shared(skb)) {
2220
skb->data = skb_head;
2221
skb->len = skb_len;
2222
}
2223
drop:
2224
sk_skb_reason_drop(sk, skb, drop_reason);
2225
return 0;
2226
}
2227
2228
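/* Receive path for mmap'ed (PACKET_RX_RING) sockets: copy the frame
 * into the next free ring slot, fill in the tpacket_v1/v2/v3 header
 * and link-layer address, and hand the slot to userspace via
 * TP_STATUS_USER.
 */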
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2229
struct packet_type *pt, struct net_device *orig_dev)
2230
{
2231
enum skb_drop_reason drop_reason = SKB_CONSUMED;
2232
struct sock *sk = NULL;
2233
struct packet_sock *po;
2234
struct sockaddr_ll *sll;
2235
union tpacket_uhdr h;
2236
u8 *skb_head = skb->data;
2237
int skb_len = skb->len;
2238
unsigned int snaplen, res;
2239
unsigned long status = TP_STATUS_USER;
2240
unsigned short macoff, hdrlen;
2241
unsigned int netoff;
2242
struct sk_buff *copy_skb = NULL;
2243
struct timespec64 ts;
2244
__u32 ts_status;
2245
unsigned int slot_id = 0;
2246
int vnet_hdr_sz = 0;
2247
2248
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2249
* We may add members to them up to the current aligned size without forcing
2250
* userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2251
*/
2252
BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2253
BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2254
2255
if (skb->pkt_type == PACKET_LOOPBACK)
2256
goto drop;
2257
2258
sk = pt->af_packet_priv;
2259
po = pkt_sk(sk);
2260
2261
if (!net_eq(dev_net(dev), sock_net(sk)))
2262
goto drop;
2263
2264
if (dev_has_header(dev)) {
2265
if (sk->sk_type != SOCK_DGRAM)
2266
skb_push(skb, skb->data - skb_mac_header(skb));
2267
else if (skb->pkt_type == PACKET_OUTGOING) {
2268
/* Special case: outgoing packets have ll header at head */
2269
skb_pull(skb, skb_network_offset(skb));
2270
}
2271
}
2272
2273
snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);
2274
2275
res = run_filter(skb, sk, snaplen);
2276
if (!res)
2277
goto drop_n_restore;
2278
2279
/* If we are flooded, just give up */
2280
if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2281
atomic_inc(&po->tp_drops);
2282
goto drop_n_restore;
2283
}
2284
2285
if (skb->ip_summed == CHECKSUM_PARTIAL)
2286
status |= TP_STATUS_CSUMNOTREADY;
2287
else if (skb->pkt_type != PACKET_OUTGOING &&
2288
skb_csum_unnecessary(skb))
2289
status |= TP_STATUS_CSUM_VALID;
2290
if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
2291
status |= TP_STATUS_GSO_TCP;
2292
2293
if (snaplen > res)
2294
snaplen = res;
2295
2296
if (sk->sk_type == SOCK_DGRAM) {
2297
macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2298
po->tp_reserve;
2299
} else {
2300
unsigned int maclen = skb_network_offset(skb);
2301
netoff = TPACKET_ALIGN(po->tp_hdrlen +
2302
(maclen < 16 ? 16 : maclen)) +
2303
po->tp_reserve;
2304
vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
2305
if (vnet_hdr_sz)
2306
netoff += vnet_hdr_sz;
2307
macoff = netoff - maclen;
2308
}
2309
if (netoff > USHRT_MAX) {
2310
atomic_inc(&po->tp_drops);
2311
goto drop_n_restore;
2312
}
2313
if (po->tp_version <= TPACKET_V2) {
2314
if (macoff + snaplen > po->rx_ring.frame_size) {
2315
if (READ_ONCE(po->copy_thresh) &&
2316
atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2317
if (skb_shared(skb)) {
2318
copy_skb = skb_clone(skb, GFP_ATOMIC);
2319
} else {
2320
copy_skb = skb_get(skb);
2321
skb_head = skb->data;
2322
}
2323
if (copy_skb) {
2324
memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
2325
sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
2326
skb_set_owner_r(copy_skb, sk);
2327
}
2328
}
2329
snaplen = po->rx_ring.frame_size - macoff;
2330
if ((int)snaplen < 0) {
2331
snaplen = 0;
2332
vnet_hdr_sz = 0;
2333
}
2334
}
2335
} else if (unlikely(macoff + snaplen >
2336
GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2337
u32 nval;
2338
2339
nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2340
pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2341
snaplen, nval, macoff);
2342
snaplen = nval;
2343
if (unlikely((int)snaplen < 0)) {
2344
snaplen = 0;
2345
macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2346
vnet_hdr_sz = 0;
2347
}
2348
}
2349
spin_lock(&sk->sk_receive_queue.lock);
2350
h.raw = packet_current_rx_frame(po, skb,
2351
TP_STATUS_KERNEL, (macoff+snaplen));
2352
if (!h.raw)
2353
goto drop_n_account;
2354
2355
if (po->tp_version <= TPACKET_V2) {
2356
slot_id = po->rx_ring.head;
2357
if (test_bit(slot_id, po->rx_ring.rx_owner_map))
2358
goto drop_n_account;
2359
__set_bit(slot_id, po->rx_ring.rx_owner_map);
2360
}
2361
2362
if (vnet_hdr_sz &&
2363
virtio_net_hdr_from_skb(skb, h.raw + macoff -
2364
sizeof(struct virtio_net_hdr),
2365
vio_le(), true, 0)) {
2366
if (po->tp_version == TPACKET_V3)
2367
prb_clear_blk_fill_status(&po->rx_ring);
2368
goto drop_n_account;
2369
}
2370
2371
if (po->tp_version <= TPACKET_V2) {
2372
packet_increment_rx_head(po, &po->rx_ring);
2373
/*
2374
* LOSING will be reported until you read the stats,
2375
* because it's COR - Clear On Read.
2376
* Anyway, this is done for V1/V2 only, as V3 doesn't need it
2377
* at the packet level.
2378
*/
2379
if (atomic_read(&po->tp_drops))
2380
status |= TP_STATUS_LOSING;
2381
}
2382
2383
po->stats.stats1.tp_packets++;
2384
if (copy_skb) {
2385
status |= TP_STATUS_COPY;
2386
skb_clear_delivery_time(copy_skb);
2387
__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2388
}
2389
spin_unlock(&sk->sk_receive_queue.lock);
2390
2391
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2392
2393
/* Always timestamp; prefer an existing software timestamp taken
2394
* closer to the time of capture.
2395
*/
2396
ts_status = tpacket_get_timestamp(skb, &ts,
2397
READ_ONCE(po->tp_tstamp) |
2398
SOF_TIMESTAMPING_SOFTWARE);
2399
if (!ts_status)
2400
ktime_get_real_ts64(&ts);
2401
2402
status |= ts_status;
2403
2404
switch (po->tp_version) {
2405
case TPACKET_V1:
2406
h.h1->tp_len = skb->len;
2407
h.h1->tp_snaplen = snaplen;
2408
h.h1->tp_mac = macoff;
2409
h.h1->tp_net = netoff;
2410
h.h1->tp_sec = ts.tv_sec;
2411
h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2412
hdrlen = sizeof(*h.h1);
2413
break;
2414
case TPACKET_V2:
2415
h.h2->tp_len = skb->len;
2416
h.h2->tp_snaplen = snaplen;
2417
h.h2->tp_mac = macoff;
2418
h.h2->tp_net = netoff;
2419
h.h2->tp_sec = ts.tv_sec;
2420
h.h2->tp_nsec = ts.tv_nsec;
2421
if (skb_vlan_tag_present(skb)) {
2422
h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2423
h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2424
status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2425
} else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
2426
h.h2->tp_vlan_tci = vlan_get_tci(skb, skb->dev);
2427
h.h2->tp_vlan_tpid = ntohs(skb->protocol);
2428
status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2429
} else {
2430
h.h2->tp_vlan_tci = 0;
2431
h.h2->tp_vlan_tpid = 0;
2432
}
2433
memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2434
hdrlen = sizeof(*h.h2);
2435
break;
2436
case TPACKET_V3:
2437
/* tp_next_offset and vlan are already populated above,
2438
* so DON'T clear those fields here.
2439
*/
2440
h.h3->tp_status |= status;
2441
h.h3->tp_len = skb->len;
2442
h.h3->tp_snaplen = snaplen;
2443
h.h3->tp_mac = macoff;
2444
h.h3->tp_net = netoff;
2445
h.h3->tp_sec = ts.tv_sec;
2446
h.h3->tp_nsec = ts.tv_nsec;
2447
memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2448
hdrlen = sizeof(*h.h3);
2449
break;
2450
default:
2451
BUG();
2452
}
2453
2454
sll = h.raw + TPACKET_ALIGN(hdrlen);
2455
sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2456
sll->sll_family = AF_PACKET;
2457
sll->sll_hatype = dev->type;
2458
sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ?
2459
vlan_get_protocol_dgram(skb) : skb->protocol;
2460
sll->sll_pkttype = skb->pkt_type;
2461
if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
2462
sll->sll_ifindex = orig_dev->ifindex;
2463
else
2464
sll->sll_ifindex = dev->ifindex;
2465
2466
smp_mb();
2467
2468
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2469
if (po->tp_version <= TPACKET_V2) {
2470
u8 *start, *end;
2471
2472
end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2473
macoff + snaplen);
2474
2475
for (start = h.raw; start < end; start += PAGE_SIZE)
2476
flush_dcache_page(pgv_to_page(start));
2477
}
2478
smp_wmb();
2479
#endif
2480
2481
if (po->tp_version <= TPACKET_V2) {
2482
spin_lock(&sk->sk_receive_queue.lock);
2483
__packet_set_status(po, h.raw, status);
2484
__clear_bit(slot_id, po->rx_ring.rx_owner_map);
2485
spin_unlock(&sk->sk_receive_queue.lock);
2486
sk->sk_data_ready(sk);
2487
} else if (po->tp_version == TPACKET_V3) {
2488
prb_clear_blk_fill_status(&po->rx_ring);
2489
}
2490
2491
drop_n_restore:
2492
if (skb_head != skb->data && skb_shared(skb)) {
2493
skb->data = skb_head;
2494
skb->len = skb_len;
2495
}
2496
drop:
2497
sk_skb_reason_drop(sk, skb, drop_reason);
2498
return 0;
2499
2500
drop_n_account:
2501
spin_unlock(&sk->sk_receive_queue.lock);
2502
atomic_inc(&po->tp_drops);
2503
drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;
2504
2505
sk->sk_data_ready(sk);
2506
sk_skb_reason_drop(sk, copy_skb, drop_reason);
2507
goto drop_n_restore;
2508
}
2509
2510
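/* skb destructor for TX ring frames: return the slot to userspace
 * (TP_STATUS_AVAILABLE plus any timestamp status) and wake a sender
 * waiting in tpacket_snd().
 */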
static void tpacket_destruct_skb(struct sk_buff *skb)
2511
{
2512
struct packet_sock *po = pkt_sk(skb->sk);
2513
2514
if (likely(po->tx_ring.pg_vec)) {
2515
void *ph;
2516
__u32 ts;
2517
2518
ph = skb_zcopy_get_nouarg(skb);
2519
packet_dec_pending(&po->tx_ring);
2520
2521
ts = __packet_set_timestamp(po, ph, skb);
2522
__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2523
2524
complete(&po->skb_completion);
2525
}
2526
2527
sock_wfree(skb);
2528
}
2529
2530
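/* Sanity-check a user-supplied virtio_net_hdr on transmit: grow
 * hdr_len to cover the checksum fields if needed and reject headers
 * that claim more bytes than the packet contains.
 */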
static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2531
{
2532
if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2533
(__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2534
__virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2535
__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2536
vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2537
__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2538
__virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2539
2540
if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2541
return -EINVAL;
2542
2543
return 0;
2544
}
2545
2546
static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2547
struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
2548
{
2549
int ret;
2550
2551
if (*len < vnet_hdr_sz)
2552
return -EINVAL;
2553
*len -= vnet_hdr_sz;
2554
2555
if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
2556
return -EFAULT;
2557
2558
ret = __packet_snd_vnet_parse(vnet_hdr, *len);
2559
if (ret)
2560
return ret;
2561
2562
/* move iter to point to the start of mac header */
2563
if (vnet_hdr_sz != sizeof(struct virtio_net_hdr))
2564
iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr));
2565
2566
return 0;
2567
}
2568
2569
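/* Build an skb for one TX ring frame: copy the link-layer header into
 * the linear area, attach the remaining ring pages as page fragments
 * (avoiding a data copy), and set the usual metadata from sockc.
 */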
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2570
void *frame, struct net_device *dev, void *data, int tp_len,
2571
__be16 proto, unsigned char *addr, int hlen, int copylen,
2572
const struct sockcm_cookie *sockc)
2573
{
2574
union tpacket_uhdr ph;
2575
int to_write, offset, len, nr_frags, len_max;
2576
struct socket *sock = po->sk.sk_socket;
2577
struct page *page;
2578
int err;
2579
2580
ph.raw = frame;
2581
2582
skb->protocol = proto;
2583
skb->dev = dev;
2584
skb->priority = sockc->priority;
2585
skb->mark = sockc->mark;
2586
skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid);
2587
skb_setup_tx_timestamp(skb, sockc);
2588
skb_zcopy_set_nouarg(skb, ph.raw);
2589
2590
skb_reserve(skb, hlen);
2591
skb_reset_network_header(skb);
2592
2593
to_write = tp_len;
2594
2595
if (sock->type == SOCK_DGRAM) {
2596
err = dev_hard_header(skb, dev, ntohs(proto), addr,
2597
NULL, tp_len);
2598
if (unlikely(err < 0))
2599
return -EINVAL;
2600
} else if (copylen) {
2601
int hdrlen = min_t(int, copylen, tp_len);
2602
2603
skb_push(skb, dev->hard_header_len);
2604
skb_put(skb, copylen - dev->hard_header_len);
2605
err = skb_store_bits(skb, 0, data, hdrlen);
2606
if (unlikely(err))
2607
return err;
2608
if (!dev_validate_header(dev, skb->data, hdrlen))
2609
return -EINVAL;
2610
2611
data += hdrlen;
2612
to_write -= hdrlen;
2613
}
2614
2615
offset = offset_in_page(data);
2616
len_max = PAGE_SIZE - offset;
2617
len = ((to_write > len_max) ? len_max : to_write);
2618
2619
skb->data_len = to_write;
2620
skb->len += to_write;
2621
skb->truesize += to_write;
2622
refcount_add(to_write, &po->sk.sk_wmem_alloc);
2623
2624
while (likely(to_write)) {
2625
nr_frags = skb_shinfo(skb)->nr_frags;
2626
2627
if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2628
pr_err("Packet exceed the number of skb frags(%u)\n",
2629
(unsigned int)MAX_SKB_FRAGS);
2630
return -EFAULT;
2631
}
2632
2633
page = pgv_to_page(data);
2634
data += len;
2635
flush_dcache_page(page);
2636
get_page(page);
2637
skb_fill_page_desc(skb, nr_frags, page, offset, len);
2638
to_write -= len;
2639
offset = 0;
2640
len_max = PAGE_SIZE;
2641
len = ((to_write > len_max) ? len_max : to_write);
2642
}
2643
2644
packet_parse_headers(skb, sock);
2645
2646
return tp_len;
2647
}
2648
2649
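/* Read tp_len and the data offset from a TX ring slot header,
 * validate them against the frame size, and return the frame length
 * with *data pointing at the packet data.
 */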
static int tpacket_parse_header(struct packet_sock *po, void *frame,
2650
int size_max, void **data)
2651
{
2652
union tpacket_uhdr ph;
2653
int tp_len, off;
2654
2655
ph.raw = frame;
2656
2657
switch (po->tp_version) {
2658
case TPACKET_V3:
2659
if (ph.h3->tp_next_offset != 0) {
2660
pr_warn_once("variable sized slot not supported");
2661
return -EINVAL;
2662
}
2663
tp_len = ph.h3->tp_len;
2664
break;
2665
case TPACKET_V2:
2666
tp_len = ph.h2->tp_len;
2667
break;
2668
default:
2669
tp_len = ph.h1->tp_len;
2670
break;
2671
}
2672
if (unlikely(tp_len > size_max)) {
2673
pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2674
return -EMSGSIZE;
2675
}
2676
2677
if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) {
2678
int off_min, off_max;
2679
2680
off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2681
off_max = po->tx_ring.frame_size - tp_len;
2682
if (po->sk.sk_type == SOCK_DGRAM) {
2683
switch (po->tp_version) {
2684
case TPACKET_V3:
2685
off = ph.h3->tp_net;
2686
break;
2687
case TPACKET_V2:
2688
off = ph.h2->tp_net;
2689
break;
2690
default:
2691
off = ph.h1->tp_net;
2692
break;
2693
}
2694
} else {
2695
switch (po->tp_version) {
2696
case TPACKET_V3:
2697
off = ph.h3->tp_mac;
2698
break;
2699
case TPACKET_V2:
2700
off = ph.h2->tp_mac;
2701
break;
2702
default:
2703
off = ph.h1->tp_mac;
2704
break;
2705
}
2706
}
2707
if (unlikely((off < off_min) || (off_max < off)))
2708
return -EINVAL;
2709
} else {
2710
off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2711
}
2712
2713
*data = frame + off;
2714
return tp_len;
2715
}
2716
2717
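/* Transmit path for mmap'ed (PACKET_TX_RING) sockets: walk the TX
 * ring, turn each TP_STATUS_SEND_REQUEST slot into an skb and hand it
 * to the device, optionally sleeping until in-flight frames complete.
 */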
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2718
{
2719
struct sk_buff *skb = NULL;
2720
struct net_device *dev;
2721
struct virtio_net_hdr vnet_hdr;
2722
bool has_vnet_hdr = false;
2723
struct sockcm_cookie sockc;
2724
__be16 proto;
2725
int err, reserve = 0;
2726
void *ph;
2727
DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2728
bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2729
int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
2730
unsigned char *addr = NULL;
2731
int tp_len, size_max;
2732
void *data;
2733
int len_sum = 0;
2734
int status = TP_STATUS_AVAILABLE;
2735
int hlen, tlen, copylen = 0;
2736
long timeo;
2737
2738
mutex_lock(&po->pg_vec_lock);
2739
2740
/* The packet_sendmsg() check on tx_ring.pg_vec was lockless,
2741
* so we need to confirm it under the protection of pg_vec_lock.
2742
*/
2743
if (unlikely(!po->tx_ring.pg_vec)) {
2744
err = -EBUSY;
2745
goto out;
2746
}
2747
if (likely(saddr == NULL)) {
2748
dev = packet_cached_dev_get(po);
2749
proto = READ_ONCE(po->num);
2750
} else {
2751
err = -EINVAL;
2752
if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2753
goto out;
2754
if (msg->msg_namelen < (saddr->sll_halen
2755
+ offsetof(struct sockaddr_ll,
2756
sll_addr)))
2757
goto out;
2758
proto = saddr->sll_protocol;
2759
dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2760
if (po->sk.sk_socket->type == SOCK_DGRAM) {
2761
if (dev && msg->msg_namelen < dev->addr_len +
2762
offsetof(struct sockaddr_ll, sll_addr))
2763
goto out_put;
2764
addr = saddr->sll_addr;
2765
}
2766
}
2767
2768
err = -ENXIO;
2769
if (unlikely(dev == NULL))
2770
goto out;
2771
err = -ENETDOWN;
2772
if (unlikely(!(dev->flags & IFF_UP)))
2773
goto out_put;
2774
2775
sockcm_init(&sockc, &po->sk);
2776
if (msg->msg_controllen) {
2777
err = sock_cmsg_send(&po->sk, msg, &sockc);
2778
if (unlikely(err))
2779
goto out_put;
2780
}
2781
2782
if (po->sk.sk_socket->type == SOCK_RAW)
2783
reserve = dev->hard_header_len;
2784
size_max = po->tx_ring.frame_size
2785
- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2786
2787
if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
2788
size_max = dev->mtu + reserve + VLAN_HLEN;
2789
2790
timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2791
reinit_completion(&po->skb_completion);
2792
2793
do {
2794
ph = packet_current_frame(po, &po->tx_ring,
2795
TP_STATUS_SEND_REQUEST);
2796
if (unlikely(ph == NULL)) {
2797
/* Note: packet_read_pending() might be slow if we
2798
* have to call it, as it's a per-cpu variable, but in
2799
* the fast path we don't have to call it: only when ph
2800
* is NULL do we need to check the pending_refcnt.
2801
*/
2802
if (need_wait && packet_read_pending(&po->tx_ring)) {
2803
timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2804
if (timeo <= 0) {
2805
err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2806
goto out_put;
2807
}
2808
/* check for additional frames */
2809
continue;
2810
} else
2811
break;
2812
}
2813
2814
skb = NULL;
2815
tp_len = tpacket_parse_header(po, ph, size_max, &data);
2816
if (tp_len < 0)
2817
goto tpacket_error;
2818
2819
status = TP_STATUS_SEND_REQUEST;
2820
hlen = LL_RESERVED_SPACE(dev);
2821
tlen = dev->needed_tailroom;
2822
if (vnet_hdr_sz) {
2823
data += vnet_hdr_sz;
2824
tp_len -= vnet_hdr_sz;
2825
if (tp_len < 0) {
2826
tp_len = -EINVAL;
2827
goto tpacket_error;
2828
}
2829
memcpy(&vnet_hdr, data - vnet_hdr_sz, sizeof(vnet_hdr));
2830
if (__packet_snd_vnet_parse(&vnet_hdr, tp_len)) {
2831
tp_len = -EINVAL;
2832
goto tpacket_error;
2833
}
2834
copylen = __virtio16_to_cpu(vio_le(),
2835
vnet_hdr.hdr_len);
2836
has_vnet_hdr = true;
2837
}
2838
copylen = max_t(int, copylen, dev->hard_header_len);
2839
skb = sock_alloc_send_skb(&po->sk,
2840
hlen + tlen + sizeof(struct sockaddr_ll) +
2841
(copylen - dev->hard_header_len),
2842
!need_wait, &err);
2843
2844
if (unlikely(skb == NULL)) {
2845
/* we assume the socket was initially writeable ... */
2846
if (likely(len_sum > 0))
2847
err = len_sum;
2848
goto out_status;
2849
}
2850
tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2851
addr, hlen, copylen, &sockc);
2852
if (likely(tp_len >= 0) &&
2853
tp_len > dev->mtu + reserve &&
2854
!vnet_hdr_sz &&
2855
!packet_extra_vlan_len_allowed(dev, skb))
2856
tp_len = -EMSGSIZE;
2857
2858
if (unlikely(tp_len < 0)) {
2859
tpacket_error:
2860
if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) {
2861
__packet_set_status(po, ph,
2862
TP_STATUS_AVAILABLE);
2863
packet_increment_head(&po->tx_ring);
2864
kfree_skb(skb);
2865
continue;
2866
} else {
2867
status = TP_STATUS_WRONG_FORMAT;
2868
err = tp_len;
2869
goto out_status;
2870
}
2871
}
2872
2873
if (has_vnet_hdr) {
2874
if (virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le())) {
2875
tp_len = -EINVAL;
2876
goto tpacket_error;
2877
}
2878
virtio_net_hdr_set_proto(skb, &vnet_hdr);
2879
}
2880
2881
skb->destructor = tpacket_destruct_skb;
2882
__packet_set_status(po, ph, TP_STATUS_SENDING);
2883
packet_inc_pending(&po->tx_ring);
2884
2885
status = TP_STATUS_SEND_REQUEST;
2886
err = packet_xmit(po, skb);
2887
if (unlikely(err != 0)) {
2888
if (err > 0)
2889
err = net_xmit_errno(err);
2890
if (err && __packet_get_status(po, ph) ==
2891
TP_STATUS_AVAILABLE) {
2892
/* skb was destructed already */
2893
skb = NULL;
2894
goto out_status;
2895
}
2896
/*
2897
* skb was dropped but not destructed yet;
2898
* let's treat it like congestion or err < 0
2899
*/
2900
err = 0;
2901
}
2902
packet_increment_head(&po->tx_ring);
2903
len_sum += tp_len;
2904
} while (1);
2905
2906
err = len_sum;
2907
goto out_put;
2908
2909
out_status:
2910
__packet_set_status(po, ph, status);
2911
kfree_skb(skb);
2912
out_put:
2913
dev_put(dev);
2914
out:
2915
mutex_unlock(&po->pg_vec_lock);
2916
return err;
2917
}
2918
2919
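/* Allocate an skb for packet_snd(): small packets get a fully linear
 * skb, larger ones a linear head of 'linear' bytes plus paged data.
 */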
static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2920
size_t reserve, size_t len,
2921
size_t linear, int noblock,
2922
int *err)
2923
{
2924
struct sk_buff *skb;
2925
2926
/* Under a page? Don't bother with paged skb. */
2927
if (prepad + len < PAGE_SIZE || !linear)
2928
linear = len;
2929
2930
if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
2931
linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
2932
skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2933
err, PAGE_ALLOC_COSTLY_ORDER);
2934
if (!skb)
2935
return NULL;
2936
2937
skb_reserve(skb, reserve);
2938
skb_put(skb, linear);
2939
skb->data_len = len - linear;
2940
skb->len += len - linear;
2941
2942
return skb;
2943
}
2944
2945
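/* Ordinary (non-ring) transmit path: build an skb from the message,
 * apply an optional virtio_net_hdr, and pass the frame to the device.
 */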
static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2946
{
2947
struct sock *sk = sock->sk;
2948
DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2949
struct sk_buff *skb;
2950
struct net_device *dev;
2951
__be16 proto;
2952
unsigned char *addr = NULL;
2953
int err, reserve = 0;
2954
struct sockcm_cookie sockc;
2955
struct virtio_net_hdr vnet_hdr = { 0 };
2956
int offset = 0;
2957
struct packet_sock *po = pkt_sk(sk);
2958
int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
2959
int hlen, tlen, linear;
2960
int extra_len = 0;
2961
2962
/*
2963
* Get and verify the address.
2964
*/
2965
2966
if (likely(saddr == NULL)) {
2967
dev = packet_cached_dev_get(po);
2968
proto = READ_ONCE(po->num);
2969
} else {
2970
err = -EINVAL;
2971
if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2972
goto out;
2973
if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2974
goto out;
2975
proto = saddr->sll_protocol;
2976
dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2977
if (sock->type == SOCK_DGRAM) {
2978
if (dev && msg->msg_namelen < dev->addr_len +
2979
offsetof(struct sockaddr_ll, sll_addr))
2980
goto out_unlock;
2981
addr = saddr->sll_addr;
2982
}
2983
}
2984
2985
err = -ENXIO;
2986
if (unlikely(dev == NULL))
2987
goto out_unlock;
2988
err = -ENETDOWN;
2989
if (unlikely(!(dev->flags & IFF_UP)))
2990
goto out_unlock;
2991
2992
sockcm_init(&sockc, sk);
2993
if (msg->msg_controllen) {
2994
err = sock_cmsg_send(sk, msg, &sockc);
2995
if (unlikely(err))
2996
goto out_unlock;
2997
}
2998
2999
if (sock->type == SOCK_RAW)
3000
reserve = dev->hard_header_len;
3001
if (vnet_hdr_sz) {
3002
err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
3003
if (err)
3004
goto out_unlock;
3005
}
3006
3007
if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
3008
if (!netif_supports_nofcs(dev)) {
3009
err = -EPROTONOSUPPORT;
3010
goto out_unlock;
3011
}
3012
extra_len = 4; /* We're doing our own CRC */
3013
}
3014
3015
err = -EMSGSIZE;
3016
if (!vnet_hdr.gso_type &&
3017
(len > dev->mtu + reserve + VLAN_HLEN + extra_len))
3018
goto out_unlock;
3019
3020
err = -ENOBUFS;
3021
hlen = LL_RESERVED_SPACE(dev);
3022
tlen = dev->needed_tailroom;
3023
linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
3024
linear = max(linear, min_t(int, len, dev->hard_header_len));
3025
skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
3026
msg->msg_flags & MSG_DONTWAIT, &err);
3027
if (skb == NULL)
3028
goto out_unlock;
3029
3030
skb_reset_network_header(skb);
3031
3032
err = -EINVAL;
3033
if (sock->type == SOCK_DGRAM) {
3034
offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
3035
if (unlikely(offset < 0))
3036
goto out_free;
3037
} else if (reserve) {
3038
skb_reserve(skb, -reserve);
3039
if (len < reserve + sizeof(struct ipv6hdr) &&
3040
dev->min_header_len != dev->hard_header_len)
3041
skb_reset_network_header(skb);
3042
}
3043
3044
/* Returns -EFAULT on error */
3045
err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
3046
if (err)
3047
goto out_free;
3048
3049
if ((sock->type == SOCK_RAW &&
3050
!dev_validate_header(dev, skb->data, len)) || !skb->len) {
3051
err = -EINVAL;
3052
goto out_free;
3053
}
3054
3055
skb_setup_tx_timestamp(skb, &sockc);
3056
3057
if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3058
!packet_extra_vlan_len_allowed(dev, skb)) {
3059
err = -EMSGSIZE;
3060
goto out_free;
3061
}
3062
3063
skb->protocol = proto;
3064
skb->dev = dev;
3065
skb->priority = sockc.priority;
3066
skb->mark = sockc.mark;
3067
skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid);
3068
3069
if (unlikely(extra_len == 4))
3070
skb->no_fcs = 1;
3071
3072
packet_parse_headers(skb, sock);
3073
3074
if (vnet_hdr_sz) {
3075
err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
3076
if (err)
3077
goto out_free;
3078
len += vnet_hdr_sz;
3079
virtio_net_hdr_set_proto(skb, &vnet_hdr);
3080
}
3081
3082
err = packet_xmit(po, skb);
3083
3084
if (unlikely(err != 0)) {
3085
if (err > 0)
3086
err = net_xmit_errno(err);
3087
if (err)
3088
goto out_unlock;
3089
}
3090
3091
dev_put(dev);
3092
3093
return len;
3094
3095
out_free:
3096
kfree_skb(skb);
3097
out_unlock:
3098
dev_put(dev);
3099
out:
3100
return err;
3101
}
3102
3103
static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
3104
{
3105
struct sock *sk = sock->sk;
3106
struct packet_sock *po = pkt_sk(sk);
3107
3108
/* Reading tx_ring.pg_vec without holding pg_vec_lock is racy.
3109
* tpacket_snd() will redo the check safely.
3110
*/
3111
if (data_race(po->tx_ring.pg_vec))
3112
return tpacket_snd(po, msg);
3113
3114
return packet_snd(sock, msg, len);
3115
}
3116
3117
/*
3118
* Close a PACKET socket. This is fairly simple. We immediately go
3119
* to 'closed' state and remove our protocol entry in the device list.
3120
*/
3121
3122
static int packet_release(struct socket *sock)
3123
{
3124
struct sock *sk = sock->sk;
3125
struct packet_sock *po;
3126
struct packet_fanout *f;
3127
struct net *net;
3128
union tpacket_req_u req_u;
3129
3130
if (!sk)
3131
return 0;
3132
3133
net = sock_net(sk);
3134
po = pkt_sk(sk);
3135
3136
mutex_lock(&net->packet.sklist_lock);
3137
sk_del_node_init_rcu(sk);
3138
mutex_unlock(&net->packet.sklist_lock);
3139
3140
sock_prot_inuse_add(net, sk->sk_prot, -1);
3141
3142
spin_lock(&po->bind_lock);
3143
unregister_prot_hook(sk, false);
3144
WRITE_ONCE(po->num, 0);
3145
packet_cached_dev_reset(po);
3146
3147
if (po->prot_hook.dev) {
3148
netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
3149
po->prot_hook.dev = NULL;
3150
}
3151
spin_unlock(&po->bind_lock);
3152
3153
packet_flush_mclist(sk);
3154
3155
lock_sock(sk);
3156
if (po->rx_ring.pg_vec) {
3157
memset(&req_u, 0, sizeof(req_u));
3158
packet_set_ring(sk, &req_u, 1, 0);
3159
}
3160
3161
if (po->tx_ring.pg_vec) {
3162
memset(&req_u, 0, sizeof(req_u));
3163
packet_set_ring(sk, &req_u, 1, 1);
3164
}
3165
release_sock(sk);
3166
3167
f = fanout_release(sk);
3168
3169
synchronize_net();
3170
3171
kfree(po->rollover);
3172
if (f) {
3173
fanout_release_data(f);
3174
kvfree(f);
3175
}
3176
/*
3177
* Now the socket is dead. No more input will appear.
3178
*/
3179
sock_orphan(sk);
3180
sock->sk = NULL;
3181
3182
/* Purge queues */
3183
3184
skb_queue_purge(&sk->sk_receive_queue);
3185
packet_free_pending(po);
3186
3187
sock_put(sk);
3188
return 0;
3189
}
3190
3191
/*
3192
* Attach a packet hook.
3193
*/
3194
3195
static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3196
__be16 proto)
3197
{
3198
struct packet_sock *po = pkt_sk(sk);
3199
struct net_device *dev = NULL;
3200
bool unlisted = false;
3201
bool need_rehook;
3202
int ret = 0;
3203
3204
lock_sock(sk);
3205
spin_lock(&po->bind_lock);
3206
if (!proto)
3207
proto = po->num;
3208
3209
rcu_read_lock();
3210
3211
if (po->fanout) {
3212
ret = -EINVAL;
3213
goto out_unlock;
3214
}
3215
3216
if (name) {
3217
dev = dev_get_by_name_rcu(sock_net(sk), name);
3218
if (!dev) {
3219
ret = -ENODEV;
3220
goto out_unlock;
3221
}
3222
} else if (ifindex) {
3223
dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3224
if (!dev) {
3225
ret = -ENODEV;
3226
goto out_unlock;
3227
}
3228
}
3229
3230
need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev;
3231
3232
if (need_rehook) {
3233
dev_hold(dev);
3234
if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
3235
rcu_read_unlock();
3236
/* prevents packet_notifier() from calling
3237
* register_prot_hook()
3238
*/
3239
WRITE_ONCE(po->num, 0);
3240
__unregister_prot_hook(sk, true);
3241
rcu_read_lock();
3242
if (dev)
3243
unlisted = !dev_get_by_index_rcu(sock_net(sk),
3244
dev->ifindex);
3245
}
3246
3247
BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING));
3248
WRITE_ONCE(po->num, proto);
3249
po->prot_hook.type = proto;
3250
3251
netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
3252
3253
if (unlikely(unlisted)) {
3254
po->prot_hook.dev = NULL;
3255
WRITE_ONCE(po->ifindex, -1);
3256
packet_cached_dev_reset(po);
3257
} else {
3258
netdev_hold(dev, &po->prot_hook.dev_tracker,
3259
GFP_ATOMIC);
3260
po->prot_hook.dev = dev;
3261
WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
3262
packet_cached_dev_assign(po, dev);
3263
}
3264
dev_put(dev);
3265
}
3266
3267
if (proto == 0 || !need_rehook)
3268
goto out_unlock;
3269
3270
if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3271
register_prot_hook(sk);
3272
} else {
3273
sk->sk_err = ENETDOWN;
3274
if (!sock_flag(sk, SOCK_DEAD))
3275
sk_error_report(sk);
3276
}
3277
3278
out_unlock:
3279
rcu_read_unlock();
3280
spin_unlock(&po->bind_lock);
3281
release_sock(sk);
3282
return ret;
3283
}
3284
3285
/*
3286
* Bind a packet socket to a device
3287
*/
3288
3289
static int packet_bind_spkt(struct socket *sock, struct sockaddr_unsized *uaddr,
3290
int addr_len)
3291
{
3292
struct sock *sk = sock->sk;
3293
struct sockaddr *sa = (struct sockaddr *)uaddr;
3294
char name[sizeof(sa->sa_data) + 1];
3295
3296
/*
3297
* Check legality
3298
*/
3299
3300
if (addr_len != sizeof(struct sockaddr))
3301
return -EINVAL;
3302
/* uaddr->sa_data comes from userspace; it's not guaranteed to be
3303
* zero-terminated.
3304
*/
3305
memcpy(name, sa->sa_data, sizeof(sa->sa_data));
3306
name[sizeof(sa->sa_data)] = 0;
3307
3308
return packet_do_bind(sk, name, 0, 0);
3309
}
3310
3311
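/* Bind to an interface/protocol through a sockaddr_ll. For
 * illustration only, a typical userspace call (the interface index
 * obtained elsewhere, e.g. via if_nametoindex()):
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = ifindex,
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */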
static int packet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
3312
{
3313
struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3314
struct sock *sk = sock->sk;
3315
3316
/*
3317
* Check legality
3318
*/
3319
3320
if (addr_len < sizeof(struct sockaddr_ll))
3321
return -EINVAL;
3322
if (sll->sll_family != AF_PACKET)
3323
return -EINVAL;
3324
3325
return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol);
3326
}
3327
3328
static struct proto packet_proto = {
3329
.name = "PACKET",
3330
.owner = THIS_MODULE,
3331
.obj_size = sizeof(struct packet_sock),
3332
};
3333
3334
/*
3335
* Create a packet socket (SOCK_RAW, SOCK_DGRAM or SOCK_PACKET).
3336
*/
3337
3338
static int packet_create(struct net *net, struct socket *sock, int protocol,
3339
int kern)
3340
{
3341
struct sock *sk;
3342
struct packet_sock *po;
3343
__be16 proto = (__force __be16)protocol; /* weird, but documented */
3344
int err;
3345
3346
if (!ns_capable(net->user_ns, CAP_NET_RAW))
3347
return -EPERM;
3348
if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3349
sock->type != SOCK_PACKET)
3350
return -ESOCKTNOSUPPORT;
3351
3352
sock->state = SS_UNCONNECTED;
3353
3354
err = -ENOBUFS;
3355
sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3356
if (sk == NULL)
3357
goto out;
3358
3359
sock->ops = &packet_ops;
3360
if (sock->type == SOCK_PACKET)
3361
sock->ops = &packet_ops_spkt;
3362
3363
po = pkt_sk(sk);
3364
err = packet_alloc_pending(po);
3365
if (err)
3366
goto out_sk_free;
3367
3368
sock_init_data(sock, sk);
3369
3370
init_completion(&po->skb_completion);
3371
sk->sk_family = PF_PACKET;
3372
po->num = proto;
3373
3374
packet_cached_dev_reset(po);
3375
3376
sk->sk_destruct = packet_sock_destruct;
3377
3378
/*
3379
* Attach a protocol block
3380
*/
3381
3382
spin_lock_init(&po->bind_lock);
3383
mutex_init(&po->pg_vec_lock);
3384
po->rollover = NULL;
3385
po->prot_hook.func = packet_rcv;
3386
3387
if (sock->type == SOCK_PACKET)
3388
po->prot_hook.func = packet_rcv_spkt;
3389
3390
po->prot_hook.af_packet_priv = sk;
3391
po->prot_hook.af_packet_net = sock_net(sk);
3392
3393
if (proto) {
3394
po->prot_hook.type = proto;
3395
__register_prot_hook(sk);
3396
}
3397
3398
mutex_lock(&net->packet.sklist_lock);
3399
sk_add_node_tail_rcu(sk, &net->packet.sklist);
3400
mutex_unlock(&net->packet.sklist_lock);
3401
3402
sock_prot_inuse_add(net, &packet_proto, 1);
3403
3404
return 0;
3405
out_sk_free:
3406
sk_free(sk);
3407
out:
3408
return err;
3409
}
3410
3411
/*
3412
* Pull a packet from our receive queue and hand it to the user.
3413
* If necessary we block.
3414
*/
3415
3416
static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3417
int flags)
3418
{
3419
struct sock *sk = sock->sk;
3420
struct sk_buff *skb;
3421
int copied, err;
3422
int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
3423
unsigned int origlen = 0;
3424
3425
err = -EINVAL;
3426
if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3427
goto out;
3428
3429
#if 0
3430
/* What error should we return now? EUNATTACH? */
3431
if (pkt_sk(sk)->ifindex < 0)
3432
return -ENODEV;
3433
#endif
3434
3435
if (flags & MSG_ERRQUEUE) {
3436
err = sock_recv_errqueue(sk, msg, len,
3437
SOL_PACKET, PACKET_TX_TIMESTAMP);
3438
goto out;
3439
}
3440
3441
/*
3442
* Call the generic datagram receiver. This handles all sorts
3443
* of horrible races and re-entrancy so we can forget about it
3444
* in the protocol layers.
3445
*
3446
* It will return ENETDOWN if the device has just gone down,
3447
* but then it will block.
3448
*/
3449
3450
skb = skb_recv_datagram(sk, flags, &err);
3451
3452
/*
3453
* If an error occurred, return it. Because skb_recv_datagram()
3454
* handles the blocking, we don't need to see or worry about
3455
* blocking retries.
3456
*/
3457
3458
if (skb == NULL)
3459
goto out;
3460
3461
packet_rcv_try_clear_pressure(pkt_sk(sk));
3462
3463
if (vnet_hdr_len) {
3464
err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
3465
if (err)
3466
goto out_free;
3467
}
3468
3469
/* Any data beyond the buffer you gave is lost. If that worries
3470
* a user program, it can ask the device for its MTU
3471
* anyway.
3472
*/
3473
copied = skb->len;
3474
if (copied > len) {
3475
copied = len;
3476
msg->msg_flags |= MSG_TRUNC;
3477
}
3478
3479
err = skb_copy_datagram_msg(skb, 0, msg, copied);
3480
if (err)
3481
goto out_free;
3482
3483
if (sock->type != SOCK_PACKET) {
3484
struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3485
3486
/* Original length was stored in sockaddr_ll fields */
3487
origlen = PACKET_SKB_CB(skb)->sa.origlen;
3488
sll->sll_family = AF_PACKET;
3489
sll->sll_protocol = (sock->type == SOCK_DGRAM) ?
3490
vlan_get_protocol_dgram(skb) : skb->protocol;
3491
}
3492
3493
sock_recv_cmsgs(msg, sk, skb);
3494
3495
if (msg->msg_name) {
3496
const size_t max_len = min(sizeof(skb->cb),
3497
sizeof(struct sockaddr_storage));
3498
int copy_len;
3499
3500
/* If the address length field is there to be filled
3501
* in, we fill it in now.
3502
*/
3503
if (sock->type == SOCK_PACKET) {
3504
__sockaddr_check_size(sizeof(struct sockaddr_pkt));
3505
msg->msg_namelen = sizeof(struct sockaddr_pkt);
3506
copy_len = msg->msg_namelen;
3507
} else {
3508
struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3509
3510
msg->msg_namelen = sll->sll_halen +
3511
offsetof(struct sockaddr_ll, sll_addr);
3512
copy_len = msg->msg_namelen;
3513
if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3514
memset(msg->msg_name +
3515
offsetof(struct sockaddr_ll, sll_addr),
3516
0, sizeof(sll->sll_addr));
3517
msg->msg_namelen = sizeof(struct sockaddr_ll);
3518
}
3519
}
3520
if (WARN_ON_ONCE(copy_len > max_len)) {
3521
copy_len = max_len;
3522
msg->msg_namelen = copy_len;
3523
}
3524
memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
3525
}
3526
3527
if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
3528
struct tpacket_auxdata aux;
3529
3530
aux.tp_status = TP_STATUS_USER;
3531
if (skb->ip_summed == CHECKSUM_PARTIAL)
3532
aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3533
else if (skb->pkt_type != PACKET_OUTGOING &&
3534
skb_csum_unnecessary(skb))
3535
aux.tp_status |= TP_STATUS_CSUM_VALID;
3536
if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
3537
aux.tp_status |= TP_STATUS_GSO_TCP;
3538
3539
aux.tp_len = origlen;
3540
aux.tp_snaplen = skb->len;
3541
aux.tp_mac = 0;
3542
aux.tp_net = skb_network_offset(skb);
3543
if (skb_vlan_tag_present(skb)) {
3544
aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3545
aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3546
aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3547
} else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
3548
struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3549
struct net_device *dev;
3550
3551
rcu_read_lock();
3552
dev = dev_get_by_index_rcu(sock_net(sk), sll->sll_ifindex);
3553
if (dev) {
3554
aux.tp_vlan_tci = vlan_get_tci(skb, dev);
3555
aux.tp_vlan_tpid = ntohs(skb->protocol);
3556
aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3557
} else {
3558
aux.tp_vlan_tci = 0;
3559
aux.tp_vlan_tpid = 0;
3560
}
3561
rcu_read_unlock();
3562
} else {
3563
aux.tp_vlan_tci = 0;
3564
aux.tp_vlan_tpid = 0;
3565
}
3566
put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3567
}
3568
3569
/*
3570
* Free or return the buffer as appropriate. Again this
3571
* hides all the races and re-entrancy issues from us.
3572
*/
3573
err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3574
3575
out_free:
3576
skb_free_datagram(sk, skb);
3577
out:
3578
return err;
3579
}
3580
3581
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3582
int peer)
3583
{
3584
struct net_device *dev;
3585
struct sock *sk = sock->sk;
3586
3587
if (peer)
3588
return -EOPNOTSUPP;
3589
3590
uaddr->sa_family = AF_PACKET;
3591
memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3592
rcu_read_lock();
3593
dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
3594
if (dev)
3595
strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3596
rcu_read_unlock();
3597
3598
return sizeof(*uaddr);
3599
}
3600
3601
static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3602
int peer)
3603
{
3604
struct net_device *dev;
3605
struct sock *sk = sock->sk;
3606
struct packet_sock *po = pkt_sk(sk);
3607
DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3608
int ifindex;
3609
3610
if (peer)
3611
return -EOPNOTSUPP;
3612
3613
ifindex = READ_ONCE(po->ifindex);
3614
sll->sll_family = AF_PACKET;
3615
sll->sll_ifindex = ifindex;
3616
sll->sll_protocol = READ_ONCE(po->num);
3617
sll->sll_pkttype = 0;
3618
rcu_read_lock();
3619
dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3620
if (dev) {
3621
sll->sll_hatype = dev->type;
3622
sll->sll_halen = dev->addr_len;
3623
3624
/* Let __fortify_memcpy_chk() know the actual buffer size. */
3625
memcpy(((struct sockaddr_storage *)sll)->__data +
3626
offsetof(struct sockaddr_ll, sll_addr) -
3627
offsetofend(struct sockaddr_ll, sll_family),
3628
dev->dev_addr, dev->addr_len);
3629
} else {
3630
sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3631
sll->sll_halen = 0;
3632
}
3633
rcu_read_unlock();
3634
3635
return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3636
}
3637
3638
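/* Apply (what > 0) or undo (what < 0) one packet_mclist entry on the
 * device: a multicast or unicast address, promiscuous mode, or
 * allmulti mode.
 */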
static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3639
int what)
3640
{
3641
switch (i->type) {
3642
case PACKET_MR_MULTICAST:
3643
if (i->alen != dev->addr_len)
3644
return -EINVAL;
3645
if (what > 0)
3646
return dev_mc_add(dev, i->addr);
3647
else
3648
return dev_mc_del(dev, i->addr);
3649
break;
3650
case PACKET_MR_PROMISC:
3651
return dev_set_promiscuity(dev, what);
3652
case PACKET_MR_ALLMULTI:
3653
return dev_set_allmulti(dev, what);
3654
case PACKET_MR_UNICAST:
3655
if (i->alen != dev->addr_len)
3656
return -EINVAL;
3657
if (what > 0)
3658
return dev_uc_add(dev, i->addr);
3659
else
3660
return dev_uc_del(dev, i->addr);
3661
break;
3662
default:
3663
break;
3664
}
3665
return 0;
3666
}
3667
3668
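/* Unlink every mclist entry that refers to @dev from the socket's
 * list and gather it on @list for the caller.
 */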
static void packet_dev_mclist_delete(struct net_device *dev,
3669
struct packet_mclist **mlp,
3670
struct list_head *list)
3671
{
3672
struct packet_mclist *ml;
3673
3674
while ((ml = *mlp) != NULL) {
3675
if (ml->ifindex == dev->ifindex) {
3676
list_add(&ml->remove_list, list);
3677
*mlp = ml->next;
3678
} else
3679
mlp = &ml->next;
3680
}
3681
}
3682
3683
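/* PACKET_ADD_MEMBERSHIP: add (or take another reference on) a
 * multicast/unicast/promisc/allmulti request for this socket and
 * program it into the device.
 */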
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3684
{
3685
struct packet_sock *po = pkt_sk(sk);
3686
struct packet_mclist *ml, *i;
3687
struct net_device *dev;
3688
int err;
3689
3690
rtnl_lock();
3691
3692
err = -ENODEV;
3693
dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3694
if (!dev)
3695
goto done;
3696
3697
err = -EINVAL;
3698
if (mreq->mr_alen > dev->addr_len)
3699
goto done;
3700
3701
err = -ENOBUFS;
3702
i = kmalloc_obj(*i);
3703
if (i == NULL)
3704
goto done;
3705
3706
err = 0;
3707
for (ml = po->mclist; ml; ml = ml->next) {
3708
if (ml->ifindex == mreq->mr_ifindex &&
3709
ml->type == mreq->mr_type &&
3710
ml->alen == mreq->mr_alen &&
3711
memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3712
ml->count++;
3713
/* Free the new element ... */
3714
kfree(i);
3715
goto done;
3716
}
3717
}
3718
3719
i->type = mreq->mr_type;
3720
i->ifindex = mreq->mr_ifindex;
3721
i->alen = mreq->mr_alen;
3722
memcpy(i->addr, mreq->mr_address, i->alen);
3723
memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3724
i->count = 1;
3725
INIT_LIST_HEAD(&i->remove_list);
3726
i->next = po->mclist;
3727
po->mclist = i;
3728
err = packet_dev_mc(dev, i, 1);
3729
if (err) {
3730
po->mclist = i->next;
3731
kfree(i);
3732
}
3733
3734
done:
3735
rtnl_unlock();
3736
return err;
3737
}
3738
3739
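/* PACKET_DROP_MEMBERSHIP: drop one reference on a matching mclist
 * entry and undo it on the device once the count reaches zero.
 */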
static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3740
{
3741
struct packet_mclist *ml, **mlp;
3742
3743
rtnl_lock();
3744
3745
for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3746
if (ml->ifindex == mreq->mr_ifindex &&
3747
ml->type == mreq->mr_type &&
3748
ml->alen == mreq->mr_alen &&
3749
memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3750
if (--ml->count == 0) {
3751
struct net_device *dev;
3752
*mlp = ml->next;
3753
dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3754
if (dev)
3755
packet_dev_mc(dev, ml, -1);
3756
kfree(ml);
3757
}
3758
break;
3759
}
3760
}
3761
rtnl_unlock();
3762
return 0;
3763
}
3764
3765
static void packet_flush_mclist(struct sock *sk)
3766
{
3767
struct packet_sock *po = pkt_sk(sk);
3768
struct packet_mclist *ml;
3769
3770
if (!po->mclist)
3771
return;
3772
3773
rtnl_lock();
3774
while ((ml = po->mclist) != NULL) {
3775
struct net_device *dev;
3776
3777
po->mclist = ml->next;
3778
dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3779
if (dev != NULL)
3780
packet_dev_mc(dev, ml, -1);
3781
kfree(ml);
3782
}
3783
rtnl_unlock();
3784
}
3785
3786
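/* Handle SOL_PACKET socket options. For illustration only, a typical
 * userspace sequence for setting up a TPACKET_V3 receive ring (the
 * sizes below are examples, not requirements):
 *
 *	int ver = TPACKET_V3;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *
 *	struct tpacket_req3 req = {
 *		.tp_block_size = 1 << 22,
 *		.tp_block_nr = 64,
 *		.tp_frame_size = 1 << 11,
 *		.tp_frame_nr = ((1 << 22) / (1 << 11)) * 64,
 *		.tp_retire_blk_tov = 60,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 * PACKET_VERSION must be set before the ring is created (it returns
 * -EBUSY once a ring exists).
 */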
static int
3787
packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
3788
unsigned int optlen)
3789
{
3790
struct sock *sk = sock->sk;
3791
struct packet_sock *po = pkt_sk(sk);
3792
int ret;
3793
3794
if (level != SOL_PACKET)
3795
return -ENOPROTOOPT;
3796
3797
switch (optname) {
3798
case PACKET_ADD_MEMBERSHIP:
3799
case PACKET_DROP_MEMBERSHIP:
3800
{
3801
struct packet_mreq_max mreq;
3802
int len = optlen;
3803
memset(&mreq, 0, sizeof(mreq));
3804
if (len < sizeof(struct packet_mreq))
3805
return -EINVAL;
3806
if (len > sizeof(mreq))
3807
len = sizeof(mreq);
3808
if (copy_from_sockptr(&mreq, optval, len))
3809
return -EFAULT;
3810
if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3811
return -EINVAL;
3812
if (optname == PACKET_ADD_MEMBERSHIP)
3813
ret = packet_mc_add(sk, &mreq);
3814
else
3815
ret = packet_mc_drop(sk, &mreq);
3816
return ret;
3817
}
3818
3819
case PACKET_RX_RING:
3820
case PACKET_TX_RING:
3821
{
3822
union tpacket_req_u req_u;
3823
3824
ret = -EINVAL;
3825
lock_sock(sk);
3826
switch (po->tp_version) {
3827
case TPACKET_V1:
3828
case TPACKET_V2:
3829
if (optlen < sizeof(req_u.req))
3830
break;
3831
ret = copy_from_sockptr(&req_u.req, optval,
3832
sizeof(req_u.req)) ?
3833
-EINVAL : 0;
3834
break;
3835
case TPACKET_V3:
3836
default:
3837
if (optlen < sizeof(req_u.req3))
3838
break;
3839
ret = copy_from_sockptr(&req_u.req3, optval,
3840
sizeof(req_u.req3)) ?
3841
-EINVAL : 0;
3842
break;
3843
}
3844
if (!ret)
3845
ret = packet_set_ring(sk, &req_u, 0,
3846
optname == PACKET_TX_RING);
3847
release_sock(sk);
3848
return ret;
3849
}
3850
case PACKET_COPY_THRESH:
3851
{
3852
int val;
3853
3854
if (optlen != sizeof(val))
3855
return -EINVAL;
3856
if (copy_from_sockptr(&val, optval, sizeof(val)))
3857
return -EFAULT;
3858
3859
WRITE_ONCE(pkt_sk(sk)->copy_thresh, val);
3860
return 0;
3861
}
3862
case PACKET_VERSION:
3863
{
3864
int val;
3865
3866
if (optlen != sizeof(val))
3867
return -EINVAL;
3868
if (copy_from_sockptr(&val, optval, sizeof(val)))
3869
return -EFAULT;
3870
switch (val) {
3871
case TPACKET_V1:
3872
case TPACKET_V2:
3873
case TPACKET_V3:
3874
break;
3875
default:
3876
return -EINVAL;
3877
}
3878
lock_sock(sk);
3879
if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3880
ret = -EBUSY;
3881
} else {
3882
po->tp_version = val;
3883
ret = 0;
3884
}
3885
release_sock(sk);
3886
return ret;
3887
}
3888
case PACKET_RESERVE:
3889
{
3890
unsigned int val;
3891
3892
if (optlen != sizeof(val))
3893
return -EINVAL;
3894
if (copy_from_sockptr(&val, optval, sizeof(val)))
3895
return -EFAULT;
3896
if (val > INT_MAX)
3897
return -EINVAL;
3898
lock_sock(sk);
3899
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->tp_reserve = val;
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val);
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
		return 0;
	}
	case PACKET_VNET_HDR:
	case PACKET_VNET_HDR_SZ:
	{
		int val, hdr_len;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		if (optname == PACKET_VNET_HDR_SZ) {
			if (val && val != sizeof(struct virtio_net_hdr) &&
			    val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
				return -EINVAL;
			hdr_len = val;
		} else {
			hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
		}
		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_TIMESTAMP:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		WRITE_ONCE(po->tp_tstamp, val);
		return 0;
	}
	case PACKET_FANOUT:
	{
		struct fanout_args args = { 0 };

		if (optlen != sizeof(int) && optlen != sizeof(args))
			return -EINVAL;
		if (copy_from_sockptr(&args, optval, optlen))
			return -EFAULT;

		return fanout_add(sk, &args);
	}
	case PACKET_FANOUT_DATA:
	{
		/* Paired with the WRITE_ONCE() in fanout_add() */
		if (!READ_ONCE(po->fanout))
			return -EINVAL;

		return fanout_set_data(po, optval, optlen);
	}
	case PACKET_IGNORE_OUTGOING:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;
		if (val < 0 || val > 1)
			return -EINVAL;

		WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val);
		return 0;
	}
	case PACKET_TX_HAS_OFF:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		lock_sock(sk);
		if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
			packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val);

		release_sock(sk);
		return 0;
	}
	case PACKET_QDISC_BYPASS:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_sockptr(&val, optval, sizeof(val)))
			return -EFAULT;

		packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val);
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}

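/*
 * Illustrative userspace sketch (not part of this file): how a couple of
 * the SOL_PACKET options handled above are typically set by an
 * application.  Option names come from <linux/if_packet.h>; error
 * handling and the CAP_NET_RAW requirement are omitted for brevity.
 *
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int one = 1;
 *
 *	// Request a tpacket_auxdata control message with each packet.
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *
 *	// Skip the qdisc layer on transmit (PACKET_QDISC_BYPASS above).
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 */
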
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     sockopt_t *opt)
{
	int len;
	int val, lv = sizeof(val);
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data = &val;
	union tpacket_stats_u st;
	struct tpacket_rollover_stats rstats;
	int drops;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	len = opt->optlen;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memcpy(&st, &po->stats, sizeof(st));
		memset(&po->stats, 0, sizeof(po->stats));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		drops = atomic_xchg(&po->tp_drops, 0);

		if (po->tp_version == TPACKET_V3) {
			lv = sizeof(struct tpacket_stats_v3);
			st.stats3.tp_drops = drops;
			st.stats3.tp_packets += drops;
			data = &st.stats3;
		} else {
			lv = sizeof(struct tpacket_stats);
			st.stats1.tp_drops = drops;
			st.stats1.tp_packets += drops;
			data = &st.stats1;
		}

		break;
	case PACKET_AUXDATA:
		val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
		break;
	case PACKET_ORIGDEV:
		val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
		break;
	case PACKET_VNET_HDR:
		val = !!READ_ONCE(po->vnet_hdr_sz);
		break;
	case PACKET_VNET_HDR_SZ:
		val = READ_ONCE(po->vnet_hdr_sz);
		break;
	case PACKET_COPY_THRESH:
		val = READ_ONCE(pkt_sk(sk)->copy_thresh);
		break;
	case PACKET_VERSION:
		val = po->tp_version;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (len < sizeof(int))
			return -EINVAL;
		if (copy_from_iter(&val, len, &opt->iter_in) != len)
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		case TPACKET_V3:
			val = sizeof(struct tpacket3_hdr);
			break;
		default:
			return -EINVAL;
		}
		break;
	case PACKET_RESERVE:
		val = po->tp_reserve;
		break;
	case PACKET_LOSS:
		val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS);
		break;
	case PACKET_TIMESTAMP:
		val = READ_ONCE(po->tp_tstamp);
		break;
	case PACKET_FANOUT:
		val = (po->fanout ?
		       ((u32)po->fanout->id |
			((u32)po->fanout->type << 16) |
			((u32)po->fanout->flags << 24)) :
		       0);
		break;
	case PACKET_IGNORE_OUTGOING:
		val = READ_ONCE(po->prot_hook.ignore_outgoing);
		break;
	case PACKET_ROLLOVER_STATS:
		if (!po->rollover)
			return -EINVAL;
		rstats.tp_all = atomic_long_read(&po->rollover->num);
		rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
		rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
		data = &rstats;
		lv = sizeof(rstats);
		break;
	case PACKET_TX_HAS_OFF:
		val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF);
		break;
	case PACKET_QDISC_BYPASS:
		val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS);
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	opt->optlen = len;
	if (copy_to_iter(data, len, &opt->iter_out) != len)
		return -EFAULT;
	return 0;
}

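/*
 * Illustrative userspace sketch (not part of this file): reading
 * PACKET_STATISTICS via getsockopt().  As the handler above shows, the
 * counters are cleared on every read, so each call returns the deltas
 * accumulated since the previous one; with TPACKET_V3 the kernel fills a
 * struct tpacket_stats_v3 instead.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	struct tpacket_stats stats;
 *	socklen_t len = sizeof(stats);
 *
 *	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len) == 0)
 *		printf("received %u, dropped %u\n",
 *		       stats.tp_packets, stats.tp_drops);
 */
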
static int packet_notifier(struct notifier_block *this,
			   unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct packet_mclist *ml, *tmp;
	LIST_HEAD(mclist);
	struct sock *sk;

	rcu_read_lock();
	sk_for_each_rcu(sk, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist_delete(dev, &po->mclist,
							 &mclist);
			fallthrough;

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
					__unregister_prot_hook(sk, false);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					packet_cached_dev_reset(po);
					WRITE_ONCE(po->ifindex, -1);
					netdev_put(po->prot_hook.dev,
						   &po->prot_hook.dev_tracker);
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num)
					register_prot_hook(sk);
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();

	/* packet_dev_mc might grab instance locks so can't run under rcu */
	list_for_each_entry_safe(ml, tmp, &mclist, remove_list) {
		packet_dev_mc(dev, ml, -1);
		kfree(ml);
	}

	return NOTIFY_DONE;
}


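/*
 * Descriptive note (added, not in the original source): packet_notifier()
 * above reacts to netdevice events for the interface a socket is bound to.
 * On NETDEV_DOWN it unregisters the protocol hook and raises ENETDOWN on
 * the socket; on NETDEV_UNREGISTER it additionally drops the multicast
 * list entries, clears the cached device and releases the device
 * reference; on NETDEV_UP it re-registers the hook if the socket still
 * has a protocol number bound.
 */
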
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}

static __poll_t packet_poll(struct file *file, struct socket *sock,
			    poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	__poll_t mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,
					      TP_STATUS_KERNEL))
			mask |= EPOLLIN | EPOLLRDNORM;
	}
	packet_rcv_try_clear_pressure(po);
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= EPOLLOUT | EPOLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}


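/*
 * Illustrative userspace sketch (not part of this file): waiting for ring
 * frames with poll().  packet_poll() above reports EPOLLIN once the frame
 * behind the ring head is no longer owned by the kernel, so a reader
 * typically polls, consumes every frame whose tp_status has
 * TP_STATUS_USER set, and hands each one back by storing
 * TP_STATUS_KERNEL.
 *
 *	#include <poll.h>
 *	#include <linux/if_packet.h>
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDNORM };
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
 *		// walk frames with (hdr->tp_status & TP_STATUS_USER),
 *		// then release each with hdr->tp_status = TP_STATUS_KERNEL;
 *	}
 */
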
/* Dirty? Well, I still did not learn better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_long_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_long_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	= packet_mm_open,
	.close	= packet_mm_close,
};

static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* __get_free_pages failed, fall back to vmalloc */
	buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
	if (buffer)
		return buffer;

	/* vmalloc failed, lets dig into swap here */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* complete and utter failure */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kzalloc_objs(struct pgv, block_nr, GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

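/*
 * Descriptive note (added, not in the original source): the allocation
 * helpers above try progressively more expensive strategies per ring
 * block: first high-order pages with __GFP_NORETRY | __GFP_NOWARN (fail
 * fast, quietly), then vzalloc() so physically fragmented memory can
 * still back a block, and finally high-order pages again with reclaim
 * allowed.  free_pg_vec() mirrors this by checking is_vmalloc_addr() to
 * pick the matching free routine for each block.
 */
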
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
			   int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long *rx_owner_map = NULL;
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err;
	/* Added to avoid minimal code churn */
	struct tpacket_req *req = &req_u->req;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_long_read(&po->mapped))
			goto out;
		if (packet_read_pending(rb))
			goto out;
	}

	if (req->tp_block_nr) {
		unsigned int min_frame_size;

		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
			goto out;
		min_frame_size = po->tp_hdrlen + po->tp_reserve;
		if (po->tp_version >= TPACKET_V3 &&
		    req->tp_block_size <
		    BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
			goto out;
		if (unlikely(req->tp_frame_size < min_frame_size))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
		if (unlikely(rb->frames_per_block == 0))
			goto out;
		if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
			/* Block transmit is not supported yet */
			if (!tx_ring) {
				init_prb_bdqc(po, rb, pg_vec, req_u);
			} else {
				struct tpacket_req3 *req3 = &req_u->req3;

				if (req3->tp_retire_blk_tov ||
				    req3->tp_sizeof_priv ||
				    req3->tp_feature_req_word) {
					err = -EINVAL;
					goto out_free_pg_vec;
				}
			}
			break;
		default:
			if (!tx_ring) {
				rx_owner_map = bitmap_alloc(req->tp_frame_nr,
					GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
				if (!rx_owner_map)
					goto out_free_pg_vec;
			}
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}


	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING);
	num = po->num;
	WRITE_ONCE(po->num, 0);
	if (was_running)
		__unregister_prot_hook(sk, false);

	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_long_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		if (po->tp_version <= TPACKET_V2)
			swap(rb->rx_owner_map, rx_owner_map);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_long_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %ld\n",
			       atomic_long_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	WRITE_ONCE(po->num, num);
	if (was_running)
		register_prot_hook(sk);

	spin_unlock(&po->bind_lock);
	if (pg_vec && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, rb_queue);
	}

out_free_pg_vec:
	if (pg_vec) {
		bitmap_free(rx_owner_map);
		free_pg_vec(pg_vec, order, req->tp_block_nr);
	}
out:
	return err;
}

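/*
 * Illustrative userspace sketch (not part of this file): requesting a
 * TPACKET_V3 receive ring.  The example values are arbitrary but satisfy
 * the checks packet_set_ring() performs above: tp_block_size positive and
 * page-aligned, tp_frame_size a multiple of TPACKET_ALIGNMENT and at
 * least the header length plus tp_reserve, and tp_frame_nr equal to
 * frames-per-block times tp_block_nr.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size		= 1 << 22,	// 4 MiB, page aligned
 *		.tp_frame_size		= 1 << 11,	// 2 KiB
 *		.tp_block_nr		= 64,
 *		.tp_frame_nr		= ((1 << 22) / (1 << 11)) * 64,
 *		.tp_retire_blk_tov	= 60,		// block timeout, ms
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */
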
static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_long_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}

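/*
 * Illustrative userspace sketch (not part of this file): mapping the
 * ring(s) configured above.  packet_mmap() rejects any non-zero file
 * offset and requires the mapping length to equal the combined size of
 * the RX and TX rings, so both rings are mapped in a single call; the TX
 * ring, if present, follows the RX ring inside the mapping.
 *
 *	#include <stdio.h>
 *	#include <sys/mman.h>
 *
 *	size_t ring_size = (size_t)req.tp_block_size * req.tp_block_nr;
 *
 *	void *ring = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *	if (ring == MAP_FAILED)
 *		perror("mmap");
 */
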
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.gettstamp =	sock_gettstamp,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.gettstamp =	sock_gettstamp,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt_iter = packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq,
			   "%*sRefCnt Type Proto Iface R Rmem User Inode\n",
			   IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6llu\n",
			   s,
			   refcount_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(READ_ONCE(po->num)),
			   READ_ONCE(po->ifindex),
			   packet_sock_flag(po, PACKET_SOCK_RUNNING),
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sk_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};
#endif

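/*
 * Descriptive note (added, not in the original source): the seq_file
 * operations above back the per-namespace /proc/net/packet file created
 * in packet_net_init() below.  Each line printed by packet_seq_show()
 * describes one packet socket: the (hashed) socket address, reference
 * count, socket type, bound protocol number, interface index, running
 * flag, receive-queue memory, owning UID and socket inode.
 */
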
static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
			sizeof(struct seq_net_private)))
		return -ENOMEM;
#endif /* CONFIG_PROC_FS */

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
	WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
}

static int __init packet_init(void)
{
	int rc;

	rc = register_pernet_subsys(&packet_net_ops);
	if (rc)
		goto out;
	rc = register_netdevice_notifier(&packet_netdev_notifier);
	if (rc)
		goto out_pernet;
	rc = proto_register(&packet_proto, 0);
	if (rc)
		goto out_notifier;
	rc = sock_register(&packet_family_ops);
	if (rc)
		goto out_proto;

	return 0;

out_proto:
	proto_unregister(&packet_proto);
out_notifier:
	unregister_netdevice_notifier(&packet_netdev_notifier);
out_pernet:
	unregister_pernet_subsys(&packet_net_ops);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_DESCRIPTION("Packet socket support (AF_PACKET)");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);