GitHub Repository: awilliam/linux-vfio
Path: net/ipv4/ip_output.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <[email protected]>
 *		Donald Becker, <[email protected]>
 *		Alan Cox, <[email protected]>
 *		Richard Underwood
 *		Stefan Becker, <[email protected]>
 *		Jorge Cwik, <[email protected]>
 *		Arnt Gulbrandsen, <[email protected]>
 *		Hirokazu Takahashi, <[email protected]>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					the output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

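/*
 * Note on the return value above: nf_hook() returns 1 when the
 * NF_INET_LOCAL_OUT hook accepts the packet without consuming it.
 * That is why the caller below only invokes dst_output() for a
 * return value of 1; anything else means netfilter queued, stole
 * or dropped the skb (roughly speaking).
 */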
int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
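/*
 * This path assumes the caller has already attached a route to the skb
 * (see skb_rtable() below); it is typically used for replies generated
 * while answering an incoming packet, e.g. TCP SYN-ACKs, rather than
 * for ordinary socket sends.
 */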
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

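/*
 * ip_finish_output2() hands the skb to the neighbour layer. hh_len is
 * the link-layer headroom the device wants; LL_RESERVED_SPACE() rounds
 * the hard header length up (e.g. the 14-byte Ethernet header is
 * typically reserved as 16 bytes), so a correctly built skb normally
 * already has enough headroom and the reallocation below is a rare
 * fallback.
 */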
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

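/*
 * ip_finish_output() decides between fragmenting and sending as-is.
 * GSO packets are deliberately not fragmented here even when they
 * exceed the route MTU (see the skb_is_gso() test below): the
 * segmentation layer will later split them into MTU-sized packets,
 * so IP-level fragmentation would be wasteful.
 */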
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

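/*
 * Both ip_output() and ip_mc_output() finish through NF_HOOK_COND: the
 * NF_INET_POST_ROUTING hook is traversed only when the condition is
 * true, i.e. when the skb did not already pass POST_ROUTING before
 * being rerouted via the IPSKB_REROUTED path above; otherwise
 * ip_finish_output() is called directly.
 */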
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

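/*
 * ip_queue_xmit() is the transmit path for connected sockets (TCP being
 * the main user): the route is normally cached on the socket and only
 * re-resolved when it has been invalidated. One detail worth noting is
 * the 16-bit store below, htons((4 << 12) | (5 << 8) | tos): 0x4500
 * plus the TOS byte fills version, ihl and tos of the IP header in a
 * single write.
 */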
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, the retransmit mechanism of the transport
		 * layer will keep trying until the route appears or the
		 * connection times out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr = fl4->saddr;
	iph->daddr = fl4->daddr;
	/* The transport layer sets skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each one an IP header plus a block of the data
 *	of the original IP datagram) that will still fit in a single device
 *	frame, and queue such a frame for sending.
 */

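/*
 * A worked example of the arithmetic used below (illustrative numbers):
 * with a 1500-byte route MTU and a 20-byte header, hlen = 20 and the
 * per-fragment data space is mtu = 1480, already a multiple of 8 as the
 * fragment offset field requires. A 4020-byte datagram (4000 bytes of
 * payload) therefore becomes three fragments carrying 1480, 1480 and
 * 1040 data bytes, with frag_off values 0, 185 and 370 (offsets in
 * 8-byte units) and MF set on all but the last. The "len &= ~7" in the
 * slow path enforces that every non-final fragment ends on an 8-byte
 * boundary.
 */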
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When a frag_list is given, use it.  First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited.  In this case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes out. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC: if we are fragmenting a fragment that is not
		 *	the last fragment then keep the MF bit set on each
		 *	fragment.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);

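/*
 * ip_generic_getfrag() is the usual getfrag callback handed to
 * ip_append_data(): it copies "len" bytes of user iovec data starting
 * at "offset" into the skb at "to", and when the device cannot checksum
 * (ip_summed != CHECKSUM_PARTIAL) it folds a running checksum into
 * skb->csum, with "odd" giving the byte parity of the destination.
 */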
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

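/*
 * UFO path: instead of building one skb per fragment in software, a
 * single oversized skb is queued and gso_size is set to the fragment
 * payload size (mtu - fragheaderlen), so that the device (or the GSO
 * fallback) performs the split at transmit time.
 */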
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP fragmentation offload, so
	 * create a single skb containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

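/*
 * Sizing used below, with illustrative numbers: for a 1500-byte MTU and
 * no IP options, fragheaderlen = 20 and
 *	maxfraglen = ((1500 - 20) & ~7) + 20 = 1500,
 * so each queued skb may carry up to 1480 payload bytes. With a 4-byte
 * option the data space (1476) is rounded down to 1472 and maxfraglen
 * becomes 1496, keeping every fragment's payload a multiple of 8 bytes
 * as required by the fragment offset encoding.
 */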
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it not to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 mtu, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chain of
	 * skbs; each segment is an IP fragment ready to be sent to the
	 * network once the appropriate IP header has been added.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea which fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

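/*
 * The inet_cork structure is what ties a sequence of ip_append_data()
 * calls together: it remembers the destination route, the (copied) IP
 * options, the fragment size and the accumulated length until the
 * pending queue is finally pushed or flushed.
 */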
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal a reference to this route; the caller should not
	 * release it.
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece will be held on the socket
 *	until ip_push_pending_frames() is called.  Each piece can be a page
 *	or non-page data.
 *
 *	Besides UDP, other transport protocols - e.g. raw sockets - can
 *	potentially use this interface.
 *
 *	LATER: the length must be adjusted by the pad at the tail, when
 *	required.
 */
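/*
 * A rough sketch of how a transport protocol typically drives this API
 * (the local names below are illustrative, not taken from any real
 * caller):
 *
 *	err = ip_append_data(sk, &fl4, getfrag, msg, len, thlen,
 *			     &ipc, &rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip_push_pending_frames(sk, &fl4);
 *
 * UDP keeps appending while the socket is corked (UDP_CORK or MSG_MORE)
 * and pushes a single datagram when the cork is released.
 */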
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}

ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into the current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we
	 * allow fragmenting the frame generated here. No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* The DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = fl4->saddr;
	iph->daddr = fl4->daddr;

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

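/*
 * ip_send_skb() funnels the finished datagram into ip_local_out(). A
 * positive return from the output path is a congestion-notification
 * style code (NET_XMIT_*), which net_xmit_errno() maps to 0 or a proper
 * negative errno before the OUTDISCARDS counter is bumped.
 */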
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, &cork);
}

/*
 *	Fetch data from kernel space and fill in the checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as a reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single-threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
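/*
 * The checksum handshake with the caller works as follows: arg->csum
 * holds a partial checksum over the caller-supplied payload, and
 * arg->csumoffset (counted in 16-bit words from the transport header)
 * tells this function where to fold the final checksum into the reply,
 * e.g. the checksum field of the TCP header for RST/ACK replies.
 */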
void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
		   struct ip_reply_arg *arg, unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
			   RT_TOS(ip_hdr(skb)->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, rt->rt_spec_dst,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This code is not reentrant, hence the spinlock. Note that it
	   relies on the fact that this function is called with BHs
	   locally disabled and that sk cannot already be locked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}