GitHub Repository: awilliam/linux-vfio
Path: net/ipv4/ip_output.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <[email protected]>
 *		Donald Becker, <[email protected]>
 *		Alan Cox, <[email protected]>
 *		Richard Underwood
 *		Stefan Becker, <[email protected]>
 *		Jorge Cwik, <[email protected]>
 *		Arnt Gulbrandsen, <[email protected]>
 *		Hirokazu Takahashi, <[email protected]>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					the output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year coma.
 *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

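/*
 * Note on the return value above: nf_hook() returns 1 when the
 * NF_INET_LOCAL_OUT hook accepts the packet without consuming it.
 * That is why the caller below only invokes dst_output() for a
 * return value of 1; anything else means netfilter queued, stole
 * or dropped the skb (roughly speaking).
 */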
int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
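/*
 * This path assumes the caller has already attached a route to the skb
 * (see skb_rtable() below); it is typically used for replies generated
 * while answering an incoming packet, e.g. TCP SYN-ACKs, rather than
 * for ordinary socket sends.
 */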
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

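/*
 * ip_finish_output2() hands the skb to the neighbour layer. hh_len is
 * the link-layer headroom the device wants; LL_RESERVED_SPACE() rounds
 * the hard header length up (e.g. the 14-byte Ethernet header is
 * typically reserved as 16 bytes), so a correctly built skb normally
 * already has enough headroom and the reallocation below is a rare
 * fallback.
 */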
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

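/*
 * ip_finish_output() decides between fragmenting and sending as-is.
 * GSO packets are deliberately not fragmented here even when they
 * exceed the route MTU (see the skb_is_gso() test below): the
 * segmentation layer will later split them into MTU-sized packets,
 * so IP-level fragmentation would be wasteful.
 */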
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

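/*
 * Both ip_output() and ip_mc_output() finish through NF_HOOK_COND: the
 * NF_INET_POST_ROUTING hook is traversed only when the condition is
 * true, i.e. when the skb did not already pass POST_ROUTING before
 * being rerouted via the IPSKB_REROUTED path above; otherwise
 * ip_finish_output() is called directly.
 */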
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

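/*
 * ip_queue_xmit() is the transmit path for connected sockets (TCP being
 * the main user): the route is normally cached on the socket and only
 * re-resolved when it has been invalidated. One detail worth noting is
 * the 16-bit store below, htons((4 << 12) | (5 << 8) | tos): 0x4500
 * plus the TOS byte fills version, ihl and tos of the IP header in a
 * single write.
 */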
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, the retransmit mechanism of the transport
		 * layer will keep trying until the route appears or the
		 * connection times out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr = fl4->saddr;
	iph->daddr = fl4->daddr;
	/* The transport layer sets skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each one an IP header plus a block of the data
 *	of the original IP datagram) that will still fit in a single device
 *	frame, and queue such a frame for sending.
 */

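/*
 * A worked example of the arithmetic used below (illustrative numbers):
 * with a 1500-byte route MTU and a 20-byte header, hlen = 20 and the
 * per-fragment data space is mtu = 1480, already a multiple of 8 as the
 * fragment offset field requires. A 4020-byte datagram (4000 bytes of
 * payload) therefore becomes three fragments carrying 1480, 1480 and
 * 1040 data bytes, with frag_off values 0, 185 and 370 (offsets in
 * 8-byte units) and MF set on all but the last. The "len &= ~7" in the
 * slow path enforces that every non-final fragment ends on an 8-byte
 * boundary.
 */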
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When a frag_list is given, use it.  First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited.  In this case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes out. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC: if we are fragmenting a fragment that is not
		 *	the last fragment then keep the MF bit set on each
		 *	fragment.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);

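/*
 * ip_generic_getfrag() is the usual getfrag callback handed to
 * ip_append_data(): it copies "len" bytes of user iovec data starting
 * at "offset" into the skb at "to", and when the device cannot checksum
 * (ip_summed != CHECKSUM_PARTIAL) it folds a running checksum into
 * skb->csum, with "odd" giving the byte parity of the destination.
 */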
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

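/*
 * UFO path: instead of building one skb per fragment in software, a
 * single oversized skb is queued and gso_size is set to the fragment
 * payload size (mtu - fragheaderlen), so that the device (or the GSO
 * fallback) performs the split at transmit time.
 */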
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP fragmentation offload, so
	 * create a single skb containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

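/*
 * Sizing used below, with illustrative numbers: for a 1500-byte MTU and
 * no IP options, fragheaderlen = 20 and
 *	maxfraglen = ((1500 - 20) & ~7) + 20 = 1500,
 * so each queued skb may carry up to 1480 payload bytes. With a 4-byte
 * option the data space (1476) is rounded down to 1472 and maxfraglen
 * becomes 1496, keeping every fragment's payload a multiple of 8 bytes
 * as required by the fragment offset encoding.
 */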
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it not to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 mtu, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chain of
	 * skbs; each segment is an IP fragment ready to be sent to the
	 * network once the appropriate IP header has been added.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea which fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL)  {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

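/*
 * The inet_cork structure is what ties a sequence of ip_append_data()
 * calls together: it remembers the destination route, the (copied) IP
 * options, the fragment size and the accumulated length until the
 * pending queue is finally pushed or flushed.
 */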
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal a reference to this route; the caller should not
	 * release it.
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece will be held on the socket
 *	until ip_push_pending_frames() is called.  Each piece can be a page
 *	or non-page data.
 *
 *	Besides UDP, other transport protocols - e.g. raw sockets - can
 *	potentially use this interface.
 *
 *	LATER: the length must be adjusted by the pad at the tail, when
 *	required.
 */
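/*
 * A rough sketch of how a transport protocol typically drives this API
 * (the local names below are illustrative, not taken from any real
 * caller):
 *
 *	err = ip_append_data(sk, &fl4, getfrag, msg, len, thlen,
 *			     &ipc, &rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip_push_pending_frames(sk, &fl4);
 *
 * UDP keeps appending while the socket is corked (UDP_CORK or MSG_MORE)
 * and pushes a single datagram when the cork is released.
 */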
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}

ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into the current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we
	 * allow fragmenting the frame generated here. No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* The DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = fl4->saddr;
	iph->daddr = fl4->daddr;

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

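/*
 * ip_send_skb() funnels the finished datagram into ip_local_out(). A
 * positive return from the output path is a congestion-notification
 * style code (NET_XMIT_*), which net_xmit_errno() maps to 0 or a proper
 * negative errno before the OUTDISCARDS counter is bumped.
 */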
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, &cork);
}

/*
 *	Fetch data from kernel space and fill in the checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as a reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single-threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
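/*
 * The checksum handshake with the caller works as follows: arg->csum
 * holds a partial checksum over the caller-supplied payload, and
 * arg->csumoffset (counted in 16-bit words from the transport header)
 * tells this function where to fold the final checksum into the reply,
 * e.g. the checksum field of the TCP header for RST/ACK replies.
 */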
void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
		   struct ip_reply_arg *arg, unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
			   RT_TOS(ip_hdr(skb)->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, rt->rt_spec_dst,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This code is not reentrant, hence the spinlock. Note that it
	   relies on the fact that this function is called with BHs
	   locally disabled and that sk cannot already be locked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}