GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/ipv6/ip6_output.c
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<[email protected]>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
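
/*
 *	Note on the helpers below: nf_hook() returns 1 when the LOCAL_OUT
 *	verdict is NF_ACCEPT and the caller should continue transmission
 *	itself, which is why ip6_local_out() treats a return value of 1 as
 *	"now call dst_output()". The payload_len fixup writes 0 whenever the
 *	payload exceeds IPV6_MAXPLEN (65535 octets): a zero Payload Length is
 *	how jumbograms (RFC 2675) are marked, the real length being carried
 *	in a Hop-by-Hop option.
 */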
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}
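
/*
 *	ip6_finish_output2() below is the last stop before the neighbour
 *	layer. For multicast destinations it may first loop a clone of the
 *	packet back to the local stack through ip6_dev_loopback_xmit():
 *	either because the sending socket has multicast loopback enabled and
 *	this host is itself a member of the destination group, or because a
 *	multicast routing socket needs to see the packet. A hop limit of 0
 *	on such a packet means it was intended for the local host only, so
 *	the original is discarded after the loopback copy is queued.
 */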

static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}
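
/*
 *	NF_HOOK_COND() traverses the POST_ROUTING hook only when its final
 *	condition argument is true. IP6SKB_REROUTED marks packets that were
 *	rerouted after already passing this hook, so they are not run through
 *	POST_ROUTING a second time.
 */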

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

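	/*
	 *	The first 32-bit word of the IPv6 header packs three fields:
	 *	version (6) in bits 31-28, traffic class in bits 27-20 and
	 *	flow label in bits 19-0. htonl(0x60000000 | (tclass << 20))
	 *	supplies the version and traffic class, and fl6->flowlabel,
	 *	already in network byte order, fills in the low 20 bits.
	 */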
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}
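
/*
 *	ip6_call_ra_chain() hands packets carrying a Router Alert hop-by-hop
 *	option to raw sockets that registered for that alert value; the
 *	ip6_ra_chain list is, to my understanding, populated through the
 *	IPV6_ROUTER_ALERT socket option. "sel" is the 16-bit value taken
 *	from the option. The last matching socket consumes the skb itself,
 *	earlier matches receive clones.
 */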

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
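
/*
 *	ip6_forward() below is the router path. In order: validate
 *	(forwarding enabled, no LRO skb, xfrm policy, unicast pkt_type),
 *	hand Router Alert packets to interested raw sockets, answer a hop
 *	limit of <= 1 with Time Exceeded, honour NDISC proxying, possibly
 *	send a Redirect when the packet leaves on the interface it arrived
 *	on, enforce the path MTU with Packet Too Big, and only then, once
 *	skb_cow() has made the header writable, decrement hop_limit and
 *	pass the packet to the FORWARD netfilter hook.
 */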

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We do not do any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}
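
/*
 *	ip6_find_1stfragopt() returns the length of the "unfragmentable
 *	part" of the packet (RFC 2460, section 4.5): the IPv6 header plus
 *	any Hop-by-Hop, Routing and preceding Destination Options headers,
 *	which must be repeated in every fragment. The Fragment header is
 *	inserted at the returned offset, and *nexthdr is left pointing at
 *	the Next Header byte that will be overwritten with NEXTHDR_FRAGMENT.
 */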

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
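
/*
 *	ip6_fragment() has two paths. The fast path applies when the skb
 *	already carries a frag_list with suitable geometry (every chunk
 *	within the MTU, all but the last a multiple of 8 bytes, enough
 *	headroom, nothing shared): the existing chunks are converted into
 *	fragments in place. Anything else takes the slow path, which
 *	allocates a fresh skb per fragment and copies the payload out.
 */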

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

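	/*
	 *	Slow path: copy the payload into newly allocated skbs, one per
	 *	fragment. Every fragment except the last must carry a multiple
	 *	of 8 octets (hence the len &= ~7 below), because the Fragment
	 *	header expresses the offset in 8-octet units; IP6_MF flags all
	 *	fragments but the last.
	 */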
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
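
/*
 *	ip6_rt_check() returns nonzero when the cached route can no longer
 *	be tied to the flow address: neither a /128 host route key matching
 *	fl_addr nor a cached peer address equal to it. A nonzero result
 *	makes ip6_sk_dst_check() below drop the cached dst and force a
 *	fresh lookup.
 */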

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl6->daddr, final_dst);
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		ipv6_addr_copy(&fl6->daddr, final_dst);
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
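
/*
 *	ip6_ufo_append_data() below builds one oversized skb and lets a
 *	UFO-capable device (NETIF_F_UFO) do the segmentation. gso_size is
 *	rounded down to a multiple of 8 because IPv6 fragment offsets are
 *	expressed in 8-octet units, and the fragment ID is selected up front
 *	so every fragment produced by the device shares one identification
 *	value.
 */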

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}
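
/*
 *	ip6_append_data() implements the same corking model as IPv4: the
 *	first call on an empty write queue captures the route, flow, options
 *	and MTU in the socket cork, and subsequent calls only append data.
 *	Data is laid out across queued skbs so that each one but the last
 *	ends on a fragment boundary (maxfraglen); nothing is transmitted
 *	here, that is ip6_push_pending_frames()'s job.
 */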

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

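	/*
	 *	Append loop: "copy" is how many bytes still fit into the tail
	 *	skb of the write queue. When copy <= 0 a new skb is allocated,
	 *	and any bytes beyond maxfraglen in the previous skb (the
	 *	"fraggap") are moved into the new one, so that every queued
	 *	skb except the last ends exactly on a fragment boundary.
	 */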
	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}
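
/*
 *	ip6_push_pending_frames() below collapses the write queue into a
 *	single skb: the first queued skb becomes the head and every later
 *	one is chained onto its frag_list, which is exactly the shape
 *	ip6_fragment() consumes on its fast path. The extension headers and
 *	the IPv6 header are then pushed in front and the result goes out
 *	through ip6_local_out().
 */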

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl6->daddr);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}