/*1* INET An implementation of the TCP/IP protocol suite for the LINUX2* operating system. INET is implemented using the BSD Socket3* interface as the means of communication with the user level.4*5* The Internet Protocol (IP) module.6*7* Authors: Ross Biro8* Fred N. van Kempen, <[email protected]>9* Donald Becker, <[email protected]>10* Alan Cox, <[email protected]>11* Richard Underwood12* Stefan Becker, <[email protected]>13* Jorge Cwik, <[email protected]>14* Arnt Gulbrandsen, <[email protected]>15*16*17* Fixes:18* Alan Cox : Commented a couple of minor bits of surplus code19* Alan Cox : Undefining IP_FORWARD doesn't include the code20* (just stops a compiler warning).21* Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes22* are junked rather than corrupting things.23* Alan Cox : Frames to bad broadcast subnets are dumped24* We used to process them non broadcast and25* boy could that cause havoc.26* Alan Cox : ip_forward sets the free flag on the27* new frame it queues. Still crap because28* it copies the frame but at least it29* doesn't eat memory too.30* Alan Cox : Generic queue code and memory fixes.31* Fred Van Kempen : IP fragment support (borrowed from NET2E)32* Gerhard Koerting: Forward fragmented frames correctly.33* Gerhard Koerting: Fixes to my fix of the above 8-).34* Gerhard Koerting: IP interface addressing fix.35* Linus Torvalds : More robustness checks36* Alan Cox : Even more checks: Still not as robust as it ought to be37* Alan Cox : Save IP header pointer for later38* Alan Cox : ip option setting39* Alan Cox : Use ip_tos/ip_ttl settings40* Alan Cox : Fragmentation bogosity removed41* (Thanks to [email protected])42* Dmitry Gorodchanin : Send of a raw packet crash fix.43* Alan Cox : Silly ip bug when an overlength44* fragment turns up. Now frees the45* queue.46* Linus Torvalds/ : Memory leakage on fragmentation47* Alan Cox : handling.48* Gerhard Koerting: Forwarding uses IP priority hints49* Teemu Rantanen : Fragment problems.50* Alan Cox : General cleanup, comments and reformat51* Alan Cox : SNMP statistics52* Alan Cox : BSD address rule semantics. Also see53* UDP as there is a nasty checksum issue54* if you do things the wrong way.55* Alan Cox : Always defrag, moved IP_FORWARD to the config.in file56* Alan Cox : IP options adjust sk->priority.57* Pedro Roque : Fix mtu/length error in ip_forward.58* Alan Cox : Avoid ip_chk_addr when possible.59* Richard Underwood : IP multicasting.60* Alan Cox : Cleaned up multicast handlers.61* Alan Cox : RAW sockets demultiplex in the BSD style.62* Gunther Mayer : Fix the SNMP reporting typo63* Alan Cox : Always in group 224.0.0.164* Pauline Middelink : Fast ip_checksum update when forwarding65* Masquerading support.66* Alan Cox : Multicast loopback error for 224.0.0.167* Alan Cox : IP_MULTICAST_LOOP option.68* Alan Cox : Use notifiers.69* Bjorn Ekwall : Removed ip_csum (from slhc.c too)70* Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!)71* Stefan Becker : Send out ICMP HOST REDIRECT72* Arnt Gulbrandsen : ip_build_xmit73* Alan Cox : Per socket routing cache74* Alan Cox : Fixed routing cache, added header cache.75* Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it.76* Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net.77* Alan Cox : Incoming IP option handling.78* Alan Cox : Set saddr on raw output frames as per BSD.79* Alan Cox : Stopped broadcast source route explosions.80* Alan Cox : Can disable source routing81* Takeshi Sone : Masquerading didn't work.82* Dave Bonn,Alan Cox : Faster IP forwarding whenever possible.83* Alan Cox : Memory leaks, tramples, misc debugging.84* Alan Cox : Fixed multicast (by popular demand 8))85* Alan Cox : Fixed forwarding (by even more popular demand 8))86* Alan Cox : Fixed SNMP statistics [I think]87* Gerhard Koerting : IP fragmentation forwarding fix88* Alan Cox : Device lock against page fault.89* Alan Cox : IP_HDRINCL facility.90* Werner Almesberger : Zero fragment bug91* Alan Cox : RAW IP frame length bug92* Alan Cox : Outgoing firewall on build_xmit93* A.N.Kuznetsov : IP_OPTIONS support throughout the kernel94* Alan Cox : Multicast routing hooks95* Jos Vos : Do accounting *before* call_in_firewall96* Willy Konynenberg : Transparent proxying support97*98*99*100* To Fix:101* IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient102* and could be made very efficient with the addition of some virtual memory hacks to permit103* the allocation of a buffer that can then be 'grown' by twiddling page tables.104* Output fragmentation wants updating along with the buffer management to use a single105* interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet106* output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause107* fragmentation anyway.108*109* This program is free software; you can redistribute it and/or110* modify it under the terms of the GNU General Public License111* as published by the Free Software Foundation; either version112* 2 of the License, or (at your option) any later version.113*/114115#include <asm/system.h>116#include <linux/module.h>117#include <linux/types.h>118#include <linux/kernel.h>119#include <linux/string.h>120#include <linux/errno.h>121#include <linux/slab.h>122123#include <linux/net.h>124#include <linux/socket.h>125#include <linux/sockios.h>126#include <linux/in.h>127#include <linux/inet.h>128#include <linux/inetdevice.h>129#include <linux/netdevice.h>130#include <linux/etherdevice.h>131132#include <net/snmp.h>133#include <net/ip.h>134#include <net/protocol.h>135#include <net/route.h>136#include <linux/skbuff.h>137#include <net/sock.h>138#include <net/arp.h>139#include <net/icmp.h>140#include <net/raw.h>141#include <net/checksum.h>142#include <linux/netfilter_ipv4.h>143#include <net/xfrm.h>144#include <linux/mroute.h>145#include <linux/netlink.h>146147/*148* Process Router Attention IP option (RFC 2113)149*/150int ip_call_ra_chain(struct sk_buff *skb)151{152struct ip_ra_chain *ra;153u8 protocol = ip_hdr(skb)->protocol;154struct sock *last = NULL;155struct net_device *dev = skb->dev;156157for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {158struct sock *sk = ra->sk;159160/* If socket is bound to an interface, only report161* the packet if it came from that interface.162*/163if (sk && inet_sk(sk)->inet_num == protocol &&164(!sk->sk_bound_dev_if ||165sk->sk_bound_dev_if == dev->ifindex) &&166net_eq(sock_net(sk), dev_net(dev))) {167if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {168if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))169return 1;170}171if (last) {172struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);173if (skb2)174raw_rcv(last, skb2);175}176last = sk;177}178}179180if (last) {181raw_rcv(last, skb);182return 1;183}184return 0;185}186187static int ip_local_deliver_finish(struct sk_buff *skb)188{189struct net *net = dev_net(skb->dev);190191__skb_pull(skb, ip_hdrlen(skb));192193/* Point into the IP datagram, just past the header. */194skb_reset_transport_header(skb);195196rcu_read_lock();197{198int protocol = ip_hdr(skb)->protocol;199int hash, raw;200const struct net_protocol *ipprot;201202resubmit:203raw = raw_local_deliver(skb, protocol);204205hash = protocol & (MAX_INET_PROTOS - 1);206ipprot = rcu_dereference(inet_protos[hash]);207if (ipprot != NULL) {208int ret;209210if (!net_eq(net, &init_net) && !ipprot->netns_ok) {211if (net_ratelimit())212printk("%s: proto %d isn't netns-ready\n",213__func__, protocol);214kfree_skb(skb);215goto out;216}217218if (!ipprot->no_policy) {219if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {220kfree_skb(skb);221goto out;222}223nf_reset(skb);224}225ret = ipprot->handler(skb);226if (ret < 0) {227protocol = -ret;228goto resubmit;229}230IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);231} else {232if (!raw) {233if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {234IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);235icmp_send(skb, ICMP_DEST_UNREACH,236ICMP_PROT_UNREACH, 0);237}238} else239IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);240kfree_skb(skb);241}242}243out:244rcu_read_unlock();245246return 0;247}248249/*250* Deliver IP Packets to the higher protocol layers.251*/252int ip_local_deliver(struct sk_buff *skb)253{254/*255* Reassemble IP fragments.256*/257258if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {259if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))260return 0;261}262263return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,264ip_local_deliver_finish);265}266267static inline int ip_rcv_options(struct sk_buff *skb)268{269struct ip_options *opt;270const struct iphdr *iph;271struct net_device *dev = skb->dev;272273/* It looks as overkill, because not all274IP options require packet mangling.275But it is the easiest for now, especially taking276into account that combination of IP options277and running sniffer is extremely rare condition.278--ANK (980813)279*/280if (skb_cow(skb, skb_headroom(skb))) {281IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);282goto drop;283}284285iph = ip_hdr(skb);286opt = &(IPCB(skb)->opt);287opt->optlen = iph->ihl*4 - sizeof(struct iphdr);288289if (ip_options_compile(dev_net(dev), opt, skb)) {290IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);291goto drop;292}293294if (unlikely(opt->srr)) {295struct in_device *in_dev = __in_dev_get_rcu(dev);296297if (in_dev) {298if (!IN_DEV_SOURCE_ROUTE(in_dev)) {299if (IN_DEV_LOG_MARTIANS(in_dev) &&300net_ratelimit())301printk(KERN_INFO "source route option %pI4 -> %pI4\n",302&iph->saddr, &iph->daddr);303goto drop;304}305}306307if (ip_options_rcv_srr(skb))308goto drop;309}310311return 0;312drop:313return -1;314}315316static int ip_rcv_finish(struct sk_buff *skb)317{318const struct iphdr *iph = ip_hdr(skb);319struct rtable *rt;320321/*322* Initialise the virtual path cache for the packet. It describes323* how the packet travels inside Linux networking.324*/325if (skb_dst(skb) == NULL) {326int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,327iph->tos, skb->dev);328if (unlikely(err)) {329if (err == -EHOSTUNREACH)330IP_INC_STATS_BH(dev_net(skb->dev),331IPSTATS_MIB_INADDRERRORS);332else if (err == -ENETUNREACH)333IP_INC_STATS_BH(dev_net(skb->dev),334IPSTATS_MIB_INNOROUTES);335else if (err == -EXDEV)336NET_INC_STATS_BH(dev_net(skb->dev),337LINUX_MIB_IPRPFILTER);338goto drop;339}340}341342#ifdef CONFIG_IP_ROUTE_CLASSID343if (unlikely(skb_dst(skb)->tclassid)) {344struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);345u32 idx = skb_dst(skb)->tclassid;346st[idx&0xFF].o_packets++;347st[idx&0xFF].o_bytes += skb->len;348st[(idx>>16)&0xFF].i_packets++;349st[(idx>>16)&0xFF].i_bytes += skb->len;350}351#endif352353if (iph->ihl > 5 && ip_rcv_options(skb))354goto drop;355356rt = skb_rtable(skb);357if (rt->rt_type == RTN_MULTICAST) {358IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,359skb->len);360} else if (rt->rt_type == RTN_BROADCAST)361IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,362skb->len);363364return dst_input(skb);365366drop:367kfree_skb(skb);368return NET_RX_DROP;369}370371/*372* Main IP Receive routine.373*/374int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)375{376const struct iphdr *iph;377u32 len;378379/* When the interface is in promisc. mode, drop all the crap380* that it receives, do not try to analyse it.381*/382if (skb->pkt_type == PACKET_OTHERHOST)383goto drop;384385386IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);387388if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {389IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);390goto out;391}392393if (!pskb_may_pull(skb, sizeof(struct iphdr)))394goto inhdr_error;395396iph = ip_hdr(skb);397398/*399* RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.400*401* Is the datagram acceptable?402*403* 1. Length at least the size of an ip header404* 2. Version of 4405* 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]406* 4. Doesn't have a bogus length407*/408409if (iph->ihl < 5 || iph->version != 4)410goto inhdr_error;411412if (!pskb_may_pull(skb, iph->ihl*4))413goto inhdr_error;414415iph = ip_hdr(skb);416417if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))418goto inhdr_error;419420len = ntohs(iph->tot_len);421if (skb->len < len) {422IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);423goto drop;424} else if (len < (iph->ihl*4))425goto inhdr_error;426427/* Our transport medium may have padded the buffer out. Now we know it428* is IP we can trim to the true length of the frame.429* Note this now means skb->len holds ntohs(iph->tot_len).430*/431if (pskb_trim_rcsum(skb, len)) {432IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);433goto drop;434}435436/* Remove any debris in the socket control block */437memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));438439/* Must drop socket now because of tproxy. */440skb_orphan(skb);441442return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,443ip_rcv_finish);444445inhdr_error:446IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);447drop:448kfree_skb(skb);449out:450return NET_RX_DROP;451}452453454