GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/ipv4/route.c
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* ROUTE - implementation of the IP router.
*
* Authors: Ross Biro
* Fred N. van Kempen, <[email protected]>
* Alan Cox, <[email protected]>
* Linus Torvalds, <[email protected]>
* Alexey Kuznetsov, <[email protected]>
*
* Fixes:
* Alan Cox : Verify area fixes.
* Alan Cox : cli() protects routing changes
* Rui Oliveira : ICMP routing table updates
* ([email protected]) Routing table insertion and update
* Linus Torvalds : Rewrote bits to be sensible
* Alan Cox : Added BSD route gw semantics
* Alan Cox : Super /proc >4K
* Alan Cox : MTU in route table
* Alan Cox : MSS actually. Also added the window
* clamper.
* Sam Lantinga : Fixed route matching in rt_del()
* Alan Cox : Routing cache support.
* Alan Cox : Removed compatibility cruft.
* Alan Cox : RTF_REJECT support.
* Alan Cox : TCP irtt support.
* Jonathan Naylor : Added Metric support.
* Miquel van Smoorenburg : BSD API fixes.
* Miquel van Smoorenburg : Metrics.
* Alan Cox : Use __u32 properly
* Alan Cox : Aligned routing errors more closely with BSD
* our system is still very different.
* Alan Cox : Faster /proc handling
* Alexey Kuznetsov : Massive rework to support tree based routing,
* routing caches and better behaviour.
*
* Olaf Erb : irtt wasn't being copied right.
* Bjorn Ekwall : Kerneld route support.
* Alan Cox : Multicast fixed (I hope)
* Pavel Krauz : Limited broadcast fixed
* Mike McLagan : Routing by source
* Alexey Kuznetsov : End of old history. Split to fib.c and
* route.c and rewritten from scratch.
* Andi Kleen : Load-limit warning messages.
* Vitaly E. Lavrov : Transparent proxy revived after year coma.
* Vitaly E. Lavrov : Race condition in ip_route_input_slow.
* Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
* Vladimir V. Ivanov : IP rule info (flowid) is really useful.
* Marc Boucher : routing by fwmark
* Robert Olsson : Added rt_cache statistics
* Arnaldo C. Melo : Convert proc stuff to seq_file
* Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
* Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
* Ilia Sotnikov : Removed TOS from hash calculations
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp4) \
((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU 0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
static int rt_chain_length_max __read_mostly = 20;
133
134
/*
135
* Interface to generic destination cache.
136
*/
137
138
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
139
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
140
static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
141
static void ipv4_dst_destroy(struct dst_entry *dst);
142
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143
static void ipv4_link_failure(struct sk_buff *skb);
144
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
145
static int rt_garbage_collect(struct dst_ops *ops);
146
147
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
148
int how)
149
{
150
}
151
152
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
153
{
154
struct rtable *rt = (struct rtable *) dst;
155
struct inet_peer *peer;
156
u32 *p = NULL;
157
158
if (!rt->peer)
159
rt_bind_peer(rt, rt->rt_dst, 1);
160
161
peer = rt->peer;
162
if (peer) {
163
u32 *old_p = __DST_METRICS_PTR(old);
164
unsigned long prev, new;
165
166
p = peer->metrics;
167
if (inet_metrics_new(peer))
168
memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
169
170
new = (unsigned long) p;
171
prev = cmpxchg(&dst->_metrics, old, new);
172
173
if (prev != old) {
174
p = __DST_METRICS_PTR(prev);
175
if (prev & DST_METRICS_READ_ONLY)
176
p = NULL;
177
} else {
178
if (rt->fi) {
179
fib_info_put(rt->fi);
180
rt->fi = NULL;
181
}
182
}
183
}
184
return p;
185
}
186
187
static struct dst_ops ipv4_dst_ops = {
188
.family = AF_INET,
189
.protocol = cpu_to_be16(ETH_P_IP),
190
.gc = rt_garbage_collect,
191
.check = ipv4_dst_check,
192
.default_advmss = ipv4_default_advmss,
193
.default_mtu = ipv4_default_mtu,
194
.cow_metrics = ipv4_cow_metrics,
195
.destroy = ipv4_dst_destroy,
196
.ifdown = ipv4_dst_ifdown,
197
.negative_advice = ipv4_negative_advice,
198
.link_failure = ipv4_link_failure,
199
.update_pmtu = ip_rt_update_pmtu,
200
.local_out = __ip_local_out,
201
};
202
203
#define ECN_OR_COST(class) TC_PRIO_##class
204
205
const __u8 ip_tos2prio[16] = {
206
TC_PRIO_BESTEFFORT,
207
ECN_OR_COST(BESTEFFORT),
208
TC_PRIO_BESTEFFORT,
209
ECN_OR_COST(BESTEFFORT),
210
TC_PRIO_BULK,
211
ECN_OR_COST(BULK),
212
TC_PRIO_BULK,
213
ECN_OR_COST(BULK),
214
TC_PRIO_INTERACTIVE,
215
ECN_OR_COST(INTERACTIVE),
216
TC_PRIO_INTERACTIVE,
217
ECN_OR_COST(INTERACTIVE),
218
TC_PRIO_INTERACTIVE_BULK,
219
ECN_OR_COST(INTERACTIVE_BULK),
220
TC_PRIO_INTERACTIVE_BULK,
221
ECN_OR_COST(INTERACTIVE_BULK)
222
};
223
224
225
/*
226
* Route cache.
227
*/
228
229
/* The locking scheme is rather straightforward:
*
* 1) Read-Copy Update protects the buckets of the central route hash.
* 2) Only writers remove entries, and they hold the lock
* as they look at rtable reference counts.
* 3) Only readers acquire references to rtable entries;
* they do so with atomic increments and with the
* lock held.
*/
238
239
struct rt_hash_bucket {
240
struct rtable __rcu *chain;
241
};
242
243
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
244
defined(CONFIG_PROVE_LOCKING)
245
/*
* Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
* The size of this table is a power of two and depends on the number of CPUs.
* (On lockdep we have a quite big spinlock_t, so keep the size down there.)
*/
250
#ifdef CONFIG_LOCKDEP
251
# define RT_HASH_LOCK_SZ 256
252
#else
253
# if NR_CPUS >= 32
254
# define RT_HASH_LOCK_SZ 4096
255
# elif NR_CPUS >= 16
256
# define RT_HASH_LOCK_SZ 2048
257
# elif NR_CPUS >= 8
258
# define RT_HASH_LOCK_SZ 1024
259
# elif NR_CPUS >= 4
260
# define RT_HASH_LOCK_SZ 512
261
# else
262
# define RT_HASH_LOCK_SZ 256
263
# endif
264
#endif
265
266
static spinlock_t *rt_hash_locks;
267
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
268
269
static __init void rt_hash_lock_init(void)
270
{
271
int i;
272
273
rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
274
GFP_KERNEL);
275
if (!rt_hash_locks)
276
panic("IP: failed to allocate rt_hash_locks\n");
277
278
for (i = 0; i < RT_HASH_LOCK_SZ; i++)
279
spin_lock_init(&rt_hash_locks[i]);
280
}
281
#else
282
# define rt_hash_lock_addr(slot) NULL
283
284
static inline void rt_hash_lock_init(void)
285
{
286
}
287
#endif
288
289
static struct rt_hash_bucket *rt_hash_table __read_mostly;
290
static unsigned rt_hash_mask __read_mostly;
291
static unsigned int rt_hash_log __read_mostly;
292
293
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
294
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
295
296
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
297
int genid)
298
{
299
return jhash_3words((__force u32)daddr, (__force u32)saddr,
300
idx, genid)
301
& rt_hash_mask;
302
}
303
304
static inline int rt_genid(struct net *net)
305
{
306
return atomic_read(&net->ipv4.rt_genid);
307
}
308
309
#ifdef CONFIG_PROC_FS
310
struct rt_cache_iter_state {
311
struct seq_net_private p;
312
int bucket;
313
int genid;
314
};
315
316
static struct rtable *rt_cache_get_first(struct seq_file *seq)
317
{
318
struct rt_cache_iter_state *st = seq->private;
319
struct rtable *r = NULL;
320
321
for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
322
if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
323
continue;
324
rcu_read_lock_bh();
325
r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
326
while (r) {
327
if (dev_net(r->dst.dev) == seq_file_net(seq) &&
328
r->rt_genid == st->genid)
329
return r;
330
r = rcu_dereference_bh(r->dst.rt_next);
331
}
332
rcu_read_unlock_bh();
333
}
334
return r;
335
}
336
337
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
338
struct rtable *r)
339
{
340
struct rt_cache_iter_state *st = seq->private;
341
342
r = rcu_dereference_bh(r->dst.rt_next);
343
while (!r) {
344
rcu_read_unlock_bh();
345
do {
346
if (--st->bucket < 0)
347
return NULL;
348
} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
349
rcu_read_lock_bh();
350
r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
351
}
352
return r;
353
}
354
355
static struct rtable *rt_cache_get_next(struct seq_file *seq,
356
struct rtable *r)
357
{
358
struct rt_cache_iter_state *st = seq->private;
359
while ((r = __rt_cache_get_next(seq, r)) != NULL) {
360
if (dev_net(r->dst.dev) != seq_file_net(seq))
361
continue;
362
if (r->rt_genid == st->genid)
363
break;
364
}
365
return r;
366
}
367
368
static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
369
{
370
struct rtable *r = rt_cache_get_first(seq);
371
372
if (r)
373
while (pos && (r = rt_cache_get_next(seq, r)))
374
--pos;
375
return pos ? NULL : r;
376
}
377
378
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
379
{
380
struct rt_cache_iter_state *st = seq->private;
381
if (*pos)
382
return rt_cache_get_idx(seq, *pos - 1);
383
st->genid = rt_genid(seq_file_net(seq));
384
return SEQ_START_TOKEN;
385
}
386
387
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
388
{
389
struct rtable *r;
390
391
if (v == SEQ_START_TOKEN)
392
r = rt_cache_get_first(seq);
393
else
394
r = rt_cache_get_next(seq, v);
395
++*pos;
396
return r;
397
}
398
399
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
400
{
401
if (v && v != SEQ_START_TOKEN)
402
rcu_read_unlock_bh();
403
}
404
405
static int rt_cache_seq_show(struct seq_file *seq, void *v)
406
{
407
if (v == SEQ_START_TOKEN)
408
seq_printf(seq, "%-127s\n",
409
"Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
410
"Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
411
"HHUptod\tSpecDst");
412
else {
413
struct rtable *r = v;
414
int len;
415
416
seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
417
"%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
418
r->dst.dev ? r->dst.dev->name : "*",
419
(__force u32)r->rt_dst,
420
(__force u32)r->rt_gateway,
421
r->rt_flags, atomic_read(&r->dst.__refcnt),
422
r->dst.__use, 0, (__force u32)r->rt_src,
423
dst_metric_advmss(&r->dst) + 40,
424
dst_metric(&r->dst, RTAX_WINDOW),
425
(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
426
dst_metric(&r->dst, RTAX_RTTVAR)),
427
r->rt_key_tos,
428
r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
429
r->dst.hh ? (r->dst.hh->hh_output ==
430
dev_queue_xmit) : 0,
431
r->rt_spec_dst, &len);
432
433
seq_printf(seq, "%*s\n", 127 - len, "");
434
}
435
return 0;
436
}
437
438
static const struct seq_operations rt_cache_seq_ops = {
439
.start = rt_cache_seq_start,
440
.next = rt_cache_seq_next,
441
.stop = rt_cache_seq_stop,
442
.show = rt_cache_seq_show,
443
};
444
445
static int rt_cache_seq_open(struct inode *inode, struct file *file)
446
{
447
return seq_open_net(inode, file, &rt_cache_seq_ops,
448
sizeof(struct rt_cache_iter_state));
449
}
450
451
static const struct file_operations rt_cache_seq_fops = {
452
.owner = THIS_MODULE,
453
.open = rt_cache_seq_open,
454
.read = seq_read,
455
.llseek = seq_lseek,
456
.release = seq_release_net,
457
};
458
459
460
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
461
{
462
int cpu;
463
464
if (*pos == 0)
465
return SEQ_START_TOKEN;
466
467
for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
468
if (!cpu_possible(cpu))
469
continue;
470
*pos = cpu+1;
471
return &per_cpu(rt_cache_stat, cpu);
472
}
473
return NULL;
474
}
475
476
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
477
{
478
int cpu;
479
480
for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
481
if (!cpu_possible(cpu))
482
continue;
483
*pos = cpu+1;
484
return &per_cpu(rt_cache_stat, cpu);
485
}
486
return NULL;
487
488
}
489
490
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
491
{
492
493
}
494
495
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
496
{
497
struct rt_cache_stat *st = v;
498
499
if (v == SEQ_START_TOKEN) {
500
seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
501
return 0;
502
}
503
504
seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
505
" %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
506
dst_entries_get_slow(&ipv4_dst_ops),
507
st->in_hit,
508
st->in_slow_tot,
509
st->in_slow_mc,
510
st->in_no_route,
511
st->in_brd,
512
st->in_martian_dst,
513
st->in_martian_src,
514
515
st->out_hit,
516
st->out_slow_tot,
517
st->out_slow_mc,
518
519
st->gc_total,
520
st->gc_ignored,
521
st->gc_goal_miss,
522
st->gc_dst_overflow,
523
st->in_hlist_search,
524
st->out_hlist_search
525
);
526
return 0;
527
}
528
529
static const struct seq_operations rt_cpu_seq_ops = {
530
.start = rt_cpu_seq_start,
531
.next = rt_cpu_seq_next,
532
.stop = rt_cpu_seq_stop,
533
.show = rt_cpu_seq_show,
534
};
535
536
537
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
538
{
539
return seq_open(file, &rt_cpu_seq_ops);
540
}
541
542
static const struct file_operations rt_cpu_seq_fops = {
543
.owner = THIS_MODULE,
544
.open = rt_cpu_seq_open,
545
.read = seq_read,
546
.llseek = seq_lseek,
547
.release = seq_release,
548
};
549
550
#ifdef CONFIG_IP_ROUTE_CLASSID
551
static int rt_acct_proc_show(struct seq_file *m, void *v)
552
{
553
struct ip_rt_acct *dst, *src;
554
unsigned int i, j;
555
556
dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
557
if (!dst)
558
return -ENOMEM;
559
560
for_each_possible_cpu(i) {
561
src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
562
for (j = 0; j < 256; j++) {
563
dst[j].o_bytes += src[j].o_bytes;
564
dst[j].o_packets += src[j].o_packets;
565
dst[j].i_bytes += src[j].i_bytes;
566
dst[j].i_packets += src[j].i_packets;
567
}
568
}
569
570
seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
571
kfree(dst);
572
return 0;
573
}
574
575
static int rt_acct_proc_open(struct inode *inode, struct file *file)
576
{
577
return single_open(file, rt_acct_proc_show, NULL);
578
}
579
580
static const struct file_operations rt_acct_proc_fops = {
581
.owner = THIS_MODULE,
582
.open = rt_acct_proc_open,
583
.read = seq_read,
584
.llseek = seq_lseek,
585
.release = single_release,
586
};
587
#endif
588
589
static int __net_init ip_rt_do_proc_init(struct net *net)
590
{
591
struct proc_dir_entry *pde;
592
593
pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
594
&rt_cache_seq_fops);
595
if (!pde)
596
goto err1;
597
598
pde = proc_create("rt_cache", S_IRUGO,
599
net->proc_net_stat, &rt_cpu_seq_fops);
600
if (!pde)
601
goto err2;
602
603
#ifdef CONFIG_IP_ROUTE_CLASSID
604
pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
605
if (!pde)
606
goto err3;
607
#endif
608
return 0;
609
610
#ifdef CONFIG_IP_ROUTE_CLASSID
611
err3:
612
remove_proc_entry("rt_cache", net->proc_net_stat);
613
#endif
614
err2:
615
remove_proc_entry("rt_cache", net->proc_net);
616
err1:
617
return -ENOMEM;
618
}
619
620
static void __net_exit ip_rt_do_proc_exit(struct net *net)
621
{
622
remove_proc_entry("rt_cache", net->proc_net_stat);
623
remove_proc_entry("rt_cache", net->proc_net);
624
#ifdef CONFIG_IP_ROUTE_CLASSID
625
remove_proc_entry("rt_acct", net->proc_net);
626
#endif
627
}
628
629
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
630
.init = ip_rt_do_proc_init,
631
.exit = ip_rt_do_proc_exit,
632
};
633
634
static int __init ip_rt_proc_init(void)
635
{
636
return register_pernet_subsys(&ip_rt_proc_ops);
637
}
638
639
#else
640
static inline int ip_rt_proc_init(void)
641
{
642
return 0;
643
}
644
#endif /* CONFIG_PROC_FS */
645
646
static inline void rt_free(struct rtable *rt)
647
{
648
call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
649
}
650
651
static inline void rt_drop(struct rtable *rt)
652
{
653
ip_rt_put(rt);
654
call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
655
}
656
657
static inline int rt_fast_clean(struct rtable *rth)
658
{
659
/* Kill broadcast/multicast entries very aggressively if they
collide in the hash table with more useful entries */
661
return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
662
rt_is_input_route(rth) && rth->dst.rt_next;
663
}
664
665
static inline int rt_valuable(struct rtable *rth)
666
{
667
return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
668
(rth->peer && rth->peer->pmtu_expires);
669
}
670
671
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
672
{
673
unsigned long age;
674
int ret = 0;
675
676
if (atomic_read(&rth->dst.__refcnt))
677
goto out;
678
679
age = jiffies - rth->dst.lastuse;
680
if ((age <= tmo1 && !rt_fast_clean(rth)) ||
681
(age <= tmo2 && rt_valuable(rth)))
682
goto out;
683
ret = 1;
684
out: return ret;
685
}
686
687
/* Bits of score are:
688
* 31: very valuable
689
* 30: not quite useless
690
* 29..0: usage counter
691
*/
692
static inline u32 rt_score(struct rtable *rt)
693
{
694
u32 score = jiffies - rt->dst.lastuse;
695
696
score = ~score & ~(3<<30);
697
698
if (rt_valuable(rt))
699
score |= (1<<31);
700
701
if (rt_is_output_route(rt) ||
702
!(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
703
score |= (1<<30);
704
705
return score;
706
}
707
708
static inline bool rt_caching(const struct net *net)
709
{
710
return net->ipv4.current_rt_cache_rebuild_count <=
711
net->ipv4.sysctl_rt_cache_rebuild_count;
712
}
713
714
static inline bool compare_hash_inputs(const struct rtable *rt1,
715
const struct rtable *rt2)
716
{
717
return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
718
((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
719
(rt1->rt_iif ^ rt2->rt_iif)) == 0);
720
}
721
722
static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
723
{
724
return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
725
((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
726
(rt1->rt_mark ^ rt2->rt_mark) |
727
(rt1->rt_key_tos ^ rt2->rt_key_tos) |
728
(rt1->rt_oif ^ rt2->rt_oif) |
729
(rt1->rt_iif ^ rt2->rt_iif)) == 0;
730
}
731
732
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
733
{
734
return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
735
}
736
737
static inline int rt_is_expired(struct rtable *rth)
738
{
739
return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
740
}
741
742
/*
* Perform a full scan of the hash table and free all entries.
* Can be called from a softirq or a process.
* In the latter case, we want to reschedule if necessary.
*/
747
static void rt_do_flush(struct net *net, int process_context)
748
{
749
unsigned int i;
750
struct rtable *rth, *next;
751
752
for (i = 0; i <= rt_hash_mask; i++) {
753
struct rtable __rcu **pprev;
754
struct rtable *list;
755
756
if (process_context && need_resched())
757
cond_resched();
758
rth = rcu_dereference_raw(rt_hash_table[i].chain);
759
if (!rth)
760
continue;
761
762
spin_lock_bh(rt_hash_lock_addr(i));
763
764
list = NULL;
765
pprev = &rt_hash_table[i].chain;
766
rth = rcu_dereference_protected(*pprev,
767
lockdep_is_held(rt_hash_lock_addr(i)));
768
769
while (rth) {
770
next = rcu_dereference_protected(rth->dst.rt_next,
771
lockdep_is_held(rt_hash_lock_addr(i)));
772
773
if (!net ||
774
net_eq(dev_net(rth->dst.dev), net)) {
775
rcu_assign_pointer(*pprev, next);
776
rcu_assign_pointer(rth->dst.rt_next, list);
777
list = rth;
778
} else {
779
pprev = &rth->dst.rt_next;
780
}
781
rth = next;
782
}
783
784
spin_unlock_bh(rt_hash_lock_addr(i));
785
786
for (; list; list = next) {
787
next = rcu_dereference_protected(list->dst.rt_next, 1);
788
rt_free(list);
789
}
790
}
791
}
792
793
/*
* While freeing expired entries, we compute the average chain length
* and standard deviation, using fixed-point arithmetic.
* This gives an estimate of rt_chain_length_max:
* rt_chain_length_max = max(elasticity, AVG + 4*SD)
* We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
*/
800
801
#define FRACT_BITS 3
802
#define ONE (1UL << FRACT_BITS)
803
804
/*
* Given a hash chain and an item in this hash chain,
* find whether a previous entry has the same hash_inputs
* (but differs on tos, mark or oif).
* Returns 0 if an alias is found.
* Returns ONE if rth has no alias before itself.
*/
811
static int has_noalias(const struct rtable *head, const struct rtable *rth)
812
{
813
const struct rtable *aux = head;
814
815
while (aux != rth) {
816
if (compare_hash_inputs(aux, rth))
817
return 0;
818
aux = rcu_dereference_protected(aux->dst.rt_next, 1);
819
}
820
return ONE;
821
}
822
823
/*
* Perturb rt_genid by a small quantity [1..256].
* Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
* many times (2^24) without reusing a recent rt_genid.
* The Jenkins hash is strong enough that little changes of rt_genid are OK.
*/
829
static void rt_cache_invalidate(struct net *net)
830
{
831
unsigned char shuffle;
832
833
get_random_bytes(&shuffle, sizeof(shuffle));
834
atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
835
}
836
837
/*
838
* delay < 0 : invalidate cache (fast : entries will be deleted later)
839
* delay >= 0 : invalidate & flush cache (can be long)
840
*/
841
void rt_cache_flush(struct net *net, int delay)
842
{
843
rt_cache_invalidate(net);
844
if (delay >= 0)
845
rt_do_flush(net, !in_softirq());
846
}
847
848
/* Flush previously invalidated entries from the cache */
849
void rt_cache_flush_batch(struct net *net)
850
{
851
rt_do_flush(net, !in_softirq());
852
}
853
854
static void rt_emergency_hash_rebuild(struct net *net)
855
{
856
if (net_ratelimit())
857
printk(KERN_WARNING "Route hash chain too long!\n");
858
rt_cache_invalidate(net);
859
}
860
861
/*
Short description of GC goals.

We want an algorithm which keeps the routing cache at an equilibrium
point, where the number of aged-off entries stays approximately equal
to the number of newly generated ones.

The current expiration strength is the variable "expire".
We try to adjust it dynamically, so that when the network is idle
"expire" is large enough to keep enough warm entries, and when the
load increases it shrinks to limit the cache size.
*/
873
874
static int rt_garbage_collect(struct dst_ops *ops)
875
{
876
static unsigned long expire = RT_GC_TIMEOUT;
877
static unsigned long last_gc;
878
static int rover;
879
static int equilibrium;
880
struct rtable *rth;
881
struct rtable __rcu **rthp;
882
unsigned long now = jiffies;
883
int goal;
884
int entries = dst_entries_get_fast(&ipv4_dst_ops);
885
886
/*
* Garbage collection is pretty expensive,
* so do not run it too frequently.
*/
890
891
RT_CACHE_STAT_INC(gc_total);
892
893
if (now - last_gc < ip_rt_gc_min_interval &&
894
entries < ip_rt_max_size) {
895
RT_CACHE_STAT_INC(gc_ignored);
896
goto out;
897
}
898
899
entries = dst_entries_get_slow(&ipv4_dst_ops);
900
/* Calculate number of entries, which we want to expire now. */
901
goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
902
if (goal <= 0) {
903
if (equilibrium < ipv4_dst_ops.gc_thresh)
904
equilibrium = ipv4_dst_ops.gc_thresh;
905
goal = entries - equilibrium;
906
if (goal > 0) {
907
equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
908
goal = entries - equilibrium;
909
}
910
} else {
911
/* We are in dangerous area. Try to reduce cache really
912
* aggressively.
913
*/
914
goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
915
equilibrium = entries - goal;
916
}
917
918
if (now - last_gc >= ip_rt_gc_min_interval)
919
last_gc = now;
920
921
if (goal <= 0) {
922
equilibrium += goal;
923
goto work_done;
924
}
925
926
do {
927
int i, k;
928
929
for (i = rt_hash_mask, k = rover; i >= 0; i--) {
930
unsigned long tmo = expire;
931
932
k = (k + 1) & rt_hash_mask;
933
rthp = &rt_hash_table[k].chain;
934
spin_lock_bh(rt_hash_lock_addr(k));
935
while ((rth = rcu_dereference_protected(*rthp,
936
lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
937
if (!rt_is_expired(rth) &&
938
!rt_may_expire(rth, tmo, expire)) {
939
tmo >>= 1;
940
rthp = &rth->dst.rt_next;
941
continue;
942
}
943
*rthp = rth->dst.rt_next;
944
rt_free(rth);
945
goal--;
946
}
947
spin_unlock_bh(rt_hash_lock_addr(k));
948
if (goal <= 0)
949
break;
950
}
951
rover = k;
952
953
if (goal <= 0)
954
goto work_done;
955
956
/* Goal is not achieved. We stop the process if:

- expire has been reduced to zero; otherwise, expire is halved.
- the table is not full.
- we are called from an interrupt.
- the jiffies check is just a fallback/debug loop breaker.
We will not spin here for a long time in any case.
*/
964
965
RT_CACHE_STAT_INC(gc_goal_miss);
966
967
if (expire == 0)
968
break;
969
970
expire >>= 1;
971
972
if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
973
goto out;
974
} while (!in_softirq() && time_before_eq(jiffies, now));
975
976
if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
977
goto out;
978
if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
979
goto out;
980
if (net_ratelimit())
981
printk(KERN_WARNING "dst cache overflow\n");
982
RT_CACHE_STAT_INC(gc_dst_overflow);
983
return 1;
984
985
work_done:
986
expire += ip_rt_gc_min_interval;
987
if (expire > ip_rt_gc_timeout ||
988
dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
989
dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
990
expire = ip_rt_gc_timeout;
991
out: return 0;
992
}
993
994
/*
* Returns the number of entries in a hash chain that have different hash_inputs
*/
997
static int slow_chain_length(const struct rtable *head)
998
{
999
int length = 0;
1000
const struct rtable *rth = head;
1001
1002
while (rth) {
1003
length += has_noalias(head, rth);
1004
rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1005
}
1006
return length >> FRACT_BITS;
1007
}
1008
1009
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1010
struct sk_buff *skb, int ifindex)
1011
{
1012
struct rtable *rth, *cand;
1013
struct rtable __rcu **rthp, **candp;
1014
unsigned long now;
1015
u32 min_score;
1016
int chain_length;
1017
int attempts = !in_softirq();
1018
1019
restart:
1020
chain_length = 0;
1021
min_score = ~(u32)0;
1022
cand = NULL;
1023
candp = NULL;
1024
now = jiffies;
1025
1026
if (!rt_caching(dev_net(rt->dst.dev))) {
1027
/*
* If we're not caching, just tell the caller we
* were successful and don't touch the route. The
* caller holds the sole reference to the cache entry, and
* it will be released when the caller is done with it.
* If we drop it here, the callers have no way to resolve routes
* when we're not caching. Instead, just point *rp at rt, so
* the caller gets a single use out of the route.
* Note that we do rt_free on this new route entry, so that
* once its refcount hits zero, we are still able to reap it
* (Thanks Alexey).
* Note: To avoid expensive rcu stuff for this uncached dst,
* we set DST_NOCACHE so that dst_release() can free the dst without
* waiting for a grace period.
*/
1042
1043
rt->dst.flags |= DST_NOCACHE;
1044
if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1045
int err = arp_bind_neighbour(&rt->dst);
1046
if (err) {
1047
if (net_ratelimit())
1048
printk(KERN_WARNING
1049
"Neighbour table failure & not caching routes.\n");
1050
ip_rt_put(rt);
1051
return ERR_PTR(err);
1052
}
1053
}
1054
1055
goto skip_hashing;
1056
}
1057
1058
rthp = &rt_hash_table[hash].chain;
1059
1060
spin_lock_bh(rt_hash_lock_addr(hash));
1061
while ((rth = rcu_dereference_protected(*rthp,
1062
lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1063
if (rt_is_expired(rth)) {
1064
*rthp = rth->dst.rt_next;
1065
rt_free(rth);
1066
continue;
1067
}
1068
if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1069
/* Put it first */
1070
*rthp = rth->dst.rt_next;
1071
/*
1072
* Since lookup is lockfree, the deletion
1073
* must be visible to another weakly ordered CPU before
1074
* the insertion at the start of the hash chain.
1075
*/
1076
rcu_assign_pointer(rth->dst.rt_next,
1077
rt_hash_table[hash].chain);
1078
/*
1079
* Since lookup is lockfree, the update writes
1080
* must be ordered for consistency on SMP.
1081
*/
1082
rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1083
1084
dst_use(&rth->dst, now);
1085
spin_unlock_bh(rt_hash_lock_addr(hash));
1086
1087
rt_drop(rt);
1088
if (skb)
1089
skb_dst_set(skb, &rth->dst);
1090
return rth;
1091
}
1092
1093
if (!atomic_read(&rth->dst.__refcnt)) {
1094
u32 score = rt_score(rth);
1095
1096
if (score <= min_score) {
1097
cand = rth;
1098
candp = rthp;
1099
min_score = score;
1100
}
1101
}
1102
1103
chain_length++;
1104
1105
rthp = &rth->dst.rt_next;
1106
}
1107
1108
if (cand) {
1109
/* ip_rt_gc_elasticity used to be the average chain length;
* when it is exceeded, gc becomes really aggressive.
*
* The second limit is less certain. At the moment it allows
* only 2 entries per bucket. We will see.
*/
1115
if (chain_length > ip_rt_gc_elasticity) {
1116
*candp = cand->dst.rt_next;
1117
rt_free(cand);
1118
}
1119
} else {
1120
if (chain_length > rt_chain_length_max &&
1121
slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1122
struct net *net = dev_net(rt->dst.dev);
1123
int num = ++net->ipv4.current_rt_cache_rebuild_count;
1124
if (!rt_caching(net)) {
1125
printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1126
rt->dst.dev->name, num);
1127
}
1128
rt_emergency_hash_rebuild(net);
1129
spin_unlock_bh(rt_hash_lock_addr(hash));
1130
1131
hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1132
ifindex, rt_genid(net));
1133
goto restart;
1134
}
1135
}
1136
1137
/* Try to bind the route to arp only if it is an output
route or a unicast forwarding path.
*/
1140
if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1141
int err = arp_bind_neighbour(&rt->dst);
1142
if (err) {
1143
spin_unlock_bh(rt_hash_lock_addr(hash));
1144
1145
if (err != -ENOBUFS) {
1146
rt_drop(rt);
1147
return ERR_PTR(err);
1148
}
1149
1150
/* Neighbour tables are full and nothing
can be released. Try to shrink the route cache;
it most likely holds some neighbour records.
*/
1154
if (attempts-- > 0) {
1155
int saved_elasticity = ip_rt_gc_elasticity;
1156
int saved_int = ip_rt_gc_min_interval;
1157
ip_rt_gc_elasticity = 1;
1158
ip_rt_gc_min_interval = 0;
1159
rt_garbage_collect(&ipv4_dst_ops);
1160
ip_rt_gc_min_interval = saved_int;
1161
ip_rt_gc_elasticity = saved_elasticity;
1162
goto restart;
1163
}
1164
1165
if (net_ratelimit())
1166
printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1167
rt_drop(rt);
1168
return ERR_PTR(-ENOBUFS);
1169
}
1170
}
1171
1172
rt->dst.rt_next = rt_hash_table[hash].chain;
1173
1174
/*
1175
* Since lookup is lockfree, we must make sure
1176
* previous writes to rt are committed to memory
1177
* before making rt visible to other CPUS.
1178
*/
1179
rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1180
1181
spin_unlock_bh(rt_hash_lock_addr(hash));
1182
1183
skip_hashing:
1184
if (skb)
1185
skb_dst_set(skb, &rt->dst);
1186
return rt;
1187
}
1188
1189
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1190
1191
static u32 rt_peer_genid(void)
1192
{
1193
return atomic_read(&__rt_peer_genid);
1194
}
1195
1196
void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1197
{
1198
struct inet_peer *peer;
1199
1200
peer = inet_getpeer_v4(daddr, create);
1201
1202
if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1203
inet_putpeer(peer);
1204
else
1205
rt->rt_peer_genid = rt_peer_genid();
1206
}
1207
1208
/*
* Peer allocation may fail only in serious out-of-memory conditions. However,
* we can still generate some output.
* Random ID selection looks a bit dangerous because we have no chance of
* selecting an ID that is unique within a reasonable period of time.
* But a broken packet identifier may be better than no packet at all.
*/
1215
static void ip_select_fb_ident(struct iphdr *iph)
1216
{
1217
static DEFINE_SPINLOCK(ip_fb_id_lock);
1218
static u32 ip_fallback_id;
1219
u32 salt;
1220
1221
spin_lock_bh(&ip_fb_id_lock);
1222
salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1223
iph->id = htons(salt & 0xFFFF);
1224
ip_fallback_id = salt;
1225
spin_unlock_bh(&ip_fb_id_lock);
1226
}
1227
1228
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1229
{
1230
struct rtable *rt = (struct rtable *) dst;
1231
1232
if (rt) {
1233
if (rt->peer == NULL)
1234
rt_bind_peer(rt, rt->rt_dst, 1);
1235
1236
/* If a peer is attached to the destination, it is never detached,
so we need not grab a lock to dereference it.
*/
1239
if (rt->peer) {
1240
iph->id = htons(inet_getid(rt->peer, more));
1241
return;
1242
}
1243
} else
1244
printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1245
__builtin_return_address(0));
1246
1247
ip_select_fb_ident(iph);
1248
}
1249
EXPORT_SYMBOL(__ip_select_ident);
1250
1251
static void rt_del(unsigned hash, struct rtable *rt)
1252
{
1253
struct rtable __rcu **rthp;
1254
struct rtable *aux;
1255
1256
rthp = &rt_hash_table[hash].chain;
1257
spin_lock_bh(rt_hash_lock_addr(hash));
1258
ip_rt_put(rt);
1259
while ((aux = rcu_dereference_protected(*rthp,
1260
lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1261
if (aux == rt || rt_is_expired(aux)) {
1262
*rthp = aux->dst.rt_next;
1263
rt_free(aux);
1264
continue;
1265
}
1266
rthp = &aux->dst.rt_next;
1267
}
1268
spin_unlock_bh(rt_hash_lock_addr(hash));
1269
}
1270
1271
/* called in rcu_read_lock() section */
1272
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1273
__be32 saddr, struct net_device *dev)
1274
{
1275
struct in_device *in_dev = __in_dev_get_rcu(dev);
1276
struct inet_peer *peer;
1277
struct net *net;
1278
1279
if (!in_dev)
1280
return;
1281
1282
net = dev_net(dev);
1283
if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1284
ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1285
ipv4_is_zeronet(new_gw))
1286
goto reject_redirect;
1287
1288
if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1289
if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1290
goto reject_redirect;
1291
if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1292
goto reject_redirect;
1293
} else {
1294
if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1295
goto reject_redirect;
1296
}
1297
1298
peer = inet_getpeer_v4(daddr, 1);
1299
if (peer) {
1300
peer->redirect_learned.a4 = new_gw;
1301
1302
inet_putpeer(peer);
1303
1304
atomic_inc(&__rt_peer_genid);
1305
}
1306
return;
1307
1308
reject_redirect:
1309
#ifdef CONFIG_IP_ROUTE_VERBOSE
1310
if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1311
printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1312
" Advised path = %pI4 -> %pI4\n",
1313
&old_gw, dev->name, &new_gw,
1314
&saddr, &daddr);
1315
#endif
1316
;
1317
}
1318
1319
static bool peer_pmtu_expired(struct inet_peer *peer)
1320
{
1321
unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1322
1323
return orig &&
1324
time_after_eq(jiffies, orig) &&
1325
cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1326
}
1327
1328
static bool peer_pmtu_cleaned(struct inet_peer *peer)
1329
{
1330
unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1331
1332
return orig &&
1333
cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1334
}
1335
1336
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1337
{
1338
struct rtable *rt = (struct rtable *)dst;
1339
struct dst_entry *ret = dst;
1340
1341
if (rt) {
1342
if (dst->obsolete > 0) {
1343
ip_rt_put(rt);
1344
ret = NULL;
1345
} else if (rt->rt_flags & RTCF_REDIRECTED) {
1346
unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1347
rt->rt_oif,
1348
rt_genid(dev_net(dst->dev)));
1349
rt_del(hash, rt);
1350
ret = NULL;
1351
} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1352
dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1353
}
1354
}
1355
return ret;
1356
}
1357
1358
/*
* Algorithm:
* 1. The first ip_rt_redirect_number redirects are sent
* with exponential backoff, then we stop sending them at all,
* assuming that the host ignores our redirects.
* 2. If we did not see packets requiring redirects
* during ip_rt_redirect_silence, we assume that the host
* forgot the redirected route and start to send redirects again.
*
* This algorithm is much cheaper and more intelligent than dumb load limiting
* in icmp.c.
*
* NOTE. Do not forget to inhibit load limiting for redirects (redundant)
* and "frag. need" (breaks PMTU discovery) in icmp.c.
*/
1373
1374
void ip_rt_send_redirect(struct sk_buff *skb)
1375
{
1376
struct rtable *rt = skb_rtable(skb);
1377
struct in_device *in_dev;
1378
struct inet_peer *peer;
1379
int log_martians;
1380
1381
rcu_read_lock();
1382
in_dev = __in_dev_get_rcu(rt->dst.dev);
1383
if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1384
rcu_read_unlock();
1385
return;
1386
}
1387
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1388
rcu_read_unlock();
1389
1390
if (!rt->peer)
1391
rt_bind_peer(rt, rt->rt_dst, 1);
1392
peer = rt->peer;
1393
if (!peer) {
1394
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1395
return;
1396
}
1397
1398
/* No redirected packets during ip_rt_redirect_silence;
1399
* reset the algorithm.
1400
*/
1401
if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1402
peer->rate_tokens = 0;
1403
1404
/* Too many ignored redirects; do not send anything.
* Set dst.rate_last to the last seen redirected packet.
*/
1407
if (peer->rate_tokens >= ip_rt_redirect_number) {
1408
peer->rate_last = jiffies;
1409
return;
1410
}
1411
1412
/* Check for load limit; set rate_last to the latest sent
1413
* redirect.
1414
*/
1415
if (peer->rate_tokens == 0 ||
1416
time_after(jiffies,
1417
(peer->rate_last +
1418
(ip_rt_redirect_load << peer->rate_tokens)))) {
1419
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1420
peer->rate_last = jiffies;
1421
++peer->rate_tokens;
1422
#ifdef CONFIG_IP_ROUTE_VERBOSE
1423
if (log_martians &&
1424
peer->rate_tokens == ip_rt_redirect_number &&
1425
net_ratelimit())
1426
printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1427
&ip_hdr(skb)->saddr, rt->rt_iif,
1428
&rt->rt_dst, &rt->rt_gateway);
1429
#endif
1430
}
1431
}
1432
1433
static int ip_error(struct sk_buff *skb)
1434
{
1435
struct rtable *rt = skb_rtable(skb);
1436
struct inet_peer *peer;
1437
unsigned long now;
1438
bool send;
1439
int code;
1440
1441
switch (rt->dst.error) {
1442
case EINVAL:
1443
default:
1444
goto out;
1445
case EHOSTUNREACH:
1446
code = ICMP_HOST_UNREACH;
1447
break;
1448
case ENETUNREACH:
1449
code = ICMP_NET_UNREACH;
1450
IP_INC_STATS_BH(dev_net(rt->dst.dev),
1451
IPSTATS_MIB_INNOROUTES);
1452
break;
1453
case EACCES:
1454
code = ICMP_PKT_FILTERED;
1455
break;
1456
}
1457
1458
if (!rt->peer)
1459
rt_bind_peer(rt, rt->rt_dst, 1);
1460
peer = rt->peer;
1461
1462
send = true;
1463
if (peer) {
1464
now = jiffies;
1465
peer->rate_tokens += now - peer->rate_last;
1466
if (peer->rate_tokens > ip_rt_error_burst)
1467
peer->rate_tokens = ip_rt_error_burst;
1468
peer->rate_last = now;
1469
if (peer->rate_tokens >= ip_rt_error_cost)
1470
peer->rate_tokens -= ip_rt_error_cost;
1471
else
1472
send = false;
1473
}
1474
if (send)
1475
icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1476
1477
out: kfree_skb(skb);
1478
return 0;
1479
}
1480
1481
/*
1482
* The last two values are not from the RFC but
1483
* are needed for AMPRnet AX.25 paths.
1484
*/
1485
1486
static const unsigned short mtu_plateau[] =
1487
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1488
1489
static inline unsigned short guess_mtu(unsigned short old_mtu)
1490
{
1491
int i;
1492
1493
for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1494
if (old_mtu > mtu_plateau[i])
1495
return mtu_plateau[i];
1496
return 68;
1497
}
1498
1499
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1500
unsigned short new_mtu,
1501
struct net_device *dev)
1502
{
1503
unsigned short old_mtu = ntohs(iph->tot_len);
1504
unsigned short est_mtu = 0;
1505
struct inet_peer *peer;
1506
1507
peer = inet_getpeer_v4(iph->daddr, 1);
1508
if (peer) {
1509
unsigned short mtu = new_mtu;
1510
1511
if (new_mtu < 68 || new_mtu >= old_mtu) {
1512
/* BSD 4.2 derived systems incorrectly adjust
1513
* tot_len by the IP header length, and report
1514
* a zero MTU in the ICMP message.
1515
*/
1516
if (mtu == 0 &&
1517
old_mtu >= 68 + (iph->ihl << 2))
1518
old_mtu -= iph->ihl << 2;
1519
mtu = guess_mtu(old_mtu);
1520
}
1521
1522
if (mtu < ip_rt_min_pmtu)
1523
mtu = ip_rt_min_pmtu;
1524
if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1525
unsigned long pmtu_expires;
1526
1527
pmtu_expires = jiffies + ip_rt_mtu_expires;
1528
if (!pmtu_expires)
1529
pmtu_expires = 1UL;
1530
1531
est_mtu = mtu;
1532
peer->pmtu_learned = mtu;
1533
peer->pmtu_expires = pmtu_expires;
1534
}
1535
1536
inet_putpeer(peer);
1537
1538
atomic_inc(&__rt_peer_genid);
1539
}
1540
return est_mtu ? : new_mtu;
1541
}
1542
1543
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1544
{
1545
unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1546
1547
if (!expires)
1548
return;
1549
if (time_before(jiffies, expires)) {
1550
u32 orig_dst_mtu = dst_mtu(dst);
1551
if (peer->pmtu_learned < orig_dst_mtu) {
1552
if (!peer->pmtu_orig)
1553
peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1554
dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1555
}
1556
} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1557
dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1558
}
1559
1560
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1561
{
1562
struct rtable *rt = (struct rtable *) dst;
1563
struct inet_peer *peer;
1564
1565
dst_confirm(dst);
1566
1567
if (!rt->peer)
1568
rt_bind_peer(rt, rt->rt_dst, 1);
1569
peer = rt->peer;
1570
if (peer) {
1571
unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1572
1573
if (mtu < ip_rt_min_pmtu)
1574
mtu = ip_rt_min_pmtu;
1575
if (!pmtu_expires || mtu < peer->pmtu_learned) {
1576
1577
pmtu_expires = jiffies + ip_rt_mtu_expires;
1578
if (!pmtu_expires)
1579
pmtu_expires = 1UL;
1580
1581
peer->pmtu_learned = mtu;
1582
peer->pmtu_expires = pmtu_expires;
1583
1584
atomic_inc(&__rt_peer_genid);
1585
rt->rt_peer_genid = rt_peer_genid();
1586
}
1587
check_peer_pmtu(dst, peer);
1588
}
1589
}
1590
1591
static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1592
{
1593
struct rtable *rt = (struct rtable *) dst;
1594
__be32 orig_gw = rt->rt_gateway;
1595
1596
dst_confirm(&rt->dst);
1597
1598
neigh_release(rt->dst.neighbour);
1599
rt->dst.neighbour = NULL;
1600
1601
rt->rt_gateway = peer->redirect_learned.a4;
1602
if (arp_bind_neighbour(&rt->dst) ||
1603
!(rt->dst.neighbour->nud_state & NUD_VALID)) {
1604
if (rt->dst.neighbour)
1605
neigh_event_send(rt->dst.neighbour, NULL);
1606
rt->rt_gateway = orig_gw;
1607
return -EAGAIN;
1608
} else {
1609
rt->rt_flags |= RTCF_REDIRECTED;
1610
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1611
rt->dst.neighbour);
1612
}
1613
return 0;
1614
}
1615
1616
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1617
{
1618
struct rtable *rt = (struct rtable *) dst;
1619
1620
if (rt_is_expired(rt))
1621
return NULL;
1622
if (rt->rt_peer_genid != rt_peer_genid()) {
1623
struct inet_peer *peer;
1624
1625
if (!rt->peer)
1626
rt_bind_peer(rt, rt->rt_dst, 0);
1627
1628
peer = rt->peer;
1629
if (peer) {
1630
check_peer_pmtu(dst, peer);
1631
1632
if (peer->redirect_learned.a4 &&
1633
peer->redirect_learned.a4 != rt->rt_gateway) {
1634
if (check_peer_redir(dst, peer))
1635
return NULL;
1636
}
1637
}
1638
1639
rt->rt_peer_genid = rt_peer_genid();
1640
}
1641
return dst;
1642
}
1643
1644
static void ipv4_dst_destroy(struct dst_entry *dst)
1645
{
1646
struct rtable *rt = (struct rtable *) dst;
1647
struct inet_peer *peer = rt->peer;
1648
1649
if (rt->fi) {
1650
fib_info_put(rt->fi);
1651
rt->fi = NULL;
1652
}
1653
if (peer) {
1654
rt->peer = NULL;
1655
inet_putpeer(peer);
1656
}
1657
}
1658
1659
1660
static void ipv4_link_failure(struct sk_buff *skb)
1661
{
1662
struct rtable *rt;
1663
1664
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1665
1666
rt = skb_rtable(skb);
1667
if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1668
dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1669
}
1670
1671
static int ip_rt_bug(struct sk_buff *skb)
1672
{
1673
printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1674
&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1675
skb->dev ? skb->dev->name : "?");
1676
kfree_skb(skb);
1677
WARN_ON(1);
1678
return 0;
1679
}
1680
1681
/*
We do not cache the source address of the outgoing interface,
because it is used only by the IP RR, TS and SRR options,
so it is out of the fast path.

BTW remember: "addr" is allowed to be unaligned
in IP options!
*/
1689
1690
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1691
{
1692
__be32 src;
1693
1694
if (rt_is_output_route(rt))
1695
src = ip_hdr(skb)->saddr;
1696
else {
1697
struct fib_result res;
1698
struct flowi4 fl4;
1699
struct iphdr *iph;
1700
1701
iph = ip_hdr(skb);
1702
1703
memset(&fl4, 0, sizeof(fl4));
1704
fl4.daddr = iph->daddr;
1705
fl4.saddr = iph->saddr;
1706
fl4.flowi4_tos = iph->tos;
1707
fl4.flowi4_oif = rt->dst.dev->ifindex;
1708
fl4.flowi4_iif = skb->dev->ifindex;
1709
fl4.flowi4_mark = skb->mark;
1710
1711
rcu_read_lock();
1712
if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1713
src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1714
else
1715
src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1716
RT_SCOPE_UNIVERSE);
1717
rcu_read_unlock();
1718
}
1719
memcpy(addr, &src, 4);
1720
}
1721
1722
#ifdef CONFIG_IP_ROUTE_CLASSID
1723
static void set_class_tag(struct rtable *rt, u32 tag)
1724
{
1725
if (!(rt->dst.tclassid & 0xFFFF))
1726
rt->dst.tclassid |= tag & 0xFFFF;
1727
if (!(rt->dst.tclassid & 0xFFFF0000))
1728
rt->dst.tclassid |= tag & 0xFFFF0000;
1729
}
1730
#endif
1731
1732
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1733
{
1734
unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1735
1736
if (advmss == 0) {
1737
advmss = max_t(unsigned int, dst->dev->mtu - 40,
1738
ip_rt_min_advmss);
1739
if (advmss > 65535 - 40)
1740
advmss = 65535 - 40;
1741
}
1742
return advmss;
1743
}
1744
1745
static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1746
{
1747
unsigned int mtu = dst->dev->mtu;
1748
1749
if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1750
const struct rtable *rt = (const struct rtable *) dst;
1751
1752
if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1753
mtu = 576;
1754
}
1755
1756
if (mtu > IP_MAX_MTU)
1757
mtu = IP_MAX_MTU;
1758
1759
return mtu;
1760
}
1761
1762
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1763
struct fib_info *fi)
1764
{
1765
struct inet_peer *peer;
1766
int create = 0;
1767
1768
/* If a peer entry exists for this destination, we must hook
1769
* it up in order to get at cached metrics.
1770
*/
1771
if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1772
create = 1;
1773
1774
rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1775
if (peer) {
1776
rt->rt_peer_genid = rt_peer_genid();
1777
if (inet_metrics_new(peer))
1778
memcpy(peer->metrics, fi->fib_metrics,
1779
sizeof(u32) * RTAX_MAX);
1780
dst_init_metrics(&rt->dst, peer->metrics, false);
1781
1782
check_peer_pmtu(&rt->dst, peer);
1783
if (peer->redirect_learned.a4 &&
1784
peer->redirect_learned.a4 != rt->rt_gateway) {
1785
rt->rt_gateway = peer->redirect_learned.a4;
1786
rt->rt_flags |= RTCF_REDIRECTED;
1787
}
1788
} else {
1789
if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1790
rt->fi = fi;
1791
atomic_inc(&fi->fib_clntref);
1792
}
1793
dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1794
}
1795
}
1796
1797
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1798
const struct fib_result *res,
1799
struct fib_info *fi, u16 type, u32 itag)
1800
{
1801
struct dst_entry *dst = &rt->dst;
1802
1803
if (fi) {
1804
if (FIB_RES_GW(*res) &&
1805
FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1806
rt->rt_gateway = FIB_RES_GW(*res);
1807
rt_init_metrics(rt, fl4, fi);
1808
#ifdef CONFIG_IP_ROUTE_CLASSID
1809
dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1810
#endif
1811
}
1812
1813
if (dst_mtu(dst) > IP_MAX_MTU)
1814
dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1815
if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1816
dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1817
1818
#ifdef CONFIG_IP_ROUTE_CLASSID
1819
#ifdef CONFIG_IP_MULTIPLE_TABLES
1820
set_class_tag(rt, fib_rules_tclass(res));
1821
#endif
1822
set_class_tag(rt, itag);
1823
#endif
1824
}
1825
1826
static struct rtable *rt_dst_alloc(struct net_device *dev,
1827
bool nopolicy, bool noxfrm)
1828
{
1829
return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1830
DST_HOST |
1831
(nopolicy ? DST_NOPOLICY : 0) |
1832
(noxfrm ? DST_NOXFRM : 0));
1833
}
1834
1835
/* called in rcu_read_lock() section */
1836
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1837
u8 tos, struct net_device *dev, int our)
1838
{
1839
unsigned int hash;
1840
struct rtable *rth;
1841
__be32 spec_dst;
1842
struct in_device *in_dev = __in_dev_get_rcu(dev);
1843
u32 itag = 0;
1844
int err;
1845
1846
/* Primary sanity checks. */
1847
1848
if (in_dev == NULL)
1849
return -EINVAL;
1850
1851
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1852
ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1853
goto e_inval;
1854
1855
if (ipv4_is_zeronet(saddr)) {
1856
if (!ipv4_is_local_multicast(daddr))
1857
goto e_inval;
1858
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1859
} else {
1860
err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1861
&itag);
1862
if (err < 0)
1863
goto e_err;
1864
}
1865
rth = rt_dst_alloc(init_net.loopback_dev,
1866
IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1867
if (!rth)
1868
goto e_nobufs;
1869
1870
#ifdef CONFIG_IP_ROUTE_CLASSID
1871
rth->dst.tclassid = itag;
1872
#endif
1873
rth->dst.output = ip_rt_bug;
1874
1875
rth->rt_key_dst = daddr;
1876
rth->rt_key_src = saddr;
1877
rth->rt_genid = rt_genid(dev_net(dev));
1878
rth->rt_flags = RTCF_MULTICAST;
1879
rth->rt_type = RTN_MULTICAST;
1880
rth->rt_key_tos = tos;
1881
rth->rt_dst = daddr;
1882
rth->rt_src = saddr;
1883
rth->rt_route_iif = dev->ifindex;
1884
rth->rt_iif = dev->ifindex;
1885
rth->rt_oif = 0;
1886
rth->rt_mark = skb->mark;
1887
rth->rt_gateway = daddr;
1888
rth->rt_spec_dst= spec_dst;
1889
rth->rt_peer_genid = 0;
1890
rth->peer = NULL;
1891
rth->fi = NULL;
1892
if (our) {
1893
rth->dst.input= ip_local_deliver;
1894
rth->rt_flags |= RTCF_LOCAL;
1895
}
1896
1897
#ifdef CONFIG_IP_MROUTE
1898
if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1899
rth->dst.input = ip_mr_input;
1900
#endif
1901
RT_CACHE_STAT_INC(in_slow_mc);
1902
1903
hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1904
rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1905
return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1906
1907
e_nobufs:
1908
return -ENOBUFS;
1909
e_inval:
1910
return -EINVAL;
1911
e_err:
1912
return err;
1913
}
1914
1915
1916
static void ip_handle_martian_source(struct net_device *dev,
1917
struct in_device *in_dev,
1918
struct sk_buff *skb,
1919
__be32 daddr,
1920
__be32 saddr)
1921
{
1922
RT_CACHE_STAT_INC(in_martian_src);
1923
#ifdef CONFIG_IP_ROUTE_VERBOSE
1924
if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1925
/*
* RFC1812 recommendation: if the source is martian,
* the only hint is the MAC header.
*/
1929
printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1930
&daddr, &saddr, dev->name);
1931
if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1932
int i;
1933
const unsigned char *p = skb_mac_header(skb);
1934
printk(KERN_WARNING "ll header: ");
1935
for (i = 0; i < dev->hard_header_len; i++, p++) {
1936
printk("%02x", *p);
1937
if (i < (dev->hard_header_len - 1))
1938
printk(":");
1939
}
1940
printk("\n");
1941
}
1942
}
1943
#endif
1944
}
1945
1946
/* called in rcu_read_lock() section */
1947
static int __mkroute_input(struct sk_buff *skb,
1948
const struct fib_result *res,
1949
struct in_device *in_dev,
1950
__be32 daddr, __be32 saddr, u32 tos,
1951
struct rtable **result)
1952
{
1953
struct rtable *rth;
1954
int err;
1955
struct in_device *out_dev;
1956
unsigned int flags = 0;
1957
__be32 spec_dst;
1958
u32 itag;
1959
1960
/* get a working reference to the output device */
1961
out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1962
if (out_dev == NULL) {
1963
if (net_ratelimit())
1964
printk(KERN_CRIT "Bug in ip_route_input" \
1965
"_slow(). Please, report\n");
1966
return -EINVAL;
1967
}
1968
1969
1970
err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1971
in_dev->dev, &spec_dst, &itag);
1972
if (err < 0) {
1973
ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1974
saddr);
1975
1976
goto cleanup;
1977
}
1978
1979
if (err)
1980
flags |= RTCF_DIRECTSRC;
1981
1982
if (out_dev == in_dev && err &&
1983
(IN_DEV_SHARED_MEDIA(out_dev) ||
1984
inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1985
flags |= RTCF_DOREDIRECT;
1986
1987
if (skb->protocol != htons(ETH_P_IP)) {
1988
/* Not IP (i.e. ARP). Do not create route, if it is
1989
* invalid for proxy arp. DNAT routes are always valid.
1990
*
1991
* Proxy arp feature have been extended to allow, ARP
1992
* replies back to the same interface, to support
1993
* Private VLAN switch technologies. See arp.c.
1994
*/
1995
if (out_dev == in_dev &&
1996
IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1997
err = -EINVAL;
1998
goto cleanup;
1999
}
2000
}
2001
2002
rth = rt_dst_alloc(out_dev->dev,
2003
IN_DEV_CONF_GET(in_dev, NOPOLICY),
2004
IN_DEV_CONF_GET(out_dev, NOXFRM));
2005
if (!rth) {
2006
err = -ENOBUFS;
2007
goto cleanup;
2008
}
2009
2010
rth->rt_key_dst = daddr;
2011
rth->rt_key_src = saddr;
2012
rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2013
rth->rt_flags = flags;
2014
rth->rt_type = res->type;
2015
rth->rt_key_tos = tos;
2016
rth->rt_dst = daddr;
2017
rth->rt_src = saddr;
2018
rth->rt_route_iif = in_dev->dev->ifindex;
2019
rth->rt_iif = in_dev->dev->ifindex;
2020
rth->rt_oif = 0;
2021
rth->rt_mark = skb->mark;
2022
rth->rt_gateway = daddr;
2023
rth->rt_spec_dst= spec_dst;
2024
rth->rt_peer_genid = 0;
2025
rth->peer = NULL;
2026
rth->fi = NULL;
2027
2028
rth->dst.input = ip_forward;
2029
rth->dst.output = ip_output;
2030
2031
rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2032
2033
*result = rth;
2034
err = 0;
2035
cleanup:
2036
return err;
2037
}
2038
2039
static int ip_mkroute_input(struct sk_buff *skb,
2040
struct fib_result *res,
2041
const struct flowi4 *fl4,
2042
struct in_device *in_dev,
2043
__be32 daddr, __be32 saddr, u32 tos)
2044
{
2045
struct rtable* rth = NULL;
2046
int err;
2047
unsigned hash;
2048
2049
#ifdef CONFIG_IP_ROUTE_MULTIPATH
2050
if (res->fi && res->fi->fib_nhs > 1)
2051
fib_select_multipath(res);
2052
#endif
2053
2054
/* create a routing cache entry */
2055
err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2056
if (err)
2057
return err;
2058
2059
/* put it into the cache */
2060
hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2061
rt_genid(dev_net(rth->dst.dev)));
2062
rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2063
if (IS_ERR(rth))
2064
return PTR_ERR(rth);
2065
return 0;
2066
}
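/*
 * Illustrative sketch (editor's addition, not part of route.c): both the
 * input and output paths bucket cache entries with rt_hash(daddr, saddr,
 * ifindex, genid), which in the kernel mixes those words with jhash and masks
 * the result with rt_hash_mask. The toy mixer below only stands in for that;
 * the constants and addresses are made up, and the point is the shape of the
 * key and the power-of-two mask.
 */
#include <stdint.h>
#include <stdio.h>

static unsigned int toy_rt_hash(uint32_t daddr, uint32_t saddr,
                                int ifindex, int genid, unsigned int mask)
{
        uint32_t h = daddr;

        /* Cheap avalanche; the real code relies on jhash instead. */
        h ^= saddr + 0x9e3779b9u + (h << 6) + (h >> 2);
        h ^= (uint32_t)ifindex + 0x9e3779b9u + (h << 6) + (h >> 2);
        h ^= (uint32_t)genid + 0x9e3779b9u + (h << 6) + (h >> 2);
        return h & mask;                /* mask must be 2^n - 1, like rt_hash_mask */
}

int main(void)
{
        unsigned int mask = (1u << 15) - 1;     /* e.g. 32768 buckets */

        printf("bucket = %u\n",
               toy_rt_hash(0xc0a80101u, 0xc0a80102u, 2, 42, mask));
        return 0;
}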
2067
2068
/*
2069
* NOTE. We drop all the packets that have local source
2070
* addresses, because every properly looped back packet
2071
* must have correct destination already attached by output routine.
2072
*
2073
* Such approach solves two big problems:
2074
* 1. Non-simplex devices are handled properly.
2075
* 2. IP spoofing attempts are filtered with a 100% guarantee.
2076
* called with rcu_read_lock()
2077
*/
2078
2079
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2080
u8 tos, struct net_device *dev)
2081
{
2082
struct fib_result res;
2083
struct in_device *in_dev = __in_dev_get_rcu(dev);
2084
struct flowi4 fl4;
2085
unsigned flags = 0;
2086
u32 itag = 0;
2087
struct rtable * rth;
2088
unsigned hash;
2089
__be32 spec_dst;
2090
int err = -EINVAL;
2091
struct net * net = dev_net(dev);
2092
2093
/* IP on this device is disabled. */
2094
2095
if (!in_dev)
2096
goto out;
2097
2098
/* Check for the most weird martians, which may not be detected
2099
by fib_lookup.
2100
*/
2101
2102
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2103
ipv4_is_loopback(saddr))
2104
goto martian_source;
2105
2106
if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2107
goto brd_input;
2108
2109
/* Accept zero addresses only for limited broadcast;
2110
* I do not even know whether to fix it or not. Waiting for complaints :-)
2111
*/
2112
if (ipv4_is_zeronet(saddr))
2113
goto martian_source;
2114
2115
if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2116
goto martian_destination;
2117
2118
/*
2119
* Now we are ready to route packet.
2120
*/
2121
fl4.flowi4_oif = 0;
2122
fl4.flowi4_iif = dev->ifindex;
2123
fl4.flowi4_mark = skb->mark;
2124
fl4.flowi4_tos = tos;
2125
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2126
fl4.daddr = daddr;
2127
fl4.saddr = saddr;
2128
err = fib_lookup(net, &fl4, &res);
2129
if (err != 0) {
2130
if (!IN_DEV_FORWARD(in_dev))
2131
goto e_hostunreach;
2132
goto no_route;
2133
}
2134
2135
RT_CACHE_STAT_INC(in_slow_tot);
2136
2137
if (res.type == RTN_BROADCAST)
2138
goto brd_input;
2139
2140
if (res.type == RTN_LOCAL) {
2141
err = fib_validate_source(skb, saddr, daddr, tos,
2142
net->loopback_dev->ifindex,
2143
dev, &spec_dst, &itag);
2144
if (err < 0)
2145
goto martian_source_keep_err;
2146
if (err)
2147
flags |= RTCF_DIRECTSRC;
2148
spec_dst = daddr;
2149
goto local_input;
2150
}
2151
2152
if (!IN_DEV_FORWARD(in_dev))
2153
goto e_hostunreach;
2154
if (res.type != RTN_UNICAST)
2155
goto martian_destination;
2156
2157
err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2158
out: return err;
2159
2160
brd_input:
2161
if (skb->protocol != htons(ETH_P_IP))
2162
goto e_inval;
2163
2164
if (ipv4_is_zeronet(saddr))
2165
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2166
else {
2167
err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2168
&itag);
2169
if (err < 0)
2170
goto martian_source_keep_err;
2171
if (err)
2172
flags |= RTCF_DIRECTSRC;
2173
}
2174
flags |= RTCF_BROADCAST;
2175
res.type = RTN_BROADCAST;
2176
RT_CACHE_STAT_INC(in_brd);
2177
2178
local_input:
2179
rth = rt_dst_alloc(net->loopback_dev,
2180
IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2181
if (!rth)
2182
goto e_nobufs;
2183
2184
rth->dst.input= ip_local_deliver;
2185
rth->dst.output= ip_rt_bug;
2186
#ifdef CONFIG_IP_ROUTE_CLASSID
2187
rth->dst.tclassid = itag;
2188
#endif
2189
2190
rth->rt_key_dst = daddr;
2191
rth->rt_key_src = saddr;
2192
rth->rt_genid = rt_genid(net);
2193
rth->rt_flags = flags|RTCF_LOCAL;
2194
rth->rt_type = res.type;
2195
rth->rt_key_tos = tos;
2196
rth->rt_dst = daddr;
2197
rth->rt_src = saddr;
2198
#ifdef CONFIG_IP_ROUTE_CLASSID
2199
rth->dst.tclassid = itag;
2200
#endif
2201
rth->rt_route_iif = dev->ifindex;
2202
rth->rt_iif = dev->ifindex;
2203
rth->rt_oif = 0;
2204
rth->rt_mark = skb->mark;
2205
rth->rt_gateway = daddr;
2206
rth->rt_spec_dst= spec_dst;
2207
rth->rt_peer_genid = 0;
2208
rth->peer = NULL;
2209
rth->fi = NULL;
2210
if (res.type == RTN_UNREACHABLE) {
2211
rth->dst.input= ip_error;
2212
rth->dst.error= -err;
2213
rth->rt_flags &= ~RTCF_LOCAL;
2214
}
2215
hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2216
rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2217
err = 0;
2218
if (IS_ERR(rth))
2219
err = PTR_ERR(rth);
2220
goto out;
2221
2222
no_route:
2223
RT_CACHE_STAT_INC(in_no_route);
2224
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2225
res.type = RTN_UNREACHABLE;
2226
if (err == -ESRCH)
2227
err = -ENETUNREACH;
2228
goto local_input;
2229
2230
/*
2231
* Do not cache martian addresses: they should be logged (RFC1812)
2232
*/
2233
martian_destination:
2234
RT_CACHE_STAT_INC(in_martian_dst);
2235
#ifdef CONFIG_IP_ROUTE_VERBOSE
2236
if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2237
printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2238
&daddr, &saddr, dev->name);
2239
#endif
2240
2241
e_hostunreach:
2242
err = -EHOSTUNREACH;
2243
goto out;
2244
2245
e_inval:
2246
err = -EINVAL;
2247
goto out;
2248
2249
e_nobufs:
2250
err = -ENOBUFS;
2251
goto out;
2252
2253
martian_source:
2254
err = -EINVAL;
2255
martian_source_keep_err:
2256
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2257
goto out;
2258
}
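/*
 * Illustrative sketch (editor's addition, not part of route.c): the "weird
 * martians" checks at the top of ip_route_input_slow() boil down to a few
 * prefix tests on the addresses. The helpers below restate the ipv4_is_*()
 * predicates in host byte order for a standalone userspace demo; it is
 * simplified in that the real code first lets an all-zero src/dst pair
 * through to the limited-broadcast path before rejecting zeronet sources.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static int is_multicast(uint32_t a) { return (a & 0xf0000000u) == 0xe0000000u; }
static int is_lbcast(uint32_t a)    { return a == 0xffffffffu; }
static int is_loopback(uint32_t a)  { return (a & 0xff000000u) == 0x7f000000u; }
static int is_zeronet(uint32_t a)   { return (a & 0xff000000u) == 0x00000000u; }

static int martian_source(uint32_t saddr)
{
        return is_multicast(saddr) || is_lbcast(saddr) ||
               is_loopback(saddr) || is_zeronet(saddr);
}

int main(void)
{
        const char *addrs[] = { "224.0.0.1", "127.0.0.1", "0.0.0.5", "10.0.0.1" };
        unsigned int i;

        for (i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++) {
                struct in_addr in;

                inet_pton(AF_INET, addrs[i], &in);
                printf("%-12s martian source? %s\n", addrs[i],
                       martian_source(ntohl(in.s_addr)) ? "yes" : "no");
        }
        return 0;
}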
2259
2260
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2261
u8 tos, struct net_device *dev, bool noref)
2262
{
2263
struct rtable * rth;
2264
unsigned hash;
2265
int iif = dev->ifindex;
2266
struct net *net;
2267
int res;
2268
2269
net = dev_net(dev);
2270
2271
rcu_read_lock();
2272
2273
if (!rt_caching(net))
2274
goto skip_cache;
2275
2276
tos &= IPTOS_RT_MASK;
2277
hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2278
2279
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2280
rth = rcu_dereference(rth->dst.rt_next)) {
2281
if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2282
((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2283
(rth->rt_iif ^ iif) |
2284
rth->rt_oif |
2285
(rth->rt_key_tos ^ tos)) == 0 &&
2286
rth->rt_mark == skb->mark &&
2287
net_eq(dev_net(rth->dst.dev), net) &&
2288
!rt_is_expired(rth)) {
2289
if (noref) {
2290
dst_use_noref(&rth->dst, jiffies);
2291
skb_dst_set_noref(skb, &rth->dst);
2292
} else {
2293
dst_use(&rth->dst, jiffies);
2294
skb_dst_set(skb, &rth->dst);
2295
}
2296
RT_CACHE_STAT_INC(in_hit);
2297
rcu_read_unlock();
2298
return 0;
2299
}
2300
RT_CACHE_STAT_INC(in_hlist_search);
2301
}
2302
2303
skip_cache:
2304
/* Multicast recognition logic is moved from route cache to here.
2305
The problem was that too many Ethernet cards have broken/missing
2306
hardware multicast filters :-( As a result, a host on a multicast
2307
network acquires a lot of useless route cache entries, sort of
2308
SDR messages from all over the world. Now we try to get rid of them.
2309
Really, provided the software IP multicast filter is organized
2310
reasonably (at least, hashed), it does not result in a slowdown
2311
compared with route cache reject entries.
2312
Note that multicast routers are not affected, because a
2313
route cache entry is created eventually.
2314
*/
2315
if (ipv4_is_multicast(daddr)) {
2316
struct in_device *in_dev = __in_dev_get_rcu(dev);
2317
2318
if (in_dev) {
2319
int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2320
ip_hdr(skb)->protocol);
2321
if (our
2322
#ifdef CONFIG_IP_MROUTE
2323
||
2324
(!ipv4_is_local_multicast(daddr) &&
2325
IN_DEV_MFORWARD(in_dev))
2326
#endif
2327
) {
2328
int res = ip_route_input_mc(skb, daddr, saddr,
2329
tos, dev, our);
2330
rcu_read_unlock();
2331
return res;
2332
}
2333
}
2334
rcu_read_unlock();
2335
return -EINVAL;
2336
}
2337
res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2338
rcu_read_unlock();
2339
return res;
2340
}
2341
EXPORT_SYMBOL(ip_route_input_common);
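/*
 * Illustrative sketch (editor's addition, not part of route.c): the cache
 * lookup in ip_route_input_common() compares the whole key without branching
 * by XORing each cached field with the wanted one, ORing the differences and
 * testing the result against zero once. The cut-down structure and values
 * below are invented; only the comparison idiom mirrors the loop above.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_key {
        uint32_t dst, src;
        int iif, oif;
        uint32_t tos;
};

static int toy_key_match(const struct toy_key *cached, const struct toy_key *want)
{
        return ((cached->dst ^ want->dst) |
                (cached->src ^ want->src) |
                (uint32_t)(cached->iif ^ want->iif) |
                (uint32_t)cached->oif |         /* an input route must have oif == 0 */
                (cached->tos ^ want->tos)) == 0;
}

int main(void)
{
        struct toy_key cached = { 0xc0a80001u, 0xc0a80002u, 2, 0, 0x10 };
        struct toy_key want = cached;

        printf("match: %d\n", toy_key_match(&cached, &want));
        want.tos = 0x00;
        printf("match: %d\n", toy_key_match(&cached, &want));
        return 0;
}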
2342
2343
/* called with rcu_read_lock() */
2344
static struct rtable *__mkroute_output(const struct fib_result *res,
2345
const struct flowi4 *fl4,
2346
__be32 orig_daddr, __be32 orig_saddr,
2347
int orig_oif, struct net_device *dev_out,
2348
unsigned int flags)
2349
{
2350
struct fib_info *fi = res->fi;
2351
u32 tos = RT_FL_TOS(fl4);
2352
struct in_device *in_dev;
2353
u16 type = res->type;
2354
struct rtable *rth;
2355
2356
if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2357
return ERR_PTR(-EINVAL);
2358
2359
if (ipv4_is_lbcast(fl4->daddr))
2360
type = RTN_BROADCAST;
2361
else if (ipv4_is_multicast(fl4->daddr))
2362
type = RTN_MULTICAST;
2363
else if (ipv4_is_zeronet(fl4->daddr))
2364
return ERR_PTR(-EINVAL);
2365
2366
if (dev_out->flags & IFF_LOOPBACK)
2367
flags |= RTCF_LOCAL;
2368
2369
in_dev = __in_dev_get_rcu(dev_out);
2370
if (!in_dev)
2371
return ERR_PTR(-EINVAL);
2372
2373
if (type == RTN_BROADCAST) {
2374
flags |= RTCF_BROADCAST | RTCF_LOCAL;
2375
fi = NULL;
2376
} else if (type == RTN_MULTICAST) {
2377
flags |= RTCF_MULTICAST | RTCF_LOCAL;
2378
if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2379
fl4->flowi4_proto))
2380
flags &= ~RTCF_LOCAL;
2381
/* If a multicast route does not exist, use the
2382
* default one, but do not gateway in this case.
2383
* Yes, it is a hack.
2384
*/
2385
if (fi && res->prefixlen < 4)
2386
fi = NULL;
2387
}
2388
2389
rth = rt_dst_alloc(dev_out,
2390
IN_DEV_CONF_GET(in_dev, NOPOLICY),
2391
IN_DEV_CONF_GET(in_dev, NOXFRM));
2392
if (!rth)
2393
return ERR_PTR(-ENOBUFS);
2394
2395
rth->dst.output = ip_output;
2396
2397
rth->rt_key_dst = orig_daddr;
2398
rth->rt_key_src = orig_saddr;
2399
rth->rt_genid = rt_genid(dev_net(dev_out));
2400
rth->rt_flags = flags;
2401
rth->rt_type = type;
2402
rth->rt_key_tos = tos;
2403
rth->rt_dst = fl4->daddr;
2404
rth->rt_src = fl4->saddr;
2405
rth->rt_route_iif = 0;
2406
rth->rt_iif = orig_oif ? : dev_out->ifindex;
2407
rth->rt_oif = orig_oif;
2408
rth->rt_mark = fl4->flowi4_mark;
2409
rth->rt_gateway = fl4->daddr;
2410
rth->rt_spec_dst= fl4->saddr;
2411
rth->rt_peer_genid = 0;
2412
rth->peer = NULL;
2413
rth->fi = NULL;
2414
2415
RT_CACHE_STAT_INC(out_slow_tot);
2416
2417
if (flags & RTCF_LOCAL) {
2418
rth->dst.input = ip_local_deliver;
2419
rth->rt_spec_dst = fl4->daddr;
2420
}
2421
if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2422
rth->rt_spec_dst = fl4->saddr;
2423
if (flags & RTCF_LOCAL &&
2424
!(dev_out->flags & IFF_LOOPBACK)) {
2425
rth->dst.output = ip_mc_output;
2426
RT_CACHE_STAT_INC(out_slow_mc);
2427
}
2428
#ifdef CONFIG_IP_MROUTE
2429
if (type == RTN_MULTICAST) {
2430
if (IN_DEV_MFORWARD(in_dev) &&
2431
!ipv4_is_local_multicast(fl4->daddr)) {
2432
rth->dst.input = ip_mr_input;
2433
rth->dst.output = ip_mc_output;
2434
}
2435
}
2436
#endif
2437
}
2438
2439
rt_set_nexthop(rth, fl4, res, fi, type, 0);
2440
2441
return rth;
2442
}
2443
2444
/*
2445
* Major route resolver routine.
2446
* called with rcu_read_lock();
2447
*/
2448
2449
static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2450
{
2451
struct net_device *dev_out = NULL;
2452
u32 tos = RT_FL_TOS(fl4);
2453
unsigned int flags = 0;
2454
struct fib_result res;
2455
struct rtable *rth;
2456
__be32 orig_daddr;
2457
__be32 orig_saddr;
2458
int orig_oif;
2459
2460
res.fi = NULL;
2461
#ifdef CONFIG_IP_MULTIPLE_TABLES
2462
res.r = NULL;
2463
#endif
2464
2465
orig_daddr = fl4->daddr;
2466
orig_saddr = fl4->saddr;
2467
orig_oif = fl4->flowi4_oif;
2468
2469
fl4->flowi4_iif = net->loopback_dev->ifindex;
2470
fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2471
fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2472
RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2473
2474
rcu_read_lock();
2475
if (fl4->saddr) {
2476
rth = ERR_PTR(-EINVAL);
2477
if (ipv4_is_multicast(fl4->saddr) ||
2478
ipv4_is_lbcast(fl4->saddr) ||
2479
ipv4_is_zeronet(fl4->saddr))
2480
goto out;
2481
2482
/* I removed check for oif == dev_out->oif here.
2483
It was wrong for two reasons:
2484
1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2485
is assigned to multiple interfaces.
2486
2. Moreover, we are allowed to send packets with saddr
2487
of another iface. --ANK
2488
*/
2489
2490
if (fl4->flowi4_oif == 0 &&
2491
(ipv4_is_multicast(fl4->daddr) ||
2492
ipv4_is_lbcast(fl4->daddr))) {
2493
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2494
dev_out = __ip_dev_find(net, fl4->saddr, false);
2495
if (dev_out == NULL)
2496
goto out;
2497
2498
/* Special hack: the user can direct multicasts
2499
and limited broadcast via the necessary interface
2500
without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2501
This hack is not just for fun; it allows
2502
vic, vat and friends to work.
2503
They bind the socket to loopback, set ttl to zero
2504
and expect that it will work.
2505
From the viewpoint of the routing cache they are broken,
2506
because we are not allowed to build a multicast path
2507
with loopback source addr (look, the routing cache
2508
cannot know that ttl is zero, so the packet
2509
will not leave this host and the route is valid).
2510
Luckily, this hack is a good workaround.
2511
*/
2512
2513
fl4->flowi4_oif = dev_out->ifindex;
2514
goto make_route;
2515
}
2516
2517
if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2518
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2519
if (!__ip_dev_find(net, fl4->saddr, false))
2520
goto out;
2521
}
2522
}
2523
2524
2525
if (fl4->flowi4_oif) {
2526
dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2527
rth = ERR_PTR(-ENODEV);
2528
if (dev_out == NULL)
2529
goto out;
2530
2531
/* RACE: Check return value of inet_select_addr instead. */
2532
if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2533
rth = ERR_PTR(-ENETUNREACH);
2534
goto out;
2535
}
2536
if (ipv4_is_local_multicast(fl4->daddr) ||
2537
ipv4_is_lbcast(fl4->daddr)) {
2538
if (!fl4->saddr)
2539
fl4->saddr = inet_select_addr(dev_out, 0,
2540
RT_SCOPE_LINK);
2541
goto make_route;
2542
}
2543
if (fl4->saddr) {
2544
if (ipv4_is_multicast(fl4->daddr))
2545
fl4->saddr = inet_select_addr(dev_out, 0,
2546
fl4->flowi4_scope);
2547
else if (!fl4->daddr)
2548
fl4->saddr = inet_select_addr(dev_out, 0,
2549
RT_SCOPE_HOST);
2550
}
2551
}
2552
2553
if (!fl4->daddr) {
2554
fl4->daddr = fl4->saddr;
2555
if (!fl4->daddr)
2556
fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2557
dev_out = net->loopback_dev;
2558
fl4->flowi4_oif = net->loopback_dev->ifindex;
2559
res.type = RTN_LOCAL;
2560
flags |= RTCF_LOCAL;
2561
goto make_route;
2562
}
2563
2564
if (fib_lookup(net, fl4, &res)) {
2565
res.fi = NULL;
2566
if (fl4->flowi4_oif) {
2567
/* Apparently, the routing tables are wrong. Assume
2568
that the destination is on link.
2569
2570
WHY? DW.
2571
Because we are allowed to send to iface
2572
even if it has NO routes and NO assigned
2573
addresses. When oif is specified, routing
2574
tables are looked up with only one purpose:
2575
to catch if destination is gatewayed, rather than
2576
direct. Moreover, if MSG_DONTROUTE is set,
2577
we send the packet, ignoring both routing tables
2578
and ifaddr state. --ANK
2579
2580
2581
We could make it even if oif is unknown,
2582
likely IPv6, but we do not.
2583
*/
2584
2585
if (fl4->saddr == 0)
2586
fl4->saddr = inet_select_addr(dev_out, 0,
2587
RT_SCOPE_LINK);
2588
res.type = RTN_UNICAST;
2589
goto make_route;
2590
}
2591
rth = ERR_PTR(-ENETUNREACH);
2592
goto out;
2593
}
2594
2595
if (res.type == RTN_LOCAL) {
2596
if (!fl4->saddr) {
2597
if (res.fi->fib_prefsrc)
2598
fl4->saddr = res.fi->fib_prefsrc;
2599
else
2600
fl4->saddr = fl4->daddr;
2601
}
2602
dev_out = net->loopback_dev;
2603
fl4->flowi4_oif = dev_out->ifindex;
2604
res.fi = NULL;
2605
flags |= RTCF_LOCAL;
2606
goto make_route;
2607
}
2608
2609
#ifdef CONFIG_IP_ROUTE_MULTIPATH
2610
if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2611
fib_select_multipath(&res);
2612
else
2613
#endif
2614
if (!res.prefixlen &&
2615
res.table->tb_num_default > 1 &&
2616
res.type == RTN_UNICAST && !fl4->flowi4_oif)
2617
fib_select_default(&res);
2618
2619
if (!fl4->saddr)
2620
fl4->saddr = FIB_RES_PREFSRC(net, res);
2621
2622
dev_out = FIB_RES_DEV(res);
2623
fl4->flowi4_oif = dev_out->ifindex;
2624
2625
2626
make_route:
2627
rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2628
dev_out, flags);
2629
if (!IS_ERR(rth)) {
2630
unsigned int hash;
2631
2632
hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2633
rt_genid(dev_net(dev_out)));
2634
rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2635
}
2636
2637
out:
2638
rcu_read_unlock();
2639
return rth;
2640
}
2641
2642
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2643
{
2644
struct rtable *rth;
2645
unsigned int hash;
2646
2647
if (!rt_caching(net))
2648
goto slow_output;
2649
2650
hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2651
2652
rcu_read_lock_bh();
2653
for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2654
rth = rcu_dereference_bh(rth->dst.rt_next)) {
2655
if (rth->rt_key_dst == flp4->daddr &&
2656
rth->rt_key_src == flp4->saddr &&
2657
rt_is_output_route(rth) &&
2658
rth->rt_oif == flp4->flowi4_oif &&
2659
rth->rt_mark == flp4->flowi4_mark &&
2660
!((rth->rt_key_tos ^ flp4->flowi4_tos) &
2661
(IPTOS_RT_MASK | RTO_ONLINK)) &&
2662
net_eq(dev_net(rth->dst.dev), net) &&
2663
!rt_is_expired(rth)) {
2664
dst_use(&rth->dst, jiffies);
2665
RT_CACHE_STAT_INC(out_hit);
2666
rcu_read_unlock_bh();
2667
if (!flp4->saddr)
2668
flp4->saddr = rth->rt_src;
2669
if (!flp4->daddr)
2670
flp4->daddr = rth->rt_dst;
2671
return rth;
2672
}
2673
RT_CACHE_STAT_INC(out_hlist_search);
2674
}
2675
rcu_read_unlock_bh();
2676
2677
slow_output:
2678
return ip_route_output_slow(net, flp4);
2679
}
2680
EXPORT_SYMBOL_GPL(__ip_route_output_key);
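/*
 * Illustrative sketch (editor's addition, not part of route.c): the cached
 * output lookup above deliberately ignores TOS bits that do not influence
 * routing; only the bits covered by IPTOS_RT_MASK | RTO_ONLINK have to agree.
 * The constants below assume the usual values (IPTOS_RT_MASK == 0x1C,
 * RTO_ONLINK == 0x01); check the headers before relying on them.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_IPTOS_RT_MASK       0x1C    /* assumed: IPTOS_TOS_MASK & ~3 */
#define TOY_RTO_ONLINK          0x01    /* assumed: on-link flag in the low bit */

static int tos_compatible(uint32_t cached_tos, uint32_t wanted_tos)
{
        return ((cached_tos ^ wanted_tos) &
                (TOY_IPTOS_RT_MASK | TOY_RTO_ONLINK)) == 0;
}

int main(void)
{
        printf("%d\n", tos_compatible(0x10, 0x10));     /* identical: 1            */
        printf("%d\n", tos_compatible(0x10, 0x14));     /* routing bits differ: 0  */
        printf("%d\n", tos_compatible(0x10, 0xD0));     /* only precedence bits: 1 */
        return 0;
}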
2681
2682
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2683
{
2684
return NULL;
2685
}
2686
2687
static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2688
{
2689
return 0;
2690
}
2691
2692
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2693
{
2694
}
2695
2696
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2697
unsigned long old)
2698
{
2699
return NULL;
2700
}
2701
2702
static struct dst_ops ipv4_dst_blackhole_ops = {
2703
.family = AF_INET,
2704
.protocol = cpu_to_be16(ETH_P_IP),
2705
.destroy = ipv4_dst_destroy,
2706
.check = ipv4_blackhole_dst_check,
2707
.default_mtu = ipv4_blackhole_default_mtu,
2708
.default_advmss = ipv4_default_advmss,
2709
.update_pmtu = ipv4_rt_blackhole_update_pmtu,
2710
.cow_metrics = ipv4_rt_blackhole_cow_metrics,
2711
};
2712
2713
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2714
{
2715
struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2716
struct rtable *ort = (struct rtable *) dst_orig;
2717
2718
if (rt) {
2719
struct dst_entry *new = &rt->dst;
2720
2721
new->__use = 1;
2722
new->input = dst_discard;
2723
new->output = dst_discard;
2724
dst_copy_metrics(new, &ort->dst);
2725
2726
new->dev = ort->dst.dev;
2727
if (new->dev)
2728
dev_hold(new->dev);
2729
2730
rt->rt_key_dst = ort->rt_key_dst;
2731
rt->rt_key_src = ort->rt_key_src;
2732
rt->rt_key_tos = ort->rt_key_tos;
2733
rt->rt_route_iif = ort->rt_route_iif;
2734
rt->rt_iif = ort->rt_iif;
2735
rt->rt_oif = ort->rt_oif;
2736
rt->rt_mark = ort->rt_mark;
2737
2738
rt->rt_genid = rt_genid(net);
2739
rt->rt_flags = ort->rt_flags;
2740
rt->rt_type = ort->rt_type;
2741
rt->rt_dst = ort->rt_dst;
2742
rt->rt_src = ort->rt_src;
2743
rt->rt_gateway = ort->rt_gateway;
2744
rt->rt_spec_dst = ort->rt_spec_dst;
2745
rt->peer = ort->peer;
2746
if (rt->peer)
2747
atomic_inc(&rt->peer->refcnt);
2748
rt->fi = ort->fi;
2749
if (rt->fi)
2750
atomic_inc(&rt->fi->fib_clntref);
2751
2752
dst_free(new);
2753
}
2754
2755
dst_release(dst_orig);
2756
2757
return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2758
}
2759
2760
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2761
struct sock *sk)
2762
{
2763
struct rtable *rt = __ip_route_output_key(net, flp4);
2764
2765
if (IS_ERR(rt))
2766
return rt;
2767
2768
if (flp4->flowi4_proto)
2769
rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2770
flowi4_to_flowi(flp4),
2771
sk, 0);
2772
2773
return rt;
2774
}
2775
EXPORT_SYMBOL_GPL(ip_route_output_flow);
2776
2777
static int rt_fill_info(struct net *net,
2778
struct sk_buff *skb, u32 pid, u32 seq, int event,
2779
int nowait, unsigned int flags)
2780
{
2781
struct rtable *rt = skb_rtable(skb);
2782
struct rtmsg *r;
2783
struct nlmsghdr *nlh;
2784
long expires = 0;
2785
const struct inet_peer *peer = rt->peer;
2786
u32 id = 0, ts = 0, tsage = 0, error;
2787
2788
nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2789
if (nlh == NULL)
2790
return -EMSGSIZE;
2791
2792
r = nlmsg_data(nlh);
2793
r->rtm_family = AF_INET;
2794
r->rtm_dst_len = 32;
2795
r->rtm_src_len = 0;
2796
r->rtm_tos = rt->rt_key_tos;
2797
r->rtm_table = RT_TABLE_MAIN;
2798
NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2799
r->rtm_type = rt->rt_type;
2800
r->rtm_scope = RT_SCOPE_UNIVERSE;
2801
r->rtm_protocol = RTPROT_UNSPEC;
2802
r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2803
if (rt->rt_flags & RTCF_NOTIFY)
2804
r->rtm_flags |= RTM_F_NOTIFY;
2805
2806
NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2807
2808
if (rt->rt_key_src) {
2809
r->rtm_src_len = 32;
2810
NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2811
}
2812
if (rt->dst.dev)
2813
NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2814
#ifdef CONFIG_IP_ROUTE_CLASSID
2815
if (rt->dst.tclassid)
2816
NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2817
#endif
2818
if (rt_is_input_route(rt))
2819
NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2820
else if (rt->rt_src != rt->rt_key_src)
2821
NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2822
2823
if (rt->rt_dst != rt->rt_gateway)
2824
NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2825
2826
if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2827
goto nla_put_failure;
2828
2829
if (rt->rt_mark)
2830
NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2831
2832
error = rt->dst.error;
2833
if (peer) {
2834
inet_peer_refcheck(rt->peer);
2835
id = atomic_read(&peer->ip_id_count) & 0xffff;
2836
if (peer->tcp_ts_stamp) {
2837
ts = peer->tcp_ts;
2838
tsage = get_seconds() - peer->tcp_ts_stamp;
2839
}
2840
expires = ACCESS_ONCE(peer->pmtu_expires);
2841
if (expires)
2842
expires -= jiffies;
2843
}
2844
2845
if (rt_is_input_route(rt)) {
2846
#ifdef CONFIG_IP_MROUTE
2847
__be32 dst = rt->rt_dst;
2848
2849
if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2850
IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2851
int err = ipmr_get_route(net, skb,
2852
rt->rt_src, rt->rt_dst,
2853
r, nowait);
2854
if (err <= 0) {
2855
if (!nowait) {
2856
if (err == 0)
2857
return 0;
2858
goto nla_put_failure;
2859
} else {
2860
if (err == -EMSGSIZE)
2861
goto nla_put_failure;
2862
error = err;
2863
}
2864
}
2865
} else
2866
#endif
2867
NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2868
}
2869
2870
if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2871
expires, error) < 0)
2872
goto nla_put_failure;
2873
2874
return nlmsg_end(skb, nlh);
2875
2876
nla_put_failure:
2877
nlmsg_cancel(skb, nlh);
2878
return -EMSGSIZE;
2879
}
2880
2881
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2882
{
2883
struct net *net = sock_net(in_skb->sk);
2884
struct rtmsg *rtm;
2885
struct nlattr *tb[RTA_MAX+1];
2886
struct rtable *rt = NULL;
2887
__be32 dst = 0;
2888
__be32 src = 0;
2889
u32 iif;
2890
int err;
2891
int mark;
2892
struct sk_buff *skb;
2893
2894
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2895
if (err < 0)
2896
goto errout;
2897
2898
rtm = nlmsg_data(nlh);
2899
2900
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2901
if (skb == NULL) {
2902
err = -ENOBUFS;
2903
goto errout;
2904
}
2905
2906
/* Reserve room for dummy headers; this skb can pass
2907
through a good chunk of the routing engine.
2908
*/
2909
skb_reset_mac_header(skb);
2910
skb_reset_network_header(skb);
2911
2912
/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2913
ip_hdr(skb)->protocol = IPPROTO_ICMP;
2914
skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2915
2916
src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2917
dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2918
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2919
mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2920
2921
if (iif) {
2922
struct net_device *dev;
2923
2924
dev = __dev_get_by_index(net, iif);
2925
if (dev == NULL) {
2926
err = -ENODEV;
2927
goto errout_free;
2928
}
2929
2930
skb->protocol = htons(ETH_P_IP);
2931
skb->dev = dev;
2932
skb->mark = mark;
2933
local_bh_disable();
2934
err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2935
local_bh_enable();
2936
2937
rt = skb_rtable(skb);
2938
if (err == 0 && rt->dst.error)
2939
err = -rt->dst.error;
2940
} else {
2941
struct flowi4 fl4 = {
2942
.daddr = dst,
2943
.saddr = src,
2944
.flowi4_tos = rtm->rtm_tos,
2945
.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2946
.flowi4_mark = mark,
2947
};
2948
rt = ip_route_output_key(net, &fl4);
2949
2950
err = 0;
2951
if (IS_ERR(rt))
2952
err = PTR_ERR(rt);
2953
}
2954
2955
if (err)
2956
goto errout_free;
2957
2958
skb_dst_set(skb, &rt->dst);
2959
if (rtm->rtm_flags & RTM_F_NOTIFY)
2960
rt->rt_flags |= RTCF_NOTIFY;
2961
2962
err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2963
RTM_NEWROUTE, 0, 0);
2964
if (err <= 0)
2965
goto errout_free;
2966
2967
err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2968
errout:
2969
return err;
2970
2971
errout_free:
2972
kfree_skb(skb);
2973
goto errout;
2974
}
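/*
 * Illustrative userspace sketch (editor's addition, not part of route.c):
 * inet_rtm_getroute() above is the kernel side of an RTM_GETROUTE request.
 * A minimal client asking "which route would 8.8.8.8 take?" can look like
 * this; it only sends the query and reports how many reply bytes came back,
 * leaving attribute parsing to the reader. Error handling is abbreviated and
 * the destination address is arbitrary.
 */
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct {
                struct nlmsghdr nlh;
                struct rtmsg rtm;
                char attrs[64];
        } req;
        struct sockaddr_nl kernel_addr = { .nl_family = AF_NETLINK };
        struct rtattr *rta;
        char reply[8192];
        ssize_t n;
        int fd;

        fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
        if (fd < 0)
                return 1;

        memset(&req, 0, sizeof(req));
        req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
        req.nlh.nlmsg_type = RTM_GETROUTE;
        req.nlh.nlmsg_flags = NLM_F_REQUEST;
        req.rtm.rtm_family = AF_INET;
        req.rtm.rtm_dst_len = 32;

        /* Append an RTA_DST attribute carrying the destination address. */
        rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
        rta->rta_type = RTA_DST;
        rta->rta_len = RTA_LENGTH(sizeof(struct in_addr));
        inet_pton(AF_INET, "8.8.8.8", RTA_DATA(rta));
        req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;

        if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
                   (struct sockaddr *)&kernel_addr, sizeof(kernel_addr)) < 0)
                return 1;

        n = recv(fd, reply, sizeof(reply), 0);
        printf("got %zd bytes of RTM_NEWROUTE reply\n", n);
        close(fd);
        return 0;
}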
2975
2976
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2977
{
2978
struct rtable *rt;
2979
int h, s_h;
2980
int idx, s_idx;
2981
struct net *net;
2982
2983
net = sock_net(skb->sk);
2984
2985
s_h = cb->args[0];
2986
if (s_h < 0)
2987
s_h = 0;
2988
s_idx = idx = cb->args[1];
2989
for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2990
if (!rt_hash_table[h].chain)
2991
continue;
2992
rcu_read_lock_bh();
2993
for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
2994
rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2995
if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
2996
continue;
2997
if (rt_is_expired(rt))
2998
continue;
2999
skb_dst_set_noref(skb, &rt->dst);
3000
if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3001
cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3002
1, NLM_F_MULTI) <= 0) {
3003
skb_dst_drop(skb);
3004
rcu_read_unlock_bh();
3005
goto done;
3006
}
3007
skb_dst_drop(skb);
3008
}
3009
rcu_read_unlock_bh();
3010
}
3011
3012
done:
3013
cb->args[0] = h;
3014
cb->args[1] = idx;
3015
return skb->len;
3016
}
3017
3018
void ip_rt_multicast_event(struct in_device *in_dev)
3019
{
3020
rt_cache_flush(dev_net(in_dev->dev), 0);
3021
}
3022
3023
#ifdef CONFIG_SYSCTL
3024
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3025
void __user *buffer,
3026
size_t *lenp, loff_t *ppos)
3027
{
3028
if (write) {
3029
int flush_delay;
3030
ctl_table ctl;
3031
struct net *net;
3032
3033
memcpy(&ctl, __ctl, sizeof(ctl));
3034
ctl.data = &flush_delay;
3035
proc_dointvec(&ctl, write, buffer, lenp, ppos);
3036
3037
net = (struct net *)__ctl->extra1;
3038
rt_cache_flush(net, flush_delay);
3039
return 0;
3040
}
3041
3042
return -EINVAL;
3043
}
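/*
 * Illustrative userspace sketch (editor's addition, not part of route.c):
 * ipv4_sysctl_rtcache_flush() above runs when something writes to
 * /proc/sys/net/ipv4/route/flush (net.ipv4.route.flush); the written integer
 * is parsed as a flush delay and handed to rt_cache_flush(). Triggering it
 * from userspace is just a file write, as root.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Conventionally a small value such as "0" is written to flush promptly. */
        if (write(fd, "0\n", 2) != 2)
                perror("write");
        close(fd);
        return 0;
}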
3044
3045
static ctl_table ipv4_route_table[] = {
3046
{
3047
.procname = "gc_thresh",
3048
.data = &ipv4_dst_ops.gc_thresh,
3049
.maxlen = sizeof(int),
3050
.mode = 0644,
3051
.proc_handler = proc_dointvec,
3052
},
3053
{
3054
.procname = "max_size",
3055
.data = &ip_rt_max_size,
3056
.maxlen = sizeof(int),
3057
.mode = 0644,
3058
.proc_handler = proc_dointvec,
3059
},
3060
{
3061
/* Deprecated. Use gc_min_interval_ms */
3062
3063
.procname = "gc_min_interval",
3064
.data = &ip_rt_gc_min_interval,
3065
.maxlen = sizeof(int),
3066
.mode = 0644,
3067
.proc_handler = proc_dointvec_jiffies,
3068
},
3069
{
3070
.procname = "gc_min_interval_ms",
3071
.data = &ip_rt_gc_min_interval,
3072
.maxlen = sizeof(int),
3073
.mode = 0644,
3074
.proc_handler = proc_dointvec_ms_jiffies,
3075
},
3076
{
3077
.procname = "gc_timeout",
3078
.data = &ip_rt_gc_timeout,
3079
.maxlen = sizeof(int),
3080
.mode = 0644,
3081
.proc_handler = proc_dointvec_jiffies,
3082
},
3083
{
3084
.procname = "gc_interval",
3085
.data = &ip_rt_gc_interval,
3086
.maxlen = sizeof(int),
3087
.mode = 0644,
3088
.proc_handler = proc_dointvec_jiffies,
3089
},
3090
{
3091
.procname = "redirect_load",
3092
.data = &ip_rt_redirect_load,
3093
.maxlen = sizeof(int),
3094
.mode = 0644,
3095
.proc_handler = proc_dointvec,
3096
},
3097
{
3098
.procname = "redirect_number",
3099
.data = &ip_rt_redirect_number,
3100
.maxlen = sizeof(int),
3101
.mode = 0644,
3102
.proc_handler = proc_dointvec,
3103
},
3104
{
3105
.procname = "redirect_silence",
3106
.data = &ip_rt_redirect_silence,
3107
.maxlen = sizeof(int),
3108
.mode = 0644,
3109
.proc_handler = proc_dointvec,
3110
},
3111
{
3112
.procname = "error_cost",
3113
.data = &ip_rt_error_cost,
3114
.maxlen = sizeof(int),
3115
.mode = 0644,
3116
.proc_handler = proc_dointvec,
3117
},
3118
{
3119
.procname = "error_burst",
3120
.data = &ip_rt_error_burst,
3121
.maxlen = sizeof(int),
3122
.mode = 0644,
3123
.proc_handler = proc_dointvec,
3124
},
3125
{
3126
.procname = "gc_elasticity",
3127
.data = &ip_rt_gc_elasticity,
3128
.maxlen = sizeof(int),
3129
.mode = 0644,
3130
.proc_handler = proc_dointvec,
3131
},
3132
{
3133
.procname = "mtu_expires",
3134
.data = &ip_rt_mtu_expires,
3135
.maxlen = sizeof(int),
3136
.mode = 0644,
3137
.proc_handler = proc_dointvec_jiffies,
3138
},
3139
{
3140
.procname = "min_pmtu",
3141
.data = &ip_rt_min_pmtu,
3142
.maxlen = sizeof(int),
3143
.mode = 0644,
3144
.proc_handler = proc_dointvec,
3145
},
3146
{
3147
.procname = "min_adv_mss",
3148
.data = &ip_rt_min_advmss,
3149
.maxlen = sizeof(int),
3150
.mode = 0644,
3151
.proc_handler = proc_dointvec,
3152
},
3153
{ }
3154
};
3155
3156
static struct ctl_table empty[1];
3157
3158
static struct ctl_table ipv4_skeleton[] =
3159
{
3160
{ .procname = "route",
3161
.mode = 0555, .child = ipv4_route_table},
3162
{ .procname = "neigh",
3163
.mode = 0555, .child = empty},
3164
{ }
3165
};
3166
3167
static __net_initdata struct ctl_path ipv4_path[] = {
3168
{ .procname = "net", },
3169
{ .procname = "ipv4", },
3170
{ },
3171
};
3172
3173
static struct ctl_table ipv4_route_flush_table[] = {
3174
{
3175
.procname = "flush",
3176
.maxlen = sizeof(int),
3177
.mode = 0200,
3178
.proc_handler = ipv4_sysctl_rtcache_flush,
3179
},
3180
{ },
3181
};
3182
3183
static __net_initdata struct ctl_path ipv4_route_path[] = {
3184
{ .procname = "net", },
3185
{ .procname = "ipv4", },
3186
{ .procname = "route", },
3187
{ },
3188
};
3189
3190
static __net_init int sysctl_route_net_init(struct net *net)
3191
{
3192
struct ctl_table *tbl;
3193
3194
tbl = ipv4_route_flush_table;
3195
if (!net_eq(net, &init_net)) {
3196
tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3197
if (tbl == NULL)
3198
goto err_dup;
3199
}
3200
tbl[0].extra1 = net;
3201
3202
net->ipv4.route_hdr =
3203
register_net_sysctl_table(net, ipv4_route_path, tbl);
3204
if (net->ipv4.route_hdr == NULL)
3205
goto err_reg;
3206
return 0;
3207
3208
err_reg:
3209
if (tbl != ipv4_route_flush_table)
3210
kfree(tbl);
3211
err_dup:
3212
return -ENOMEM;
3213
}
3214
3215
static __net_exit void sysctl_route_net_exit(struct net *net)
3216
{
3217
struct ctl_table *tbl;
3218
3219
tbl = net->ipv4.route_hdr->ctl_table_arg;
3220
unregister_net_sysctl_table(net->ipv4.route_hdr);
3221
BUG_ON(tbl == ipv4_route_flush_table);
3222
kfree(tbl);
3223
}
3224
3225
static __net_initdata struct pernet_operations sysctl_route_ops = {
3226
.init = sysctl_route_net_init,
3227
.exit = sysctl_route_net_exit,
3228
};
3229
#endif
3230
3231
static __net_init int rt_genid_init(struct net *net)
3232
{
3233
get_random_bytes(&net->ipv4.rt_genid,
3234
sizeof(net->ipv4.rt_genid));
3235
get_random_bytes(&net->ipv4.dev_addr_genid,
3236
sizeof(net->ipv4.dev_addr_genid));
3237
return 0;
3238
}
3239
3240
static __net_initdata struct pernet_operations rt_genid_ops = {
3241
.init = rt_genid_init,
3242
};
3243
3244
3245
#ifdef CONFIG_IP_ROUTE_CLASSID
3246
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3247
#endif /* CONFIG_IP_ROUTE_CLASSID */
3248
3249
static __initdata unsigned long rhash_entries;
3250
static int __init set_rhash_entries(char *str)
3251
{
3252
if (!str)
3253
return 0;
3254
rhash_entries = simple_strtoul(str, &str, 0);
3255
return 1;
3256
}
3257
__setup("rhash_entries=", set_rhash_entries);
3258
3259
int __init ip_rt_init(void)
3260
{
3261
int rc = 0;
3262
3263
#ifdef CONFIG_IP_ROUTE_CLASSID
3264
ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3265
if (!ip_rt_acct)
3266
panic("IP: failed to allocate ip_rt_acct\n");
3267
#endif
3268
3269
ipv4_dst_ops.kmem_cachep =
3270
kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3271
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3272
3273
ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3274
3275
if (dst_entries_init(&ipv4_dst_ops) < 0)
3276
panic("IP: failed to allocate ipv4_dst_ops counter\n");
3277
3278
if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3279
panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3280
3281
rt_hash_table = (struct rt_hash_bucket *)
3282
alloc_large_system_hash("IP route cache",
3283
sizeof(struct rt_hash_bucket),
3284
rhash_entries,
3285
(totalram_pages >= 128 * 1024) ?
3286
15 : 17,
3287
0,
3288
&rt_hash_log,
3289
&rt_hash_mask,
3290
rhash_entries ? 0 : 512 * 1024);
3291
memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3292
rt_hash_lock_init();
3293
3294
ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3295
ip_rt_max_size = (rt_hash_mask + 1) * 16;
3296
3297
devinet_init();
3298
ip_fib_init();
3299
3300
if (ip_rt_proc_init())
3301
printk(KERN_ERR "Unable to create route proc files\n");
3302
#ifdef CONFIG_XFRM
3303
xfrm_init();
3304
xfrm4_init(ip_rt_max_size);
3305
#endif
3306
rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3307
3308
#ifdef CONFIG_SYSCTL
3309
register_pernet_subsys(&sysctl_route_ops);
3310
#endif
3311
register_pernet_subsys(&rt_genid_ops);
3312
return rc;
3313
}
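/*
 * Illustrative sketch (editor's addition, not part of route.c): ip_rt_init()
 * above lets alloc_large_system_hash() pick a power-of-two bucket count and
 * hands back rt_hash_mask and rt_hash_log, then derives gc_thresh and
 * ip_rt_max_size from the bucket count. The arithmetic on its own looks like
 * this; the 1 MiB budget and 8-byte bucket size are made-up stand-ins.
 */
#include <stdio.h>

int main(void)
{
        unsigned long budget = 1UL << 20;       /* assumed memory budget */
        unsigned long bucket_size = 8;          /* stand-in for sizeof(struct rt_hash_bucket) */
        unsigned long buckets = 1, log = 0;

        /* Largest power of two whose table fits in the budget. */
        while ((buckets << 1) * bucket_size <= budget) {
                buckets <<= 1;
                log++;
        }

        printf("buckets=%lu log=%lu mask=%#lx\n", buckets, log, buckets - 1);
        printf("gc_thresh=%lu max_size=%lu\n", buckets, buckets * 16);
        return 0;
}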
3314
3315
#ifdef CONFIG_SYSCTL
3316
/*
3317
* We really need to sanitize the damn ipv4 init order, then all
3318
* this nonsense will go away.
3319
*/
3320
void __init ip_static_sysctl_init(void)
3321
{
3322
register_sysctl_paths(ipv4_path, ipv4_skeleton);
3323
}
3324
#endif
3325
3326