/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>

#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <net/atmclip.h>
#include <net/secure_seq.h>
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly	= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void __rt_garbage_collect(struct work_struct *w);
static DECLARE_WORK(rt_gc_worker, __rt_garbage_collect);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else if (rt->fi) {
			fib_info_put(rt->fi);
			rt->fi = NULL;
		}
	}
	return p;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};

EXPORT_SYMBOL(ip_tos2prio);
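/*
 * Illustrative sketch, not part of the original file: the table above is
 * indexed with the four TOS bits, i.e. IPTOS_TOS(tos) >> 1 (the lookup
 * rt_tos2priority() in <net/route.h> performs).  A hypothetical helper,
 * assuming IPTOS_TOS from <linux/ip.h>:
 */
static inline char example_tos2prio(u8 tos)
{
	/* e.g. IPTOS_LOWDELAY (0x10) indexes entry 8, TC_PRIO_INTERACTIVE */
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}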
/*
 * The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    possibility of being called from softirq path.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
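/*
 * Illustrative note, not part of the original file: because the table size
 * is a power of two, rt_hash_mask is (size - 1) and the jhash result can be
 * reduced with a mask instead of a modulo, e.g.:
 *
 *	bucket = jhash_3words(daddr, saddr, idx, genid) & rt_hash_mask;
 *
 * Folding the per-netns genid into the hash is also what lets
 * rt_cache_invalidate() retire all old entries at once: rt_is_expired()
 * below simply compares each entry's rt_genid against the current one.
 */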
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
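/*
 * Illustrative note, not part of the original file: ORing the XORed key
 * fields and testing the result against zero compares all keys with a
 * single branch; it is logically equivalent to
 *
 *	rt1->rt_key_dst == rt2->rt_key_dst &&
 *	rt1->rt_key_src == rt2->rt_key_src && ...
 *
 * but compiles to straight-line code on the hot lookup path.
 */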
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
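/*
 * Illustrative sketch, not part of the original file: with FRACT_BITS = 3,
 * per-chain lengths are accumulated in units of ONE = 8, and the final
 * shift converts back to whole entries.  A hypothetical helper mirroring
 * the computation done at the end of rt_check_expire():
 */
static inline unsigned long example_chain_limit(unsigned long sum,
						unsigned long sum2,
						unsigned long samples)
{
	unsigned long avg = sum / samples;	/* fixed-point average */
	unsigned long sd  = int_sqrt(sum2 / samples - avg * avg);

	return (avg + 4 * sd) >> FRACT_BITS;	/* AVG + 4*SD, in entries */
}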
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
	inetpeer_invalidate_tree(AF_INET);
}
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit the cache size.
 */
static void __do_rt_garbage_collect(int elasticity, int min_interval)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	static DEFINE_SPINLOCK(rt_gc_lock);
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	spin_lock_bh(&rt_gc_lock);

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - table is not full.
		   - we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	goto out;

work_done:
	expire += min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:
	spin_unlock_bh(&rt_gc_lock);
}
static void __rt_garbage_collect(struct work_struct *w)
{
	__do_rt_garbage_collect(ip_rt_gc_elasticity, ip_rt_gc_min_interval);
}

static int rt_garbage_collect(struct dst_ops *ops)
{
	if (!work_pending(&rt_gc_worker))
		schedule_work(&rt_gc_worker);

	if (dst_entries_get_fast(&ipv4_dst_ops) >= ip_rt_max_size ||
	    dst_entries_get_slow(&ipv4_dst_ops) >= ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_dst_overflow);
		return 1;
	}
	return 0;
}
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}
static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int		attempts = 1;

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (!in_softirq() && attempts-- > 0) {
				static DEFINE_SPINLOCK(lock);

				if (spin_trylock(&lock)) {
					__do_rt_garbage_collect(1, 0);
					spin_unlock(&lock);
				} else {
					spin_unlock_wait(&lock);
				}
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
#define IP_IDENTS_SZ 2048u
struct ip_ident_bucket {
	atomic_t	id;
	u32		stamp32;
};

static struct ip_ident_bucket *ip_idents __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = ACCESS_ONCE(bucket->stamp32);
	u32 now = (u32)jiffies;
	u32 delta = 0;

	if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
		u64 x = random32();

		x *= (now - old);
		delta = (u32)(x >> 32);
	}

	return atomic_add_return(segs + delta, &bucket->id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
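/*
 * Illustrative sketch, not part of the original file: the multiply-shift
 * above draws delta uniformly from [0, now - old), i.e. it is the classic
 * modulo-free way of bounding a 32-bit random value to a range:
 */
static inline u32 example_bounded_rand(u32 range)
{
	/* maps random32() into [0, range) without a division */
	return (u32)(((u64)random32() * range) >> 32);
}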
void __ip_select_ident(struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	static bool hashrnd_initialized = false;
	u32 hash, id;

	if (unlikely(!hashrnd_initialized)) {
		hashrnd_initialized = true;
		get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
	}

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
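/*
 * Illustrative example, not part of the original file: with the defaults
 * above (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number = 9), the k-th
 * redirect is gated on (HZ/50) << k jiffies having elapsed since the
 * previous one: 20ms, 40ms, 80ms, ... ~5.1s at HZ=1000.  After nine
 * redirects the host is assumed deaf and nothing more is sent until it has
 * been quiet for ip_rt_redirect_silence = (HZ/50) << 10, about 20s.
 */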
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
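/*
 * Illustrative example, not part of the original file: for a packet with
 * tot_len 1500 whose ICMP "frag needed" message carried no usable next-hop
 * MTU, guess_mtu(1500) returns the next lower plateau, 1492; a retry at
 * 1492 then yields 576, so PMTU discovery converges in a handful of steps.
 */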
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	ipv4_validate_peer(rt);
	return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
   We do not cache the source address of outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif	= in_dev->dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
/*
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2480 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2481 u8 tos, struct net_device *dev, bool noref)
2483 struct rtable * rth;
2485 int iif = dev->ifindex;
2493 if (!rt_caching(net))
2496 tos &= IPTOS_RT_MASK;
2497 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2499 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2500 rth = rcu_dereference(rth->dst.rt_next)) {
2501 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2502 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2503 (rth->rt_route_iif ^ iif) |
2504 (rth->rt_key_tos ^ tos)) == 0 &&
2505 rth->rt_mark == skb->mark &&
2506 net_eq(dev_net(rth->dst.dev), net) &&
2507 !rt_is_expired(rth)) {
2508 ipv4_validate_peer(rth);
2510 dst_use_noref(&rth->dst, jiffies);
2511 skb_dst_set_noref(skb, &rth->dst);
2513 dst_use(&rth->dst, jiffies);
2514 skb_dst_set(skb, &rth->dst);
2516 RT_CACHE_STAT_INC(in_hit);
2520 RT_CACHE_STAT_INC(in_hlist_search);
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world.  Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
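
/*
 * Illustrative sketch, not part of the original file: how a receive path
 * might resolve the input route for a freshly received skb.  The helper
 * name is hypothetical; real callers normally use the ip_route_input()
 * and ip_route_input_noref() wrappers from <net/route.h>, which only
 * differ in the noref mode they pass down.
 */
static inline int example_route_incoming(struct sk_buff *skb,
					 struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);

	/* On success skb_dst(skb) is set; a negative errno is returned
	 * on failure.
	 */
	return ip_route_input_common(skb, iph->daddr, iph->saddr,
				     iph->tos, dev, false);
}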
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);
	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid	= rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark	= fl4->flowi4_mark;
	rth->rt_gateway	= fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos	= RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;
		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}
	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       tos, dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
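
/*
 * Illustrative sketch, not part of the original file: resolving an
 * output route by key only.  The helper name is hypothetical.  Callers
 * that want xfrm policy applied should go through ip_route_output_key()
 * or ip_route_output_flow() rather than call __ip_route_output_key()
 * directly.
 */
static inline struct rtable *example_route_to(struct net *net, __be32 daddr)
{
	struct flowi4 fl4 = {
		.daddr = daddr,
	};

	/* Returns a struct rtable * or an ERR_PTR() on failure. */
	return __ip_route_output_key(net, &fl4);
}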
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
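
/*
 * Context note (an assumption drawn from the surrounding tree, not
 * stated in this file): ipv4_blackhole_route() is wired up as the IPv4
 * blackhole_route hook used by xfrm_lookup(), so a caller can be handed
 * a dst that discards everything (dst_discard input/output) while the
 * flow keys of the original route are preserved, e.g. while an IPsec SA
 * is still being negotiated.
 */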
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
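
/*
 * Illustrative sketch, not part of the original file: the typical
 * connect()-style lookup.  The helper name is hypothetical.  A non-zero
 * flowi4_proto makes ip_route_output_flow() pass the result through
 * xfrm_lookup(), so IPsec policy is honoured.
 */
static inline struct rtable *example_connect_route(struct net *net,
						   __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4 = {
		.daddr		= daddr,
		.saddr		= saddr,
		.flowi4_proto	= IPPROTO_UDP,
	};

	return ip_route_output_flow(net, &fl4, NULL);
}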
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
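
/*
 * Note on rt_fill_info() above: the NLA_PUT*() macros expand to a
 * "goto nla_put_failure" when the skb runs out of tailroom, which is
 * why the function funnels every failure into the nlmsg_cancel()
 * unwind and returns -EMSGSIZE.
 */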
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}
		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}
	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
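
/*
 * Usage note, not part of the original file: writing an integer to
 * /proc/sys/net/ipv4/route/flush (mode 0200, so write-only) invokes
 * ipv4_sysctl_rtcache_flush() above, which hands the written value to
 * rt_cache_flush(), e.g.:
 *
 *	sysctl -w net.ipv4.route.flush=1
 */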
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
int __init ip_rt_init(void)
{
	int rc = 0;

	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif