/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <net/atmclip.h>
#include <net/secure_seq.h>
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
/*
 *	Interface to generic destination cache.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void __rt_garbage_collect(struct work_struct *w);
static DECLARE_WORK(rt_gc_worker, __rt_garbage_collect);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else if (rt->fi) {
			fib_info_put(rt->fi);
			rt->fi = NULL;
		}
	}
	return p;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
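
/*
 * Usage note (added): callers index this table with the four TOS bits
 * shifted right by one (see rt_tos2priority() in <net/route.h>), so each
 * traffic class appears twice - once for the plain codepoint and once
 * for its ECN/low-cost variant.
 */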
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments.
 */
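
/*
 * Illustrative sketch (added, not part of the original text): a reader
 * walks a chain under rcu_read_lock() and takes a reference without
 * touching the bucket lock, exactly as ip_route_input_common() does
 * below ("keys_match" stands in for the real key comparison):
 *
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->dst.rt_next))
 *		if (keys_match(rth) && !rt_is_expired(rth)) {
 *			dst_use(&rth->dst, jiffies);
 *			break;
 *		}
 *	rcu_read_unlock();
 */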
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif
static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
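
/*
 * Note (added): mixing the per-namespace generation id into the hash
 * means a cache flush (which bumps rt_genid) immediately makes stale
 * entries unfindable by lookup; rt_is_expired() then lets writers and
 * the garbage collector reap them lazily.
 */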
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
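
/*
 * Note (added): the resulting order means a "valuable" route (bit 31:
 * redirected, notify, or learned-PMTU state) always outscores one that
 * is merely an output/unicast route (bit 30), which in turn outscores
 * plain entries ranked by recency of use; rt_intern_hash() below evicts
 * the entry with the lowest score when a chain grows too long.
 */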
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
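
/*
 * Note (added): both comparisons above OR together the XOR of every key
 * field and test the result against zero; any mismatching bit anywhere
 * makes the OR non-zero, so all fields are compared without a
 * conditional branch per field.
 */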
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
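
/*
 * Worked example (added): with FRACT_BITS == 3 the unit is 1/8, so ONE
 * is 8 and a chain of length 2 accumulates as 16. At the end of a scan,
 * (avg + 4*sd) >> FRACT_BITS converts the fixed-point estimate back to
 * whole entries before it is stored in rt_chain_length_max.
 */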
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	inetpeer_invalidate_tree(AF_INET);
}
/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */
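
/*
 * Sketch of the feedback (added): with elasticity E and 2^rt_hash_log
 * buckets, the collector aims at removing "entries - (E << rt_hash_log)"
 * routes, i.e. it tolerates roughly E entries per bucket on average.
 * While that goal is missed, "expire" is halved so entries are culled
 * more aggressively; once the cache is comfortably below its thresholds
 * it grows again by min_interval, capped at ip_rt_gc_timeout.
 */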
static void __do_rt_garbage_collect(int elasticity, int min_interval)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - table is not full.
		   - we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return;

work_done:
	expire += min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return;
}
static void __rt_garbage_collect(struct work_struct *w)
{
	__do_rt_garbage_collect(ip_rt_gc_elasticity, ip_rt_gc_min_interval);
}

static int rt_garbage_collect(struct dst_ops *ops)
{
	if (!work_pending(&rt_gc_worker))
		schedule_work(&rt_gc_worker);

	if (dst_entries_get_fast(&ipv4_dst_ops) >= ip_rt_max_size ||
	    dst_entries_get_slow(&ipv4_dst_ops) >= ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_dst_overflow);
		return 1;
	}
	return 0;
}
/*
 * Returns the number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);

	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it.
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				__do_rt_garbage_collect(1, 0);
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
#define IP_IDENTS_SZ 2048u
struct ip_ident_bucket {
	atomic_t	id;
	u32		stamp32;
};

static struct ip_ident_bucket *ip_idents __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = ACCESS_ONCE(bucket->stamp32);
	u32 now = (u32)jiffies;
	u32 delta = 0;

	if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
		u64 x = random32();

		x *= (now - old);
		delta = (u32)(x >> 32);
	}

	return atomic_add_return(segs + delta, &bucket->id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
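
/*
 * Example (added): if a bucket was last used 100 jiffies ago, delta is
 * drawn roughly uniformly from [0, 100), so an observer comparing two
 * IDs from that bucket cannot tell how much of the difference is real
 * traffic and how much is random slack.
 */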
void __ip_select_ident(struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	static bool hashrnd_initialized = false;
	u32 hash, id;

	if (unlikely(!hashrnd_initialized)) {
		hashrnd_initialized = true;
		get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
	}

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}
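
/*
 * Note (added): the cmpxchg() above lets exactly one caller claim the
 * expired or learned PMTU state; concurrent callers see pmtu_expires
 * already zeroed and return false, so the MTU metric is restored only
 * once.
 */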
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}
/*
 * Algorithm:
 * 1. The first ip_rt_redirect_number redirects are sent
 *    with exponential backoff, then we stop sending them at all,
 *    assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 *    during ip_rt_redirect_silence, we assume that the host
 *    forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
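
/*
 * Worked example (added): with HZ=1000, ip_rt_redirect_load is 20
 * jiffies, so successive redirects are spaced at least 20, 40, 80, ...
 * jiffies apart (ip_rt_redirect_load << rate_tokens). After
 * ip_rt_redirect_number (9) unanswered redirects the sender goes quiet
 * until ip_rt_redirect_silence (20 << 10 jiffies, about 20 seconds)
 * has passed.
 */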
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
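
/*
 * Example (added): a broken router reporting MTU 0 for a 1500-byte
 * datagram makes guess_mtu(1500) walk the plateau table and return
 * 1492, the largest plateau strictly below the old MTU (the RFC 1191
 * plateau search).
 */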
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	ipv4_validate_peer(rt);
	return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so that it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
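
/*
 * Note (added): the "- 40" and "65535 - 40" above reserve room for the
 * minimal IPv4 + TCP headers (20 bytes each), so the advertised MSS
 * never exceeds what fits in a maximal IP datagram.
 */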
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
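
/*
 * Note (added): the "1, -1" arguments are the initial reference count
 * and initial obsolete value handed to dst_alloc(), so every rtable
 * starts with one reference owned by the path that inserts it into the
 * cache.
 */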
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif	= in_dev->dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
/*
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped-back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *
 *	called with rcu_read_lock()
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net	*net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2469 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2470 u8 tos, struct net_device *dev, bool noref)
2472 struct rtable * rth;
2474 int iif = dev->ifindex;
2482 if (!rt_caching(net))
2485 tos &= IPTOS_RT_MASK;
2486 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2488 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2489 rth = rcu_dereference(rth->dst.rt_next)) {
2490 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2491 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2492 (rth->rt_route_iif ^ iif) |
2493 (rth->rt_key_tos ^ tos)) == 0 &&
2494 rth->rt_mark == skb->mark &&
2495 net_eq(dev_net(rth->dst.dev), net) &&
2496 !rt_is_expired(rth)) {
2497 ipv4_validate_peer(rth);
2499 dst_use_noref(&rth->dst, jiffies);
2500 skb_dst_set_noref(skb, &rth->dst);
2502 dst_use(&rth->dst, jiffies);
2503 skb_dst_set(skb, &rth->dst);
2505 RT_CACHE_STAT_INC(in_hit);
2509 RT_CACHE_STAT_INC(in_hlist_search);
2513 /* Multicast recognition logic is moved from the route cache to here.
2514 The problem was that too many Ethernet cards have broken/missing
2515 hardware multicast filters :-( As a result, a host on a multicast
2516 network acquires a lot of useless route cache entries, e.g. from
2517 SDR messages from all over the world. Now we try to get rid of them.
2518 Really, provided the software IP multicast filter is organized
2519 reasonably (at least, hashed), it does not result in a slowdown
2520 compared with route cache reject entries.
2521 Note that multicast routers are not affected, because a
2522 route cache entry is created eventually.
2523 */
2524 if (ipv4_is_multicast(daddr)) {
2525 struct in_device *in_dev = __in_dev_get_rcu(dev);
2527 if (in_dev) {
2528 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2529 ip_hdr(skb)->protocol);
2530 if (our
2531 #ifdef CONFIG_IP_MROUTE
2532 ||
2533 (!ipv4_is_local_multicast(daddr) &&
2534 IN_DEV_MFORWARD(in_dev))
2535 #endif
2536 ) {
2537 int res = ip_route_input_mc(skb, daddr, saddr,
2538 tos, dev, our);
2539 rcu_read_unlock();
2540 return res;
2541 }
2542 }
2543 rcu_read_unlock();
2544 return -EINVAL;
2545 }
2546 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2547 rcu_read_unlock();
2548 return res;
2549 }
2550 EXPORT_SYMBOL(ip_route_input_common);
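/*
 * Illustrative caller sketch (not part of this file): the receive path
 * resolves its input route roughly as ip_rcv_finish() does, assuming a
 * freshly received skb whose IP header has already been validated:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 *	return dst_input(skb);	(dispatches to rth->dst.input)
 */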
2552 /* called with rcu_read_lock() */
2553 static struct rtable *__mkroute_output(const struct fib_result *res,
2554 const struct flowi4 *fl4,
2555 __be32 orig_daddr, __be32 orig_saddr,
2556 int orig_oif, __u8 orig_rtos,
2557 struct net_device *dev_out,
2558 unsigned int flags)
2559 {
2560 struct fib_info *fi = res->fi;
2561 struct in_device *in_dev;
2562 u16 type = res->type;
2563 struct rtable *rth;
2565 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2566 return ERR_PTR(-EINVAL);
2568 if (ipv4_is_lbcast(fl4->daddr))
2569 type = RTN_BROADCAST;
2570 else if (ipv4_is_multicast(fl4->daddr))
2571 type = RTN_MULTICAST;
2572 else if (ipv4_is_zeronet(fl4->daddr))
2573 return ERR_PTR(-EINVAL);
2575 if (dev_out->flags & IFF_LOOPBACK)
2576 flags |= RTCF_LOCAL;
2578 in_dev = __in_dev_get_rcu(dev_out);
2579 if (!in_dev)
2580 return ERR_PTR(-EINVAL);
2582 if (type == RTN_BROADCAST) {
2583 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2584 fi = NULL;
2585 } else if (type == RTN_MULTICAST) {
2586 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2587 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2588 fl4->flowi4_proto))
2589 flags &= ~RTCF_LOCAL;
2590 /* If a multicast route does not exist, use
2591 * the default one, but do not gateway in this case.
2592 * Yes, it is a hack.
2593 */
2594 if (fi && res->prefixlen < 4)
2595 fi = NULL;
2596 }
2598 rth = rt_dst_alloc(dev_out,
2599 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2600 IN_DEV_CONF_GET(in_dev, NOXFRM));
2601 if (!rth)
2602 return ERR_PTR(-ENOBUFS);
2604 rth->dst.output = ip_output;
2606 rth->rt_key_dst = orig_daddr;
2607 rth->rt_key_src = orig_saddr;
2608 rth->rt_genid = rt_genid(dev_net(dev_out));
2609 rth->rt_flags = flags;
2610 rth->rt_type = type;
2611 rth->rt_key_tos = orig_rtos;
2612 rth->rt_dst = fl4->daddr;
2613 rth->rt_src = fl4->saddr;
2614 rth->rt_route_iif = 0;
2615 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2616 rth->rt_oif = orig_oif;
2617 rth->rt_mark = fl4->flowi4_mark;
2618 rth->rt_gateway = fl4->daddr;
2619 rth->rt_spec_dst = fl4->saddr;
2620 rth->rt_peer_genid = 0;
2624 RT_CACHE_STAT_INC(out_slow_tot);
2626 if (flags & RTCF_LOCAL) {
2627 rth->dst.input = ip_local_deliver;
2628 rth->rt_spec_dst = fl4->daddr;
2629 }
2630 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2631 rth->rt_spec_dst = fl4->saddr;
2632 if (flags & RTCF_LOCAL &&
2633 !(dev_out->flags & IFF_LOOPBACK)) {
2634 rth->dst.output = ip_mc_output;
2635 RT_CACHE_STAT_INC(out_slow_mc);
2636 }
2637 #ifdef CONFIG_IP_MROUTE
2638 if (type == RTN_MULTICAST) {
2639 if (IN_DEV_MFORWARD(in_dev) &&
2640 !ipv4_is_local_multicast(fl4->daddr)) {
2641 rth->dst.input = ip_mr_input;
2642 rth->dst.output = ip_mc_output;
2643 }
2644 }
2645 #endif
2646 }
2648 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2650 return rth;
2651 }
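/*
 * Note that __mkroute_output() only builds and fills the dst; caching
 * is the caller's job.  rt_intern_hash() may hand back a different
 * entry than the one passed in (on a race it keeps the copy another
 * CPU inserted first and frees ours), so callers must continue with
 * its return value rather than with the original rth.
 */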
2653 /*
2654 * Major route resolver routine.
2655 * Takes rcu_read_lock() itself and releases it before returning.
2656 */
2658 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2659 {
2660 struct net_device *dev_out = NULL;
2661 __u8 tos = RT_FL_TOS(fl4);
2662 unsigned int flags = 0;
2663 struct fib_result res;
2664 struct rtable *rth;
2665 __be32 orig_daddr;
2666 __be32 orig_saddr;
2667 int orig_oif;
2669 res.fi = NULL;
2670 #ifdef CONFIG_IP_MULTIPLE_TABLES
2671 res.r = NULL;
2672 #endif
2674 orig_daddr = fl4->daddr;
2675 orig_saddr = fl4->saddr;
2676 orig_oif = fl4->flowi4_oif;
2678 fl4->flowi4_iif = net->loopback_dev->ifindex;
2679 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2680 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2681 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2683 rcu_read_lock();
2684 if (fl4->saddr) {
2685 rth = ERR_PTR(-EINVAL);
2686 if (ipv4_is_multicast(fl4->saddr) ||
2687 ipv4_is_lbcast(fl4->saddr) ||
2688 ipv4_is_zeronet(fl4->saddr))
2689 goto out;
2691 /* I removed the check for oif == dev_out->oif here.
2692 It was wrong for two reasons:
2693 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2694 is assigned to multiple interfaces.
2695 2. Moreover, we are allowed to send packets with saddr
2696 of another iface. --ANK
2697 */
2699 if (fl4->flowi4_oif == 0 &&
2700 (ipv4_is_multicast(fl4->daddr) ||
2701 ipv4_is_lbcast(fl4->daddr))) {
2702 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2703 dev_out = __ip_dev_find(net, fl4->saddr, false);
2704 if (dev_out == NULL)
2705 goto out;
2707 /* Special hack: the user can direct multicasts
2708 and limited broadcast via the necessary interface
2709 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2710 This hack is not just for fun; it allows
2711 vic, vat and friends to work.
2712 They bind a socket to loopback, set ttl to zero
2713 and expect that it will work.
2714 From the viewpoint of the routing cache they are broken,
2715 because we are not allowed to build a multicast path
2716 with a loopback source addr (the routing cache
2717 cannot know that ttl is zero, so the packet
2718 will not leave this host and the route is valid).
2719 Luckily, this hack is a good workaround.
2720 */
2722 fl4->flowi4_oif = dev_out->ifindex;
2723 goto make_route;
2724 }
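/*
 * Userspace view of the hack above, illustrative only (assumes
 * 192.0.2.1 is a local address; error handling omitted): the egress
 * device for a multicast send is picked purely by the bind() address,
 * with no IP_MULTICAST_IF:
 *
 *	struct sockaddr_in src = { .sin_family = AF_INET };
 *	struct sockaddr_in grp = { .sin_family = AF_INET,
 *				   .sin_port = htons(5004) };
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	inet_pton(AF_INET, "192.0.2.1", &src.sin_addr);
 *	inet_pton(AF_INET, "239.1.1.1", &grp.sin_addr);
 *	bind(fd, (struct sockaddr *)&src, sizeof(src));
 *	sendto(fd, buf, len, 0, (struct sockaddr *)&grp, sizeof(grp));
 */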
2726 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2727 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2728 if (!__ip_dev_find(net, fl4->saddr, false))
2729 goto out;
2730 }
2731 }
2734 if (fl4->flowi4_oif) {
2735 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2736 rth = ERR_PTR(-ENODEV);
2737 if (dev_out == NULL)
2738 goto out;
2740 /* RACE: Check return value of inet_select_addr instead. */
2741 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2742 rth = ERR_PTR(-ENETUNREACH);
2743 goto out;
2744 }
2745 if (ipv4_is_local_multicast(fl4->daddr) ||
2746 ipv4_is_lbcast(fl4->daddr)) {
2747 if (!fl4->saddr)
2748 fl4->saddr = inet_select_addr(dev_out, 0,
2749 RT_SCOPE_LINK);
2750 goto make_route;
2751 }
2752 if (!fl4->saddr) {
2753 if (ipv4_is_multicast(fl4->daddr))
2754 fl4->saddr = inet_select_addr(dev_out, 0,
2755 fl4->flowi4_scope);
2756 else if (!fl4->daddr)
2757 fl4->saddr = inet_select_addr(dev_out, 0,
2758 RT_SCOPE_HOST);
2759 }
2760 }
2762 if (!fl4->daddr) {
2763 fl4->daddr = fl4->saddr;
2764 if (!fl4->daddr)
2765 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2766 dev_out = net->loopback_dev;
2767 fl4->flowi4_oif = net->loopback_dev->ifindex;
2768 res.type = RTN_LOCAL;
2769 flags |= RTCF_LOCAL;
2770 goto make_route;
2771 }
2773 if (fib_lookup(net, fl4, &res)) {
2774 res.fi = NULL;
2775 if (fl4->flowi4_oif) {
2776 /* Apparently, the routing tables are wrong. Assume
2777 that the destination is on-link.
2779 WHY? DW.
2780 Because we are allowed to send to an iface
2781 even if it has NO routes and NO assigned
2782 addresses. When oif is specified, routing
2783 tables are looked up with only one purpose:
2784 to catch if the destination is gatewayed, rather than
2785 direct. Moreover, if MSG_DONTROUTE is set,
2786 we send a packet, ignoring both routing tables
2787 and ifaddr state. --ANK
2790 We could make it even if oif is unknown,
2791 likely IPv6, but we do not.
2792 */
2794 if (fl4->saddr == 0)
2795 fl4->saddr = inet_select_addr(dev_out, 0,
2796 RT_SCOPE_LINK);
2797 res.type = RTN_UNICAST;
2798 goto make_route;
2799 }
2800 rth = ERR_PTR(-ENETUNREACH);
2801 goto out;
2802 }
2804 if (res.type == RTN_LOCAL) {
2805 if (!fl4->saddr) {
2806 if (res.fi->fib_prefsrc)
2807 fl4->saddr = res.fi->fib_prefsrc;
2808 else
2809 fl4->saddr = fl4->daddr;
2810 }
2811 dev_out = net->loopback_dev;
2812 fl4->flowi4_oif = dev_out->ifindex;
2813 res.fi = NULL;
2814 flags |= RTCF_LOCAL;
2815 goto make_route;
2816 }
2818 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2819 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2820 fib_select_multipath(&res);
2821 else
2822 #endif
2823 if (!res.prefixlen &&
2824 res.table->tb_num_default > 1 &&
2825 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2826 fib_select_default(&res);
2828 if (!fl4->saddr)
2829 fl4->saddr = FIB_RES_PREFSRC(net, res);
2831 dev_out = FIB_RES_DEV(res);
2832 fl4->flowi4_oif = dev_out->ifindex;
2835 make_route:
2836 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2837 tos, dev_out, flags);
2838 if (!IS_ERR(rth)) {
2839 unsigned int hash;
2841 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2842 rt_genid(dev_net(dev_out)));
2843 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2844 }
2846 out:
2847 rcu_read_unlock();
2848 return rth;
2849 }
2851 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2852 {
2853 struct rtable *rth;
2854 unsigned int hash;
2856 if (!rt_caching(net))
2857 goto slow_output;
2859 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2861 rcu_read_lock_bh();
2862 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2863 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2864 if (rth->rt_key_dst == flp4->daddr &&
2865 rth->rt_key_src == flp4->saddr &&
2866 rt_is_output_route(rth) &&
2867 rth->rt_oif == flp4->flowi4_oif &&
2868 rth->rt_mark == flp4->flowi4_mark &&
2869 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2870 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2871 net_eq(dev_net(rth->dst.dev), net) &&
2872 !rt_is_expired(rth)) {
2873 ipv4_validate_peer(rth);
2874 dst_use(&rth->dst, jiffies);
2875 RT_CACHE_STAT_INC(out_hit);
2876 rcu_read_unlock_bh();
2877 if (!flp4->saddr)
2878 flp4->saddr = rth->rt_src;
2879 if (!flp4->daddr)
2880 flp4->daddr = rth->rt_dst;
2881 return rth;
2882 }
2883 RT_CACHE_STAT_INC(out_hlist_search);
2884 }
2885 rcu_read_unlock_bh();
2887 slow_output:
2888 return ip_route_output_slow(net, flp4);
2889 }
2890 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2892 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2893 {
2894 return NULL;
2895 }
2897 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2898 {
2899 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2901 return mtu ? : dst->dev->mtu;
2902 }
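/*
 * "mtu ? : dst->dev->mtu" above is the GCC "a ?: b" extension; it is
 * equivalent to the plain C
 *
 *	return mtu ? mtu : dst->dev->mtu;
 *
 * i.e. fall back to the device MTU when no RTAX_MTU metric is set.
 */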
2904 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2905 {
2906 }
2908 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2909 unsigned long old)
2910 {
2911 return NULL;
2912 }
2914 static struct dst_ops ipv4_dst_blackhole_ops = {
2915 .family = AF_INET,
2916 .protocol = cpu_to_be16(ETH_P_IP),
2917 .destroy = ipv4_dst_destroy,
2918 .check = ipv4_blackhole_dst_check,
2919 .mtu = ipv4_blackhole_mtu,
2920 .default_advmss = ipv4_default_advmss,
2921 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2922 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2923 .neigh_lookup = ipv4_neigh_lookup,
2924 };
2926 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2927 {
2928 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2929 struct rtable *ort = (struct rtable *) dst_orig;
2931 if (rt) {
2932 struct dst_entry *new = &rt->dst;
2934 new->__use = 1;
2935 new->input = dst_discard;
2936 new->output = dst_discard;
2937 dst_copy_metrics(new, &ort->dst);
2939 new->dev = ort->dst.dev;
2940 if (new->dev)
2941 dev_hold(new->dev);
2943 rt->rt_key_dst = ort->rt_key_dst;
2944 rt->rt_key_src = ort->rt_key_src;
2945 rt->rt_key_tos = ort->rt_key_tos;
2946 rt->rt_route_iif = ort->rt_route_iif;
2947 rt->rt_iif = ort->rt_iif;
2948 rt->rt_oif = ort->rt_oif;
2949 rt->rt_mark = ort->rt_mark;
2951 rt->rt_genid = rt_genid(net);
2952 rt->rt_flags = ort->rt_flags;
2953 rt->rt_type = ort->rt_type;
2954 rt->rt_dst = ort->rt_dst;
2955 rt->rt_src = ort->rt_src;
2956 rt->rt_gateway = ort->rt_gateway;
2957 rt->rt_spec_dst = ort->rt_spec_dst;
2958 rt->peer = ort->peer;
2959 if (rt->peer)
2960 atomic_inc(&rt->peer->refcnt);
2961 rt->fi = ort->fi;
2962 if (rt->fi)
2963 atomic_inc(&rt->fi->fib_clntref);
2965 dst_free(new);
2966 }
2968 dst_release(dst_orig);
2970 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2971 }
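/*
 * Rough usage note (an assumption about the callers, not stated in
 * this file): the blackhole route serves the xfrm slow path.  When
 * xfrm_lookup() cannot resolve policy yet, e.g. key negotiation is
 * still in flight for a non-blocking socket, it swaps the real route
 * for this copy, whose input/output handlers simply discard packets
 * until a usable route can be looked up again.
 */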
2973 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2974 struct sock *sk)
2975 {
2976 struct rtable *rt = __ip_route_output_key(net, flp4);
2978 if (IS_ERR(rt))
2979 return rt;
2981 if (flp4->flowi4_proto)
2982 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2983 flowi4_to_flowi(flp4),
2984 sk, 0);
2986 return rt;
2987 }
2988 EXPORT_SYMBOL_GPL(ip_route_output_flow);
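/*
 * Illustrative caller sketch (not part of this file): a UDP-style
 * sender builds a flow key and asks for an output route, roughly:
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
 *			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 *			   IPPROTO_UDP, 0, daddr, saddr, dport, sport);
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */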
2990 static int rt_fill_info(struct net *net,
2991 struct sk_buff *skb, u32 pid, u32 seq, int event,
2992 int nowait, unsigned int flags)
2993 {
2994 struct rtable *rt = skb_rtable(skb);
2995 struct rtmsg *r;
2996 struct nlmsghdr *nlh;
2997 unsigned long expires = 0;
2998 const struct inet_peer *peer = rt->peer;
2999 u32 id = 0, ts = 0, tsage = 0, error;
3001 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
3002 if (nlh == NULL)
3003 return -EMSGSIZE;
3005 r = nlmsg_data(nlh);
3006 r->rtm_family = AF_INET;
3007 r->rtm_dst_len = 32;
3008 r->rtm_src_len = 0;
3009 r->rtm_tos = rt->rt_key_tos;
3010 r->rtm_table = RT_TABLE_MAIN;
3011 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
3012 r->rtm_type = rt->rt_type;
3013 r->rtm_scope = RT_SCOPE_UNIVERSE;
3014 r->rtm_protocol = RTPROT_UNSPEC;
3015 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
3016 if (rt->rt_flags & RTCF_NOTIFY)
3017 r->rtm_flags |= RTM_F_NOTIFY;
3019 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
3021 if (rt->rt_key_src) {
3022 r->rtm_src_len = 32;
3023 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
3024 }
3025 if (rt->dst.dev)
3026 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
3027 #ifdef CONFIG_IP_ROUTE_CLASSID
3028 if (rt->dst.tclassid)
3029 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3030 #endif
3031 if (rt_is_input_route(rt))
3032 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3033 else if (rt->rt_src != rt->rt_key_src)
3034 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3036 if (rt->rt_dst != rt->rt_gateway)
3037 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3039 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3040 goto nla_put_failure;
3042 if (rt->rt_mark)
3043 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3045 error = rt->dst.error;
3046 if (peer) {
3047 inet_peer_refcheck(rt->peer);
3048 if (peer->tcp_ts_stamp) {
3049 ts = peer->tcp_ts;
3050 tsage = get_seconds() - peer->tcp_ts_stamp;
3051 }
3052 expires = ACCESS_ONCE(peer->pmtu_expires);
3053 if (expires) {
3054 if (time_before(jiffies, expires))
3055 expires -= jiffies;
3056 else
3057 expires = 0;
3058 }
3059 }
3061 if (rt_is_input_route(rt)) {
3062 #ifdef CONFIG_IP_MROUTE
3063 __be32 dst = rt->rt_dst;
3065 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3066 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3067 int err = ipmr_get_route(net, skb,
3068 rt->rt_src, rt->rt_dst,
3069 r, nowait);
3070 if (err <= 0) {
3071 if (!nowait) {
3072 if (err == 0)
3073 return 0;
3074 goto nla_put_failure;
3075 } else {
3076 if (err == -EMSGSIZE)
3077 goto nla_put_failure;
3078 error = err;
3079 }
3080 }
3081 } else
3082 #endif
3083 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3084 }
3086 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3087 expires, error) < 0)
3088 goto nla_put_failure;
3090 return nlmsg_end(skb, nlh);
3092 nla_put_failure:
3093 nlmsg_cancel(skb, nlh);
3094 return -EMSGSIZE;
3095 }
3097 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3098 {
3099 struct net *net = sock_net(in_skb->sk);
3100 struct rtmsg *rtm;
3101 struct nlattr *tb[RTA_MAX+1];
3102 struct rtable *rt = NULL;
3103 __be32 dst = 0;
3104 __be32 src = 0;
3105 u32 iif;
3106 int err;
3107 int mark;
3108 struct sk_buff *skb;
3110 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3111 if (err < 0)
3112 goto errout;
3114 rtm = nlmsg_data(nlh);
3116 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3117 if (skb == NULL) {
3118 err = -ENOBUFS;
3119 goto errout;
3120 }
3122 /* Reserve room for dummy headers; this skb can pass
3123 through a good chunk of the routing engine.
3124 */
3125 skb_reset_mac_header(skb);
3126 skb_reset_network_header(skb);
3128 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3129 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3130 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3132 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3133 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3134 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3135 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3137 if (iif) {
3138 struct net_device *dev;
3140 dev = __dev_get_by_index(net, iif);
3141 if (dev == NULL) {
3142 err = -ENODEV;
3143 goto errout_free;
3144 }
3146 skb->protocol = htons(ETH_P_IP);
3147 skb->dev = dev;
3148 skb->mark = mark;
3149 local_bh_disable();
3150 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3151 local_bh_enable();
3153 rt = skb_rtable(skb);
3154 if (err == 0 && rt->dst.error)
3155 err = -rt->dst.error;
3156 } else {
3157 struct flowi4 fl4 = {
3158 .daddr = dst,
3159 .saddr = src,
3160 .flowi4_tos = rtm->rtm_tos,
3161 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3162 .flowi4_mark = mark,
3163 };
3164 rt = ip_route_output_key(net, &fl4);
3166 err = 0;
3167 if (IS_ERR(rt))
3168 err = PTR_ERR(rt);
3169 }
3171 if (err)
3172 goto errout_free;
3174 skb_dst_set(skb, &rt->dst);
3175 if (rtm->rtm_flags & RTM_F_NOTIFY)
3176 rt->rt_flags |= RTCF_NOTIFY;
3178 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3179 RTM_NEWROUTE, 0, 0);
3180 if (err <= 0)
3181 goto errout_free;
3183 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3184 errout:
3185 return err;
3187 errout_free:
3188 kfree_skb(skb);
3189 goto errout;
3190 }
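/*
 * This handler is what "ip route get <addr>" talks to: the utility
 * sends an RTM_GETROUTE request, the kernel resolves it through
 * ip_route_input()/ip_route_output_key() above and answers with a
 * single RTM_NEWROUTE message built by rt_fill_info().
 */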
3192 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3193 {
3194 struct rtable *rt;
3195 int h, s_h;
3196 int idx, s_idx;
3197 struct net *net;
3199 net = sock_net(skb->sk);
3201 s_h = cb->args[0];
3202 if (s_h < 0)
3203 s_h = 0;
3204 s_idx = idx = cb->args[1];
3205 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3206 if (!rt_hash_table[h].chain)
3207 continue;
3208 rcu_read_lock_bh();
3209 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3210 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3211 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3212 continue;
3213 if (rt_is_expired(rt))
3214 continue;
3215 skb_dst_set_noref(skb, &rt->dst);
3216 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3217 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3218 1, NLM_F_MULTI) <= 0) {
3219 skb_dst_drop(skb);
3220 rcu_read_unlock_bh();
3221 goto done;
3222 }
3223 skb_dst_drop(skb);
3224 }
3225 rcu_read_unlock_bh();
3226 }
3228 done:
3229 cb->args[0] = h;
3230 cb->args[1] = idx;
3231 return skb->len;
3232 }
3234 void ip_rt_multicast_event(struct in_device *in_dev)
3235 {
3236 rt_cache_flush(dev_net(in_dev->dev), 0);
3237 }
3239 #ifdef CONFIG_SYSCTL
3240 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3241 void __user *buffer,
3242 size_t *lenp, loff_t *ppos)
3243 {
3244 if (write) {
3245 int flush_delay;
3246 ctl_table ctl;
3247 struct net *net;
3249 memcpy(&ctl, __ctl, sizeof(ctl));
3250 ctl.data = &flush_delay;
3251 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3253 net = (struct net *)__ctl->extra1;
3254 rt_cache_flush(net, flush_delay);
3255 return 0;
3256 }
3258 return -EINVAL;
3259 }
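/*
 * Triggered from userspace, e.g. (illustrative):
 *
 *	echo -1 > /proc/sys/net/ipv4/route/flush
 *
 * The written value is passed to rt_cache_flush() as the flush delay;
 * see that helper earlier in this file for how negative and
 * non-negative delays differ.
 */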
3261 static ctl_table ipv4_route_table[] = {
3262 {
3263 .procname = "gc_thresh",
3264 .data = &ipv4_dst_ops.gc_thresh,
3265 .maxlen = sizeof(int),
3266 .mode = 0644,
3267 .proc_handler = proc_dointvec,
3268 },
3269 {
3270 .procname = "max_size",
3271 .data = &ip_rt_max_size,
3272 .maxlen = sizeof(int),
3273 .mode = 0644,
3274 .proc_handler = proc_dointvec,
3275 },
3276 {
3277 /* Deprecated. Use gc_min_interval_ms */
3279 .procname = "gc_min_interval",
3280 .data = &ip_rt_gc_min_interval,
3281 .maxlen = sizeof(int),
3282 .mode = 0644,
3283 .proc_handler = proc_dointvec_jiffies,
3284 },
3285 {
3286 .procname = "gc_min_interval_ms",
3287 .data = &ip_rt_gc_min_interval,
3288 .maxlen = sizeof(int),
3289 .mode = 0644,
3290 .proc_handler = proc_dointvec_ms_jiffies,
3291 },
3292 {
3293 .procname = "gc_timeout",
3294 .data = &ip_rt_gc_timeout,
3295 .maxlen = sizeof(int),
3296 .mode = 0644,
3297 .proc_handler = proc_dointvec_jiffies,
3298 },
3299 {
3300 .procname = "gc_interval",
3301 .data = &ip_rt_gc_interval,
3302 .maxlen = sizeof(int),
3303 .mode = 0644,
3304 .proc_handler = proc_dointvec_jiffies,
3305 },
3306 {
3307 .procname = "redirect_load",
3308 .data = &ip_rt_redirect_load,
3309 .maxlen = sizeof(int),
3310 .mode = 0644,
3311 .proc_handler = proc_dointvec,
3312 },
3313 {
3314 .procname = "redirect_number",
3315 .data = &ip_rt_redirect_number,
3316 .maxlen = sizeof(int),
3317 .mode = 0644,
3318 .proc_handler = proc_dointvec,
3319 },
3320 {
3321 .procname = "redirect_silence",
3322 .data = &ip_rt_redirect_silence,
3323 .maxlen = sizeof(int),
3324 .mode = 0644,
3325 .proc_handler = proc_dointvec,
3326 },
3327 {
3328 .procname = "error_cost",
3329 .data = &ip_rt_error_cost,
3330 .maxlen = sizeof(int),
3331 .mode = 0644,
3332 .proc_handler = proc_dointvec,
3333 },
3334 {
3335 .procname = "error_burst",
3336 .data = &ip_rt_error_burst,
3337 .maxlen = sizeof(int),
3338 .mode = 0644,
3339 .proc_handler = proc_dointvec,
3340 },
3341 {
3342 .procname = "gc_elasticity",
3343 .data = &ip_rt_gc_elasticity,
3344 .maxlen = sizeof(int),
3345 .mode = 0644,
3346 .proc_handler = proc_dointvec,
3347 },
3348 {
3349 .procname = "mtu_expires",
3350 .data = &ip_rt_mtu_expires,
3351 .maxlen = sizeof(int),
3352 .mode = 0644,
3353 .proc_handler = proc_dointvec_jiffies,
3354 },
3355 {
3356 .procname = "min_pmtu",
3357 .data = &ip_rt_min_pmtu,
3358 .maxlen = sizeof(int),
3359 .mode = 0644,
3360 .proc_handler = proc_dointvec,
3361 },
3362 {
3363 .procname = "min_adv_mss",
3364 .data = &ip_rt_min_advmss,
3365 .maxlen = sizeof(int),
3366 .mode = 0644,
3367 .proc_handler = proc_dointvec,
3368 },
3369 { }
3370 };
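/*
 * These knobs appear under /proc/sys/net/ipv4/route/.  Illustrative
 * shell usage:
 *
 *	sysctl net.ipv4.route.gc_timeout
 *	sysctl -w net.ipv4.route.gc_timeout=300
 */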
3372 static struct ctl_table empty[1];
3374 static struct ctl_table ipv4_skeleton[] =
3375 {
3376 { .procname = "route",
3377 .mode = 0555, .child = ipv4_route_table},
3378 { .procname = "neigh",
3379 .mode = 0555, .child = empty},
3380 { }
3381 };
3383 static __net_initdata struct ctl_path ipv4_path[] = {
3384 { .procname = "net", },
3385 { .procname = "ipv4", },
3386 { },
3387 };
3389 static struct ctl_table ipv4_route_flush_table[] = {
3390 {
3391 .procname = "flush",
3392 .maxlen = sizeof(int),
3393 .mode = 0200,
3394 .proc_handler = ipv4_sysctl_rtcache_flush,
3395 },
3396 { },
3397 };
3399 static __net_initdata struct ctl_path ipv4_route_path[] = {
3400 { .procname = "net", },
3401 { .procname = "ipv4", },
3402 { .procname = "route", },
3406 static __net_init int sysctl_route_net_init(struct net *net)
3407 {
3408 struct ctl_table *tbl;
3410 tbl = ipv4_route_flush_table;
3411 if (!net_eq(net, &init_net)) {
3412 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3413 if (tbl == NULL)
3414 goto err_dup;
3415 }
3416 tbl[0].extra1 = net;
3418 net->ipv4.route_hdr =
3419 register_net_sysctl_table(net, ipv4_route_path, tbl);
3420 if (net->ipv4.route_hdr == NULL)
3421 goto err_reg;
3422 return 0;
3424 err_reg:
3425 if (tbl != ipv4_route_flush_table)
3426 kfree(tbl);
3427 err_dup:
3428 return -ENOMEM;
3429 }
3431 static __net_exit void sysctl_route_net_exit(struct net *net)
3432 {
3433 struct ctl_table *tbl;
3435 tbl = net->ipv4.route_hdr->ctl_table_arg;
3436 unregister_net_sysctl_table(net->ipv4.route_hdr);
3437 BUG_ON(tbl == ipv4_route_flush_table);
3438 kfree(tbl);
3439 }
3441 static __net_initdata struct pernet_operations sysctl_route_ops = {
3442 .init = sysctl_route_net_init,
3443 .exit = sysctl_route_net_exit,
3444 };
3445 #endif
3447 static __net_init int rt_genid_init(struct net *net)
3448 {
3449 get_random_bytes(&net->ipv4.rt_genid,
3450 sizeof(net->ipv4.rt_genid));
3451 get_random_bytes(&net->ipv4.dev_addr_genid,
3452 sizeof(net->ipv4.dev_addr_genid));
3453 return 0;
3454 }
3456 static __net_initdata struct pernet_operations rt_genid_ops = {
3457 .init = rt_genid_init,
3458 };
3461 #ifdef CONFIG_IP_ROUTE_CLASSID
3462 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3463 #endif /* CONFIG_IP_ROUTE_CLASSID */
3465 static __initdata unsigned long rhash_entries;
3466 static int __init set_rhash_entries(char *str)
3467 {
3468 if (!str)
3469 return 0;
3470 rhash_entries = simple_strtoul(str, &str, 0);
3471 return 1;
3472 }
3473 __setup("rhash_entries=", set_rhash_entries);
3475 int __init ip_rt_init(void)
3476 {
3477 int rc = 0;
3479 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3480 if (!ip_idents)
3481 panic("IP: failed to allocate ip_idents\n");
3483 get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3485 #ifdef CONFIG_IP_ROUTE_CLASSID
3486 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3487 if (!ip_rt_acct)
3488 panic("IP: failed to allocate ip_rt_acct\n");
3489 #endif
3491 ipv4_dst_ops.kmem_cachep =
3492 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3493 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3495 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3497 if (dst_entries_init(&ipv4_dst_ops) < 0)
3498 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3500 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3501 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3503 rt_hash_table = (struct rt_hash_bucket *)
3504 alloc_large_system_hash("IP route cache",
3505 sizeof(struct rt_hash_bucket),
3506 rhash_entries,
3507 (totalram_pages >= 128 * 1024) ?
3508 15 : 17,
3509 0,
3510 &rt_hash_log,
3511 &rt_hash_mask,
3512 rhash_entries ? 0 : 512 * 1024);
3513 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3514 rt_hash_lock_init();
3516 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3517 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3519 devinet_init();
3520 ip_fib_init();
3522 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3523 expires_ljiffies = jiffies;
3524 schedule_delayed_work(&expires_work,
3525 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3527 if (ip_rt_proc_init())
3528 printk(KERN_ERR "Unable to create route proc files\n");
3531 xfrm4_init(ip_rt_max_size);
3533 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3535 #ifdef CONFIG_SYSCTL
3536 register_pernet_subsys(&sysctl_route_ops);
3537 #endif
3538 register_pernet_subsys(&rt_genid_ops);
3539 return rc;
3540 }
3542 #ifdef CONFIG_SYSCTL
3543 /*
3544 * We really need to sanitize the damn ipv4 init order, then all
3545 * this nonsense will go away.
3546 */
3547 void __init ip_static_sysctl_init(void)
3548 {
3549 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3550 }
3551 #endif