/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/atmclip.h>
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
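/*
 * Illustration (added in editing, not part of the original file): assuming
 * the usual definitions IPTOS_RT_MASK == 0x1c and RTO_ONLINK == 0x01,
 * RT_FL_TOS() keeps only the routable TOS bits plus the ONLINK flag, so a
 * flowi4_tos of 0x3f would be reduced to 0x1d.
 */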
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;
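/*
 * Illustration (added in editing, not part of the original file): with
 * HZ == 1000 the defaults above give a GC interval floor of 500 jiffies
 * (0.5 s), a redirect load unit of 20 jiffies, a redirect silence window
 * of 20 << 10 == 20480 jiffies (~20.5 s), and a minimum PMTU of
 * 512 + 20 + 20 == 552 bytes (payload plus IP and TCP headers).
 */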
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
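/*
 * Illustration (added in editing, not part of the original file): the
 * array is indexed by the four TOS bits shifted right by one, so a
 * hypothetical lookup helper would be along the lines of:
 *
 *	static inline char example_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 */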
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    BH disabled.
 */
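/*
 * Illustration (added in editing, not part of the original file): the
 * reader side implied by rule 3 is the pattern used by the cache lookup
 * further below, roughly:
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next))
 *		if (keys_match(rth))		// hypothetical predicate
 *			dst_use(&rth->dst, jiffies);
 *	rcu_read_unlock_bh();
 */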
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_ATOMIC);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	  = THIS_MODULE,
	.open	  = rt_acct_proc_open,
	.read	  = seq_read,
	.llseek	  = seq_lseek,
	.release  = single_release,
};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}
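/*
 * Editing note (not in the original): ORing the XORs of the fields is zero
 * if and only if every field matches, so both comparison helpers compile
 * to straight-line code with a single test at the end instead of one
 * conditional branch per field.
 */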
static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
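/*
 * Illustration (added in editing, not part of the original file): with
 * FRACT_BITS == 3, ONE == 8, so the fixed-point value 12 represents
 * 12 / 8 == 1.5, and converting back to an integer is a plain
 * ">> FRACT_BITS".
 */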
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without repeating a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle, expire is large enough to keep enough warm entries,
   and when load increases it reduces, to limit the cache size.
 */
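/*
 * Illustration (added in editing, not part of the original file): the first
 * goal computed below is entries - (ip_rt_gc_elasticity << rt_hash_log);
 * with the default elasticity of 8 and, say, a 2^17-bucket hash table, GC
 * only gets a positive goal once the cache holds more than about 8 entries
 * per bucket (roughly one million entries).
 */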
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire is reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
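/*
 * Illustration (added in editing, not part of the original file):
 * has_noalias() contributes ONE (8) for each entry with no earlier alias
 * in the chain, so a chain of 10 entries sharing only 2 distinct hash
 * inputs sums to 16 and slow_chain_length() returns 16 >> 3 == 2.
 */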
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey).
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is an output
	   route or a unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache,
			   as it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!n || !(n->nud_state & NUD_VALID)) {
		if (n)
			neigh_event_send(n, NULL);
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
	return 0;
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   has forgotten the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];

	return 68;
}
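/*
 * Illustration (added in editing, not part of the original file): a bogus
 * Frag-Needed report carrying old_mtu == 1500 and no next-hop MTU steps
 * down to the next plateau below 1500, so guess_mtu() returns 1492;
 * anything at or below the smallest plateau falls through to the minimum
 * of 68.
 */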
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(dst, peer);

			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway) {
				if (check_peer_redir(dst, peer))
					return NULL;
			}
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
	return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}


static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * Per the RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
					 "Please, report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif	= in_dev->dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have a correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	called with rcu_read_lock()
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable	*rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network can acquire a lot of useless route cache entries, e.g.
	   from SDR messages from all over the world. Now we try to get rid
	   of them. Really, provided the software IP multicast filter is
	   organized reasonably (at least, hashed), it does not result in a
	   slowdown compared with route cache reject entries.
	   Note that multicast routers are not affected, because a route
	   cache entry is eventually created.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	u32 tos = RT_FL_TOS(fl4);
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
/*
 * Major route resolver routine.
 * rcu_read_lock() is taken and released internally.
 */
static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	u32 tos	= RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;
		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface,
		      if saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */
		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source address
			   (the routing cache cannot know that ttl is zero,
			   so the packet will not leave this host and the
			   route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could do this even if oif is unknown,
			   as IPv6 likely does, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}
	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
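/* Fast path for output routes: probe the route cache first and fall back
 * to ip_route_output_slow() on a miss. On a hit, a wildcard saddr/daddr
 * in the flow is filled in from the cached route before returning.
 */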
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.destroy		= ipv4_dst_destroy,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};
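/* A blackhole route mimics the original route for metric and MTU queries
 * but discards every packet: both dst.input and dst.output are set to
 * dst_discard below. Callers such as the xfrm code use it when a flow
 * must be kept alive without a usable route.
 */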
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
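/* Encode one cached route as an RTM_NEWROUTE netlink message. This is
 * shared by the RTM_GETROUTE answer path and the full cache dump.
 */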
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
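/* Handle RTM_GETROUTE requests, i.e. what "ip route get <addr>" sends:
 * build a dummy skb, resolve it through ip_route_input() (when an input
 * interface is given) or ip_route_output_key(), and answer with
 * rt_fill_info().
 */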
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
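/* Dump the whole route cache for a netlink dump request; cb->args[] holds
 * the current hash bucket and chain index so that an interrupted dump can
 * resume where it left off.
 */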
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
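/* The sysctl below flushes the route cache when written to, e.g.:
 *
 *	sysctl -w net.ipv4.route.flush=1
 *
 * The value written is handed to rt_cache_flush() as the flush delay.
 */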
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
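/* These tunables appear under /proc/sys/net/ipv4/route/ once the skeleton
 * below is registered, e.g.:
 *
 *	sysctl net.ipv4.route.gc_timeout
 *	sysctl -w net.ipv4.route.gc_elasticity=8
 *
 * (the value 8 is only an example).
 */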
static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
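/* The route cache hash size can therefore be pinned from the kernel
 * command line, e.g. by booting with:
 *
 *	rhash_entries=262144
 *
 * (the value is only an example; when the parameter is absent, the size
 * is derived from available memory in ip_rt_init() below).
 */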
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif