/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <net/atmclip.h>
#include <net/secure_seq.h>
#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
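/*
 * For example: callers map a TOS byte to an index by masking with the
 * TOS bits and shifting right once (rt_tos2priority() in <net/route.h>
 * does exactly this).  A TOS of 0x10 (IPTOS_LOWDELAY) selects entry 8,
 * TC_PRIO_INTERACTIVE; a TOS of 0x08 (IPTOS_THROUGHPUT) selects
 * entry 4, TC_PRIO_BULK.
 */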
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lockless lookup in the hash.
 */
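/*
 * A minimal sketch of the resulting reader side (the real fast path is
 * ip_route_input_common() below; keys_match() here is a made-up
 * stand-in for the field comparisons it performs):
 *
 *	rcu_read_lock();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->dst.rt_next))
 *		if (keys_match(rth, daddr, saddr)) {
 *			dst_use(&rth->dst, jiffies);
 *			break;
 *		}
 *	rcu_read_unlock();
 */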
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends on
 * the number of CPUs.  (on lockdep we have a quite big spinlock_t, so keep
 * the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
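/*
 * E.g. with RT_HASH_LOCK_SZ == 256, hash buckets 0, 256, 512, ... all
 * share rt_hash_locks[0]; writers on different buckets only contend
 * when their bucket indexes collide modulo the lock-table size.
 */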
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_ATOMIC);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata	= {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
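/*
 * Note on the idiom above: each XOR term is zero exactly when the two
 * fields are equal, so the OR of all terms is zero iff every key field
 * matches.  This compares the whole tuple without a branch per field.
 * E.g. two entries differing only in rt_oif produce a non-zero
 * (rt1->rt_oif ^ rt2->rt_oif) term, making the whole expression false.
 */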
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This gives an estimate for rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
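/*
 * Worked example: with FRACT_BITS == 3, ONE == 8.  A chain whose five
 * entries all have distinct hash inputs accumulates 5 * ONE == 40 in
 * has_noalias()/slow_chain_length() below, and 40 >> FRACT_BITS
 * recovers the integer length 5.
 */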
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
}
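/*
 * E.g. a genid of G becomes G + [1..256].  Since rt_is_expired()
 * compares rt_genid for exact equality, every entry cached under G
 * becomes "expired" at once, without walking the hash table.
 */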
/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
 * Short description of GC goals.
 *
 * We want to build an algorithm which will keep the routing cache
 * at some equilibrium point, where the number of aged-off entries
 * is kept approximately equal to the number of newly generated ones.
 *
 * The current expiration strength is the variable "expire".
 * We try to adjust it dynamically, so that if networking
 * is idle, expire is large enough to keep enough warm entries,
 * and when load increases it reduces to limit cache size.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:
		 *
		 * - expire has been reduced to zero; otherwise, expire is halved.
		 * - the table is not full.
		 * - we are called from interrupt.
		 * - the jiffies check is just a fallback/debug loop breaker.
		 *   We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it.
		 *
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache;
			   most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!n || !(n->nud_state & NUD_VALID)) {
		if (n)
			neigh_event_send(n, NULL);
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
	return 0;
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
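/*
 * A worked example with the defaults above, assuming HZ == 1000:
 * ip_rt_redirect_load is HZ/50 == 20 jiffies, so the k-th redirect is
 * allowed only 20 << k jiffies after the previous one (20 ms, 40 ms,
 * 80 ms, ...).  After ip_rt_redirect_number == 9 redirects we go
 * silent; ip_rt_redirect_silence == (HZ/50) << 10, about 20.5 s of
 * quiet, then resets rate_tokens and restarts the cycle.
 */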
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
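/*
 * For example, guess_mtu(1500) walks past 32000, 17914, 8166, 4352 and
 * 2002 and returns 1492, the first plateau strictly below the old MTU;
 * anything at or below 128 falls through to the 68-byte IPv4 minimum.
 */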
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}
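/*
 * Example: a Fragmentation Needed message advertising new_mtu == 300
 * is below ip_rt_min_pmtu (512 + 20 + 20 == 552 by default), so the
 * path MTU recorded on the peer is clamped up to 552 and re-expires
 * after ip_rt_mtu_expires (10 minutes by default).
 */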
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
static struct rtable *ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway) {
				if (check_peer_redir(&rt->dst, peer))
					return NULL;
			}
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
	return rt;
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	dst = (struct dst_entry *) ipv4_validate_peer(rt);
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned.
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
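/*
 * Example of the clamping above: a route whose MTU metric is locked
 * and whose next hop is a real gateway (rt_gateway != rt_dst) is
 * capped at the 576-byte conservative default, and every result is
 * finally bounded by IP_MAX_MTU (0xFFF0).
 */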
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}

static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route, if it is
		 * invalid for proxy arp.  DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
/*
 * NOTE. We drop all the packets that have local source
 * addresses, because every properly looped back packet
 * must have the correct destination already attached by the output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with 100% guarantee.
 *
 * called with rcu_read_lock()
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			rth = ipv4_validate_peer(rth);
			if (!rth)
				continue;
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As a result the host on a multicasting
	 * network acquires a lot of useless route cache entries, sort of
	 * SDR messages from all the world. Now we try to get rid of them.
	 * Really, provided software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note, that multicast routers are not affected, because
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
2440 /* called with rcu_read_lock() */
2441 static struct rtable *__mkroute_output(const struct fib_result *res,
2442 const struct flowi4 *fl4,
2443 __be32 orig_daddr, __be32 orig_saddr,
2444 int orig_oif, struct net_device *dev_out,
2447 struct fib_info *fi = res->fi;
2448 u32 tos = RT_FL_TOS(fl4);
2449 struct in_device *in_dev;
2450 u16 type = res->type;
2453 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2454 return ERR_PTR(-EINVAL);
2456 if (ipv4_is_lbcast(fl4->daddr))
2457 type = RTN_BROADCAST;
2458 else if (ipv4_is_multicast(fl4->daddr))
2459 type = RTN_MULTICAST;
2460 else if (ipv4_is_zeronet(fl4->daddr))
2461 return ERR_PTR(-EINVAL);
2463 if (dev_out->flags & IFF_LOOPBACK)
2464 flags |= RTCF_LOCAL;
2466 in_dev = __in_dev_get_rcu(dev_out);
2468 return ERR_PTR(-EINVAL);
2470 if (type == RTN_BROADCAST) {
2471 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2473 } else if (type == RTN_MULTICAST) {
2474 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2475 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2477 flags &= ~RTCF_LOCAL;
2478 /* If multicast route do not exist use
2479 * default one, but do not gateway in this case.
2482 if (fi && res->prefixlen < 4)
2486 rth = rt_dst_alloc(dev_out,
2487 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2488 IN_DEV_CONF_GET(in_dev, NOXFRM));
2490 return ERR_PTR(-ENOBUFS);
	rth->dst.output = ip_output;

	rth->rt_key_dst = orig_daddr;
	rth->rt_key_src = orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags = flags;
	rth->rt_type = type;
	rth->rt_key_tos = tos;
	rth->rt_dst = fl4->daddr;
	rth->rt_src = fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif = orig_oif ? : dev_out->ifindex;
	rth->rt_oif = orig_oif;
	rth->rt_mark = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst = fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);
	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
 * Major route resolver routine.
 * Takes rcu_read_lock() itself and releases it before returning.
 */

static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	u32 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;
		/* I removed the check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong iface
		 *    if saddr is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with a
		 *    saddr of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;
			/* Special hack: the user can direct multicasts
			 * and limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun; it allows
			 * vic, vat and friends to work.
			 * They bind a socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are
			 * broken, because we are not allowed to build a
			 * multicast path with a loopback source addr (the
			 * routing cache cannot know that ttl is zero, so
			 * the packet will not leave this host and the route
			 * is valid).
			 * Luckily, this hack is a good workaround.
			 */
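			/* For comparison, the conventional way to pin
			 * multicast output to a given interface from user
			 * space (a hedged sketch, not code from this file)
			 * would be:
			 *
			 *	struct ip_mreqn mreq = {
			 *		.imr_ifindex = ifindex,
			 *	};
			 *	setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF,
			 *		   &mreq, sizeof(mreq));
			 *
			 * The hack above lets such applications get away
			 * with a plain bind() instead.
			 */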
			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, the routing tables are wrong. Assume
			 * that the destination is on link.
			 *
			 * WHY? DW.
			 * Because we are allowed to send to an iface
			 * even if it has NO routes and NO assigned
			 * addresses. When oif is specified, routing
			 * tables are looked up with only one purpose:
			 * to catch if the destination is gatewayed rather
			 * than direct. Moreover, if MSG_DONTROUTE is set,
			 * we send a packet, ignoring both routing tables
			 * and ifaddr state. --ANK
			 *
			 * We could make it even if oif is unknown,
			 * likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}
	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			rth = ipv4_validate_peer(rth);
			if (!rth)
				continue;
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
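/*
 * A minimal sketch (illustrative only, compiled out) of an in-kernel
 * caller resolving an output route. The destination address is an assumed
 * example from the RFC 5737 documentation range, and the error handling
 * is illustrative. ip_route_output_key() is the include/net/route.h
 * wrapper that ends up here via ip_route_output_flow().
 */
#if 0
static struct rtable *example_resolve_output_route(struct net *net)
{
	struct flowi4 fl4 = {
		.daddr = htonl(0xC0000201),	/* 192.0.2.1, assumed */
		.flowi4_tos = 0,
		.flowi4_oif = 0,		/* let the resolver choose */
	};
	struct rtable *rt;

	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return rt;		/* e.g. ERR_PTR(-ENETUNREACH) */

	/* fl4.saddr now holds the source address the resolver selected. */
	return rt;
}
#endif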
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.destroy		= ipv4_dst_destroy,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	/* Clone the original route's keys into a dst whose input and
	 * output handlers silently discard every packet, then drop the
	 * reference on the original.
	 */
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->rt_key_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		skb->mark = mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
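/*
 * This handler is what backs "ip route get" style queries. A hedged
 * userspace illustration (iproute2 output format is approximate; the
 * addresses are taken from documentation ranges):
 *
 *	$ ip route get 192.0.2.1
 *	192.0.2.1 via 198.51.100.1 dev eth0 src 198.51.100.2 cache
 *
 * iproute2 sends an RTM_GETROUTE request carrying RTA_DST (and optionally
 * RTA_SRC, RTA_IIF, RTA_OIF and RTA_MARK) and receives the RTM_NEWROUTE
 * reply built by rt_fill_info() above.
 */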
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
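/*
 * Usage note (illustrative): writing an integer to
 * /proc/sys/net/ipv4/route/flush, e.g.
 *
 *	echo -1 > /proc/sys/net/ipv4/route/flush
 *
 * invokes the handler above; the value written is passed to
 * rt_cache_flush() as the flush delay.
 */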
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
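/*
 * Usage note (illustrative): the route cache hash size can be forced at
 * boot time from the kernel command line, e.g.
 *
 *	rhash_entries=262144
 *
 * When it is not given, ip_rt_init() below sizes the table from available
 * memory via alloc_large_system_hash().
 */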
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif