2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
100 #include <net/ip_fib.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
108 #include <linux/sysctl.h>
/*
 * NOTE(review): this listing is an elided excerpt of the kernel's IPv4
 * routing-cache code; the leading number on each line is an extraction
 * artifact and many original lines are missing throughout this file.
 */
/* Mask a flow's TOS down to the bits routing cares about, plus RTO_ONLINK. */
111 #define RT_FL_TOS(oldflp) \
112 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
/* Hard ceiling on any MTU value stored in a cached route. */
114 #define IP_MAX_MTU 0xFFF0
/* Default per-entry expiry used by the garbage collector (5 minutes). */
116 #define RT_GC_TIMEOUT (300*HZ)
/*
 * Routing-cache tunables. NOTE(review): these presumably back sysctl knobs
 * registered elsewhere in the file (not visible here) -- confirm.
 */
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
/* ICMP-redirect rate limiting: count, per-send cost, and back-off window. */
122 static int ip_rt_redirect_number __read_mostly = 9;
123 static int ip_rt_redirect_load __read_mostly = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
/* ICMP error token-bucket parameters used by ip_error() below. */
125 static int ip_rt_error_cost __read_mostly = HZ;
126 static int ip_rt_error_burst __read_mostly = 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly = 8;
128 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
/* 512 payload + 20 IP header + 20 slack: floor for learned path MTUs. */
129 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly = 256;
131 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
/* Deferred work that periodically expires cache entries (see rt_worker_func). */
133 static void rt_worker_func(struct work_struct *work);
134 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
137 * Interface to generic destination cache.
/* Forward declarations for the dst_ops callbacks filled in below. */
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static void ipv4_dst_destroy(struct dst_entry *dst);
142 static void ipv4_dst_ifdown(struct dst_entry *dst,
143 struct net_device *dev, int how);
144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145 static void ipv4_link_failure(struct sk_buff *skb);
146 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
147 static int rt_garbage_collect(struct dst_ops *ops);
/*
 * dst_ops vtable binding the IPv4 routing cache into the generic dst layer.
 * NOTE(review): several initializer lines and the closing brace are elided
 * from this listing.
 */
150 static struct dst_ops ipv4_dst_ops = {
152 .protocol = __constant_htons(ETH_P_IP),
153 .gc = rt_garbage_collect,
154 .check = ipv4_dst_check,
155 .destroy = ipv4_dst_destroy,
156 .ifdown = ipv4_dst_ifdown,
157 .negative_advice = ipv4_negative_advice,
158 .link_failure = ipv4_link_failure,
159 .update_pmtu = ip_rt_update_pmtu,
160 .local_out = __ip_local_out,
161 .entry_size = sizeof(struct rtable),
162 .entries = ATOMIC_INIT(0),
/* ECN_OR_COST maps a traffic class name to its TC_PRIO_* constant. */
165 #define ECN_OR_COST(class) TC_PRIO_##class
/*
 * TOS (4-bit) -> packet-scheduler priority table. NOTE(review): several
 * table entries are elided from this listing.
 */
167 const __u8 ip_tos2prio[16] = {
171 ECN_OR_COST(BESTEFFORT),
177 ECN_OR_COST(INTERACTIVE),
179 ECN_OR_COST(INTERACTIVE),
180 TC_PRIO_INTERACTIVE_BULK,
181 ECN_OR_COST(INTERACTIVE_BULK),
182 TC_PRIO_INTERACTIVE_BULK,
183 ECN_OR_COST(INTERACTIVE_BULK)
191 /* The locking scheme is rather straight forward:
193 * 1) Read-Copy Update protects the buckets of the central route hash.
194 * 2) Only writers remove entries, and they hold the lock
195 * as they look at rtable reference counts.
196 * 3) Only readers acquire references to rtable entries,
197 * they do so with atomic increments and with the
/* One hash bucket: head of an RCU-protected singly linked rtable chain. */
201 struct rt_hash_bucket {
202 struct rtable *chain;
204 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
205 defined(CONFIG_PROVE_LOCKING)
207 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
208 * The size of this table is a power of two and depends on the number of CPUS.
209 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
/* NOTE(review): the NR_CPUS ladder selecting among these sizes is elided. */
211 #ifdef CONFIG_LOCKDEP
212 # define RT_HASH_LOCK_SZ 256
215 # define RT_HASH_LOCK_SZ 4096
217 # define RT_HASH_LOCK_SZ 2048
219 # define RT_HASH_LOCK_SZ 1024
221 # define RT_HASH_LOCK_SZ 512
223 # define RT_HASH_LOCK_SZ 256
227 static spinlock_t *rt_hash_locks;
/* Map a bucket index to its (shared) spinlock by masking into the table. */
228 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
/* Allocate and initialize the spinlock table; panics on OOM at boot. */
230 static __init void rt_hash_lock_init(void)
234 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 panic("IP: failed to allocate rt_hash_locks\n");
239 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
240 spin_lock_init(&rt_hash_locks[i]);
/* UP/non-debug build: no per-bucket locking needed. */
243 # define rt_hash_lock_addr(slot) NULL
245 static inline void rt_hash_lock_init(void)
/* The central route-cache hash table and its geometry. */
250 static struct rt_hash_bucket *rt_hash_table __read_mostly;
251 static unsigned rt_hash_mask __read_mostly;
252 static unsigned int rt_hash_log __read_mostly;
/* Generation counter: bumping it invalidates every cached entry lazily. */
253 static atomic_t rt_genid __read_mostly;
255 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
256 #define RT_CACHE_STAT_INC(field) \
257 (__raw_get_cpu_var(rt_cache_stat).field++)
/* Hash a (daddr, saddr, ifindex) triple into a bucket index via jhash. */
259 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
262 return jhash_3words((__force u32)(__be32)(daddr),
263 (__force u32)(__be32)(saddr),
268 #ifdef CONFIG_PROC_FS
/* Iterator state for walking the route cache in /proc/net/rt_cache. */
269 struct rt_cache_iter_state {
270 struct seq_net_private p;
/*
 * Find the first visible cache entry: walk buckets from the top, skipping
 * entries from other namespaces or stale generations.
 */
275 static struct rtable *rt_cache_get_first(struct seq_file *seq)
277 struct rt_cache_iter_state *st = seq->private;
278 struct rtable *r = NULL;
280 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
282 r = rcu_dereference(rt_hash_table[st->bucket].chain);
284 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
285 r->rt_genid == st->genid)
287 r = rcu_dereference(r->u.dst.rt_next);
289 rcu_read_unlock_bh();
/* Advance to the next entry, moving to the next bucket when a chain ends. */
294 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
297 struct rt_cache_iter_state *st = seq->private;
298 r = r->u.dst.rt_next;
300 rcu_read_unlock_bh();
301 if (--st->bucket < 0)
304 r = rt_hash_table[st->bucket].chain;
306 return rcu_dereference(r);
/* Like __rt_cache_get_next() but filters by namespace and generation. */
309 static struct rtable *rt_cache_get_next(struct seq_file *seq,
312 struct rt_cache_iter_state *st = seq->private;
313 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
314 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
316 if (r->rt_genid == st->genid)
/* Position the iterator at entry number 'pos' (0-based). */
322 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
324 struct rtable *r = rt_cache_get_first(seq);
327 while (pos && (r = rt_cache_get_next(seq, r)))
329 return pos ? NULL : r;
/* seq_file start: *pos == 0 yields the header token, else the pos-1'th entry. */
332 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
334 struct rt_cache_iter_state *st = seq->private;
336 return rt_cache_get_idx(seq, *pos - 1);
/* Snapshot the generation so the walk sees a consistent view. */
337 st->genid = atomic_read(&rt_genid);
338 return SEQ_START_TOKEN;
341 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
345 if (v == SEQ_START_TOKEN)
346 r = rt_cache_get_first(seq);
348 r = rt_cache_get_next(seq, v);
/* stop: drop the RCU read lock taken while iterating real entries. */
353 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
355 if (v && v != SEQ_START_TOKEN)
356 rcu_read_unlock_bh();
/* Print the column header or one cache entry in the legacy fixed format. */
359 static int rt_cache_seq_show(struct seq_file *seq, void *v)
361 if (v == SEQ_START_TOKEN)
362 seq_printf(seq, "%-127s\n",
363 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
364 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
367 struct rtable *r = v;
370 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
371 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
372 r->u.dst.dev ? r->u.dst.dev->name : "*",
373 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
374 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
375 r->u.dst.__use, 0, (unsigned long)r->rt_src,
376 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
377 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
378 dst_metric(&r->u.dst, RTAX_WINDOW),
379 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
380 dst_metric(&r->u.dst, RTAX_RTTVAR)),
382 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
383 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
385 r->rt_spec_dst, &len);
/* Pad each record to the fixed 127-column width expected by userspace. */
387 seq_printf(seq, "%*s\n", 127 - len, "");
392 static const struct seq_operations rt_cache_seq_ops = {
393 .start = rt_cache_seq_start,
394 .next = rt_cache_seq_next,
395 .stop = rt_cache_seq_stop,
396 .show = rt_cache_seq_show,
399 static int rt_cache_seq_open(struct inode *inode, struct file *file)
401 return seq_open_net(inode, file, &rt_cache_seq_ops,
402 sizeof(struct rt_cache_iter_state))
405 static const struct file_operations rt_cache_seq_fops = {
406 .owner = THIS_MODULE,
407 .open = rt_cache_seq_open,
410 .release = seq_release_net,
/*
 * seq_file iterator over per-CPU cache statistics (/proc/net/stat/rt_cache).
 * Position 0 is a header token; positions 1..N map to possible CPUs.
 */
414 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
419 return SEQ_START_TOKEN;
421 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
422 if (!cpu_possible(cpu))
425 return &per_cpu(rt_cache_stat, cpu);
/* Advance to the next possible CPU's stat block. */
430 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
434 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
435 if (!cpu_possible(cpu))
438 return &per_cpu(rt_cache_stat, cpu);
/* Nothing to release: iteration holds no locks. */
444 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
/* Emit the header line or one CPU's counters as hex fields. */
449 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
451 struct rt_cache_stat *st = v;
453 if (v == SEQ_START_TOKEN) {
454 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n")
458 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
459 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
460 atomic_read(&ipv4_dst_ops.entries),
483 static const struct seq_operations rt_cpu_seq_ops = {
484 .start = rt_cpu_seq_start,
485 .next = rt_cpu_seq_next,
486 .stop = rt_cpu_seq_stop,
487 .show = rt_cpu_seq_show,
491 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
493 return seq_open(file, &rt_cpu_seq_ops);
496 static const struct file_operations rt_cpu_seq_fops = {
497 .owner = THIS_MODULE,
498 .open = rt_cpu_seq_open,
501 .release = seq_release,
504 #ifdef CONFIG_NET_CLS_ROUTE
/*
 * Legacy procfs read handler exporting per-CPU route accounting counters
 * as raw u32 words, summed across all possible CPUs. Offset and length
 * must be word-aligned; reads are clamped to the table size.
 */
505 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
506 int length, int *eof, void *data)
510 if ((offset & 3) || (length & 3))
513 if (offset >= sizeof(struct ip_rt_acct) * 256) {
518 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
519 length = sizeof(struct ip_rt_acct) * 256 - offset;
/* Convert the byte offset into a u32 index into each CPU's table. */
523 offset /= sizeof(u32);
526 u32 *dst = (u32 *) buffer;
529 memset(dst, 0, length);
/* Accumulate the requested window from every possible CPU's copy. */
531 for_each_possible_cpu(i) {
535 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
536 for (j = 0; j < length/4; j++)
/*
 * Per-namespace proc setup: registers rt_cache (cache dump), the per-CPU
 * stat file, and optionally rt_acct. NOTE(review): error-unwind labels
 * between the registrations are elided from this listing.
 */
544 static int __net_init ip_rt_do_proc_init(struct net *net)
546 struct proc_dir_entry *pde;
548 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
553 pde = proc_create("rt_cache", S_IRUGO,
554 net->proc_net_stat, &rt_cpu_seq_fops);
558 #ifdef CONFIG_NET_CLS_ROUTE
559 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
560 ip_rt_acct_read, NULL);
566 #ifdef CONFIG_NET_CLS_ROUTE
568 remove_proc_entry("rt_cache", net->proc_net_stat);
571 remove_proc_entry("rt_cache", net->proc_net);
/* Per-namespace proc teardown, mirroring ip_rt_do_proc_init(). */
576 static void __net_exit ip_rt_do_proc_exit(struct net *net)
578 remove_proc_entry("rt_cache", net->proc_net_stat);
579 remove_proc_entry("rt_cache", net->proc_net);
580 remove_proc_entry("rt_acct", net->proc_net);
583 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
584 .init = ip_rt_do_proc_init,
585 .exit = ip_rt_do_proc_exit,
588 static int __init ip_rt_proc_init(void)
590 return register_pernet_subsys(&ip_rt_proc_ops);
/* !CONFIG_PROC_FS stub: nothing to register. */
594 static inline int ip_rt_proc_init(void)
598 #endif /* CONFIG_PROC_FS */
/* Release a cache entry after the current RCU-bh grace period. */
600 static inline void rt_free(struct rtable *rt)
602 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
/* Like rt_free(); NOTE(review): the stat bump distinguishing it is elided. */
605 static inline void rt_drop(struct rtable *rt)
608 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
611 static inline int rt_fast_clean(struct rtable *rth)
613 /* Kill broadcast/multicast entries very aggresively, if they
614 collide in hash table with more useful entries */
615 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
616 rth->fl.iif && rth->u.dst.rt_next;
/* An entry worth keeping: redirected/notify routes (other tests elided). */
619 static inline int rt_valuable(struct rtable *rth)
621 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
/*
 * Decide whether an entry may be evicted now. Referenced entries never
 * expire; entries past their hard expiry always do; otherwise compare age
 * against tmo1 (fast-clean candidates) and tmo2 (valuable entries).
 */
625 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
630 if (atomic_read(&rth->u.dst.__refcnt))
634 if (rth->u.dst.expires &&
635 time_after_eq(jiffies, rth->u.dst.expires))
638 age = jiffies - rth->u.dst.lastuse;
640 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
641 (age <= tmo2 && rt_valuable(rth)))
647 /* Bits of score are:
649 * 30: not quite useless
650 * 29..0: usage counter
/* Higher score == more worth keeping when chains must be pruned. */
652 static inline u32 rt_score(struct rtable *rt)
654 u32 score = jiffies - rt->u.dst.lastuse;
656 score = ~score & ~(3<<30);
662 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
/*
 * Compare two flow keys for cache-lookup equality: daddr, saddr, mark,
 * tos+scope (read as one u16), oif and iif must all match.
 */
668 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
670 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
671 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
672 (fl1->mark ^ fl2->mark) |
673 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
674 *(u16 *)&fl2->nl_u.ip4_u.tos) |
675 (fl1->oif ^ fl2->oif) |
676 (fl1->iif ^ fl2->iif)) == 0;
/* Two cache entries belong to the same network namespace. */
679 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
681 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
685 * Perform a full scan of hash table and free all entries.
686 * Can be called by a softirq or a process.
687 * In the later case, we want to be reschedule if necessary
689 static void rt_do_flush(int process_context)
692 struct rtable *rth, *next;
694 for (i = 0; i <= rt_hash_mask; i++) {
/* In process context, yield between buckets if preemption is wanted. */
695 if (process_context && need_resched())
697 rth = rt_hash_table[i].chain;
/* Detach the whole chain under the bucket lock, then free it unlocked. */
701 spin_lock_bh(rt_hash_lock_addr(i));
702 rth = rt_hash_table[i].chain;
703 rt_hash_table[i].chain = NULL;
704 spin_unlock_bh(rt_hash_lock_addr(i));
706 for (; rth; rth = next) {
707 next = rth->u.dst.rt_next;
/*
 * Scan a slice of the hash table per invocation (resuming at 'rover'),
 * unlinking entries that are stale-generation, hard-expired, or judged
 * evictable by rt_may_expire(). Driven periodically by rt_worker_func().
 */
713 static void rt_check_expire(void)
715 static unsigned int rover;
716 unsigned int i = rover, goal;
717 struct rtable *rth, **rthp;
/* goal = buckets-per-pass, scaled so the whole table is covered roughly
 * once per ip_rt_gc_timeout. */
720 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
721 if (ip_rt_gc_timeout > 1)
722 do_div(mult, ip_rt_gc_timeout);
723 goal = (unsigned int)mult;
724 if (goal > rt_hash_mask)
725 goal = rt_hash_mask + 1;
726 for (; goal > 0; goal--) {
727 unsigned long tmo = ip_rt_gc_timeout;
729 i = (i + 1) & rt_hash_mask;
730 rthp = &rt_hash_table[i].chain;
737 spin_lock_bh(rt_hash_lock_addr(i));
738 while ((rth = *rthp) != NULL) {
/* Entries from an old generation are dead regardless of age. */
739 if (rth->rt_genid != atomic_read(&rt_genid)) {
740 *rthp = rth->u.dst.rt_next;
744 if (rth->u.dst.expires) {
745 /* Entry is expired even if it is in use */
746 if (time_before_eq(jiffies, rth->u.dst.expires)) {
748 rthp = &rth->u.dst.rt_next;
751 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
753 rthp = &rth->u.dst.rt_next;
757 /* Cleanup aged off entries. */
758 *rthp = rth->u.dst.rt_next;
761 spin_unlock_bh(rt_hash_lock_addr(i));
767 * rt_worker_func() is run in process context.
768 * we call rt_check_expire() to scan part of the hash table
/* Periodic work item: expire a slice, then re-arm itself. */
770 static void rt_worker_func(struct work_struct *work)
773 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
777 * Pertubation of rt_genid by a small quantity [1..256]
778 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
779 * many times (2^24) without giving recent rt_genid.
780 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
/* Bump the generation counter by a random 1..256 so all entries go stale. */
782 static void rt_cache_invalidate(struct net *net)
784 unsigned char shuffle;
786 get_random_bytes(&shuffle, sizeof(shuffle));
787 atomic_add(shuffle + 1U, &rt_genid);
791 * delay < 0 : invalidate cache (fast : entries will be deleted later)
792 * delay >= 0 : invalidate & flush cache (can be long)
794 void rt_cache_flush(struct net *net, int delay)
796 rt_cache_invalidate(net);
/* Synchronous flush may resched, but only when not called from softirq. */
798 rt_do_flush(!in_softirq());
802 * We change rt_genid and let gc do the cleanup
/* Timer callback: periodically re-key the cache by invalidating it. */
804 static void rt_secret_rebuild(unsigned long __net)
806 struct net *net = (struct net *)__net;
807 rt_cache_invalidate(net);
808 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
812 Short description of GC goals.
814 We want to build algorithm, which will keep routing cache
815 at some equilibrium point, when number of aged off entries
816 is kept approximately equal to newly generated ones.
818 Current expiration strength is variable "expire".
819 We try to adjust it dynamically, so that if networking
820 is idle expires is large enough to keep enough of warm entries,
821 and when load increases it reduces to limit cache size.
/*
 * dst_ops .gc callback. Computes an eviction goal from cache occupancy,
 * then sweeps buckets (resuming at a static 'rover') evicting entries via
 * rt_may_expire() with a progressively relaxed 'expire' threshold.
 * NOTE(review): many statements (returns, 'rover' update, loop scaffolding)
 * are elided from this listing.
 */
824 static int rt_garbage_collect(struct dst_ops *ops)
826 static unsigned long expire = RT_GC_TIMEOUT;
827 static unsigned long last_gc;
829 static int equilibrium;
830 struct rtable *rth, **rthp;
831 unsigned long now = jiffies;
835 * Garbage collection is pretty expensive,
836 * do not make it too frequently.
839 RT_CACHE_STAT_INC(gc_total);
/* Rate-limit GC unless the cache has hit its hard size cap. */
841 if (now - last_gc < ip_rt_gc_min_interval &&
842 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
843 RT_CACHE_STAT_INC(gc_ignored);
847 /* Calculate number of entries, which we want to expire now. */
848 goal = atomic_read(&ipv4_dst_ops.entries) -
849 (ip_rt_gc_elasticity << rt_hash_log);
851 if (equilibrium < ipv4_dst_ops.gc_thresh)
852 equilibrium = ipv4_dst_ops.gc_thresh;
853 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
855 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
856 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
859 /* We are in dangerous area. Try to reduce cache really
862 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
863 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
866 if (now - last_gc >= ip_rt_gc_min_interval)
/* Sweep every bucket once, starting after where the last GC stopped. */
877 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
878 unsigned long tmo = expire;
880 k = (k + 1) & rt_hash_mask;
881 rthp = &rt_hash_table[k].chain;
882 spin_lock_bh(rt_hash_lock_addr(k));
883 while ((rth = *rthp) != NULL) {
/* Keep current-generation entries that are not yet evictable. */
884 if (rth->rt_genid == atomic_read(&rt_genid) &&
885 !rt_may_expire(rth, tmo, expire)) {
887 rthp = &rth->u.dst.rt_next;
890 *rthp = rth->u.dst.rt_next;
894 spin_unlock_bh(rt_hash_lock_addr(k));
903 /* Goal is not achieved. We stop process if:
905 - if expire reduced to zero. Otherwise, expire is halfed.
906 - if table is not full.
907 - if we are called from interrupt.
908 - jiffies check is just fallback/debug loop breaker.
909 We will not spin here for long time in any case.
912 RT_CACHE_STAT_INC(gc_goal_miss);
918 #if RT_CACHE_DEBUG >= 2
919 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
920 atomic_read(&ipv4_dst_ops.entries), goal, i);
923 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
925 } while (!in_softirq() && time_before_eq(jiffies, now));
927 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
930 printk(KERN_WARNING "dst cache overflow\n");
931 RT_CACHE_STAT_INC(gc_dst_overflow);
/* Goal met: relax 'expire' back toward ip_rt_gc_timeout for next time. */
935 expire += ip_rt_gc_min_interval;
936 if (expire > ip_rt_gc_timeout ||
937 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
938 expire = ip_rt_gc_timeout;
939 #if RT_CACHE_DEBUG >= 2
940 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
941 atomic_read(&ipv4_dst_ops.entries), goal, rover);
/*
 * Insert 'rt' into bucket 'hash', returning the resident entry via *rp.
 * If an equivalent entry already exists it is moved to the chain head and
 * reused instead; otherwise the chain is pruned (least-valuable unreferenced
 * candidate) and the new entry linked at the head. On neighbour-table
 * exhaustion, GC is forced once and the insert retried.
 * NOTE(review): the restart label, candidate bookkeeping and final return
 * are elided from this listing.
 */
946 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
948 struct rtable *rth, **rthp;
950 struct rtable *cand, **candp;
/* Only retry the GC path when we can sleep (not in softirq). */
953 int attempts = !in_softirq();
962 rthp = &rt_hash_table[hash].chain;
964 spin_lock_bh(rt_hash_lock_addr(hash));
965 while ((rth = *rthp) != NULL) {
/* Drop stale-generation entries as we walk. */
966 if (rth->rt_genid != atomic_read(&rt_genid)) {
967 *rthp = rth->u.dst.rt_next;
971 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
/* Duplicate found: promote it to the head of the chain. */
973 *rthp = rth->u.dst.rt_next;
975 * Since lookup is lockfree, the deletion
976 * must be visible to another weakly ordered CPU before
977 * the insertion at the start of the hash chain.
979 rcu_assign_pointer(rth->u.dst.rt_next,
980 rt_hash_table[hash].chain);
982 * Since lookup is lockfree, the update writes
983 * must be ordered for consistency on SMP.
985 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
987 dst_use(&rth->u.dst, now);
988 spin_unlock_bh(rt_hash_lock_addr(hash));
/* Track the lowest-scoring unreferenced entry as eviction candidate. */
995 if (!atomic_read(&rth->u.dst.__refcnt)) {
996 u32 score = rt_score(rth);
998 if (score <= min_score) {
1007 rthp = &rth->u.dst.rt_next;
1011 /* ip_rt_gc_elasticity used to be average length of chain
1012 * length, when exceeded gc becomes really aggressive.
1014 * The second limit is less certain. At the moment it allows
1015 * only 2 entries per bucket. We will see.
1017 if (chain_length > ip_rt_gc_elasticity) {
1018 *candp = cand->u.dst.rt_next;
1023 /* Try to bind route to arp only if it is output
1024 route or unicast forwarding path.
1026 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1027 int err = arp_bind_neighbour(&rt->u.dst);
1029 spin_unlock_bh(rt_hash_lock_addr(hash));
1031 if (err != -ENOBUFS) {
1036 /* Neighbour tables are full and nothing
1037 can be released. Try to shrink route cache,
1038 it is most likely it holds some neighbour records.
/* Force one aggressive GC pass, then restore the tunables and retry. */
1040 if (attempts-- > 0) {
1041 int saved_elasticity = ip_rt_gc_elasticity;
1042 int saved_int = ip_rt_gc_min_interval;
1043 ip_rt_gc_elasticity = 1;
1044 ip_rt_gc_min_interval = 0;
1045 rt_garbage_collect(&ipv4_dst_ops);
1046 ip_rt_gc_min_interval = saved_int;
1047 ip_rt_gc_elasticity = saved_elasticity;
1051 if (net_ratelimit())
1052 printk(KERN_WARNING "Neighbour table overflow.\n");
/* Link the new entry at the chain head (publication elided below). */
1058 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1059 #if RT_CACHE_DEBUG >= 2
1060 if (rt->u.dst.rt_next) {
1062 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1063 NIPQUAD(rt->rt_dst));
1064 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1065 printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1069 rt_hash_table[hash].chain = rt;
1070 spin_unlock_bh(rt_hash_lock_addr(hash));
/*
 * Attach an inet_peer for rt->rt_dst to the route. The spinlock only
 * serializes the rt->peer assignment; lookup happens outside it.
 */
1075 void rt_bind_peer(struct rtable *rt, int create)
1077 static DEFINE_SPINLOCK(rt_peer_lock);
1078 struct inet_peer *peer;
1080 peer = inet_getpeer(rt->rt_dst, create);
1082 spin_lock_bh(&rt_peer_lock);
/* Another CPU may have won the race; only install if still unset. */
1083 if (rt->peer == NULL) {
1087 spin_unlock_bh(&rt_peer_lock);
1093 * Peer allocation may fail only in serious out-of-memory conditions. However
1094 * we still can generate some output.
1095 * Random ID selection looks a bit dangerous because we have no chances to
1096 * select ID being unique in a reasonable period of time.
1097 * But broken packet identifier may be better than no packet at all.
/* Fallback IP-ID generator used when no inet_peer could be bound. */
1099 static void ip_select_fb_ident(struct iphdr *iph)
1101 static DEFINE_SPINLOCK(ip_fb_id_lock);
1102 static u32 ip_fallback_id;
1105 spin_lock_bh(&ip_fb_id_lock);
1106 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1107 iph->id = htons(salt & 0xFFFF);
1108 ip_fallback_id = salt;
1109 spin_unlock_bh(&ip_fb_id_lock);
/* Fill iph->id from the destination's peer counter, or the fallback. */
1112 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1114 struct rtable *rt = (struct rtable *) dst;
1117 if (rt->peer == NULL)
1118 rt_bind_peer(rt, 1);
1120 /* If peer is attached to destination, it is never detached,
1121 so that we need not to grab a lock to dereference it.
1124 iph->id = htons(inet_getid(rt->peer, more));
1128 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1129 __builtin_return_address(0));
1131 ip_select_fb_ident(iph);
/*
 * Unlink 'rt' from bucket 'hash'; stale-generation entries found on the
 * way are unlinked too. Runs under the bucket spinlock.
 */
1134 static void rt_del(unsigned hash, struct rtable *rt)
1136 struct rtable **rthp, *aux;
1138 rthp = &rt_hash_table[hash].chain;
1139 spin_lock_bh(rt_hash_lock_addr(hash));
1141 while ((aux = *rthp) != NULL) {
1142 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1143 *rthp = aux->u.dst.rt_next;
1147 rthp = &aux->u.dst.rt_next;
1149 spin_unlock_bh(rt_hash_lock_addr(hash));
/*
 * Handle an incoming ICMP redirect: validate the advertised gateway, then
 * for each matching cached route clone it with the new gateway, bind its
 * neighbour, and replace the old entry. Bogus redirects are logged and
 * dropped. NOTE(review): early-exit checks, loop braces and cleanup paths
 * are elided from this listing.
 */
1152 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1153 __be32 saddr, struct net_device *dev)
1156 struct in_device *in_dev = in_dev_get(dev);
1157 struct rtable *rth, **rthp;
/* Try both the exact source key and the wildcard (0) key/ifindex. */
1158 __be32 skeys[2] = { saddr, 0 };
1159 int ikeys[2] = { dev->ifindex, 0 };
1160 struct netevent_redirect netevent;
/* Sanity-check the advertised gateway before touching the cache. */
1167 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1168 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1169 || ipv4_is_zeronet(new_gw))
1170 goto reject_redirect;
1172 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1173 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1174 goto reject_redirect;
1175 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1176 goto reject_redirect;
1178 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1179 goto reject_redirect;
1182 for (i = 0; i < 2; i++) {
1183 for (k = 0; k < 2; k++) {
1184 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1185 atomic_read(&rt_genid));
1187 rthp=&rt_hash_table[hash].chain;
1190 while ((rth = rcu_dereference(*rthp)) != NULL) {
/* Skip entries whose flow key doesn't match this redirect. */
1193 if (rth->fl.fl4_dst != daddr ||
1194 rth->fl.fl4_src != skeys[i] ||
1195 rth->fl.oif != ikeys[k] ||
1197 rth->rt_genid != atomic_read(&rt_genid) ||
1198 !net_eq(dev_net(rth->u.dst.dev), net)) {
1199 rthp = &rth->u.dst.rt_next;
/* Skip entries that already point elsewhere or another device. */
1203 if (rth->rt_dst != daddr ||
1204 rth->rt_src != saddr ||
1206 rth->rt_gateway != old_gw ||
1207 rth->u.dst.dev != dev)
1210 dst_hold(&rth->u.dst);
/* Allocate the replacement entry carrying the new gateway. */
1213 rt = dst_alloc(&ipv4_dst_ops);
1220 /* Copy all the information. */
1222 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1223 rt->u.dst.__use = 1;
1224 atomic_set(&rt->u.dst.__refcnt, 1);
1225 rt->u.dst.child = NULL;
1227 dev_hold(rt->u.dst.dev);
1229 in_dev_hold(rt->idev);
1230 rt->u.dst.obsolete = 0;
1231 rt->u.dst.lastuse = jiffies;
1232 rt->u.dst.path = &rt->u.dst;
1233 rt->u.dst.neighbour = NULL;
1234 rt->u.dst.hh = NULL;
1235 rt->u.dst.xfrm = NULL;
1236 rt->rt_genid = atomic_read(&rt_genid);
1237 rt->rt_flags |= RTCF_REDIRECTED;
1239 /* Gateway is different ... */
1240 rt->rt_gateway = new_gw;
1242 /* Redirect received -> path was valid */
1243 dst_confirm(&rth->u.dst);
1246 atomic_inc(&rt->peer->refcnt);
/* New gateway's neighbour must be resolvable or the clone is dropped. */
1248 if (arp_bind_neighbour(&rt->u.dst) ||
1249 !(rt->u.dst.neighbour->nud_state &
1251 if (rt->u.dst.neighbour)
1252 neigh_event_send(rt->u.dst.neighbour, NULL);
/* Tell interested subsystems (e.g. qdiscs) the path changed. */
1258 netevent.old = &rth->u.dst;
1259 netevent.new = &rt->u.dst;
1260 call_netevent_notifiers(NETEVENT_REDIRECT,
1264 if (!rt_intern_hash(hash, rt, &rt))
1277 #ifdef CONFIG_IP_ROUTE_VERBOSE
1278 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1279 printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1280 NIPQUAD_FMT " ignored.\n"
1281 " Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1282 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1283 NIPQUAD(saddr), NIPQUAD(daddr));
/*
 * dst_ops .negative_advice callback: called when a socket reports trouble
 * with this route. Obsolete entries are released; redirected or expiring
 * entries are removed from the cache. NOTE(review): the rt_del() call and
 * return paths are elided from this listing.
 */
1288 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1290 struct rtable *rt = (struct rtable *)dst;
1291 struct dst_entry *ret = dst;
1294 if (dst->obsolete) {
1297 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1298 rt->u.dst.expires) {
1299 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1301 atomic_read(&rt_genid));
1302 #if RT_CACHE_DEBUG >= 1
1303 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1304 NIPQUAD_FMT "/%02x dropped\n",
1305 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1316 * 1. The first ip_rt_redirect_number redirects are sent
1317 * with exponential backoff, then we stop sending them at all,
1318 * assuming that the host ignores our redirects.
1319 * 2. If we did not see packets requiring redirects
1320 * during ip_rt_redirect_silence, we assume that the host
1321 * forgot redirected route and start to send redirects again.
1323 * This algorithm is much cheaper and more intelligent than dumb load limiting
1326 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1327 * and "frag. need" (breaks PMTU discovery) in icmp.c.
/*
 * Send an ICMP_REDIRECT for a packet being forwarded back out the same
 * interface, rate-limited per route via rate_tokens/rate_last on the dst.
 */
1330 void ip_rt_send_redirect(struct sk_buff *skb)
1332 struct rtable *rt = skb->rtable;
1333 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1338 if (!IN_DEV_TX_REDIRECTS(in_dev))
1341 /* No redirected packets during ip_rt_redirect_silence;
1342 * reset the algorithm.
1344 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1345 rt->u.dst.rate_tokens = 0;
1347 /* Too many ignored redirects; do not send anything
1348 * set u.dst.rate_last to the last seen redirected packet.
1350 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1351 rt->u.dst.rate_last = jiffies;
1355 /* Check for load limit; set rate_last to the latest sent
/* Exponential backoff: each sent redirect doubles the next interval. */
1358 if (rt->u.dst.rate_tokens == 0 ||
1360 (rt->u.dst.rate_last +
1361 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1362 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1363 rt->u.dst.rate_last = jiffies;
1364 ++rt->u.dst.rate_tokens;
1365 #ifdef CONFIG_IP_ROUTE_VERBOSE
/* Log once, exactly when the host first exhausts its redirect budget. */
1366 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1367 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1369 printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1370 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1371 NIPQUAD(rt->rt_src), rt->rt_iif,
1372 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
/*
 * Input handler for error routes: translate dst.error into an ICMP
 * "destination unreachable" code and send it, governed by a token bucket
 * (rate_tokens accrues with time, each ICMP costs ip_rt_error_cost).
 * Always consumes the skb.
 */
1379 static int ip_error(struct sk_buff *skb)
1381 struct rtable *rt = skb->rtable;
1385 switch (rt->u.dst.error) {
1390 code = ICMP_HOST_UNREACH;
1393 code = ICMP_NET_UNREACH;
1394 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1397 code = ICMP_PKT_FILTERED;
/* Refill the bucket by elapsed jiffies, clamped to the burst cap. */
1402 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1403 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1404 rt->u.dst.rate_tokens = ip_rt_error_burst;
1405 rt->u.dst.rate_last = now;
1406 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1407 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1408 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1411 out: kfree_skb(skb);
1416 * The last two values are not from the RFC but
1417 * are needed for AMPRnet AX.25 paths.
/* RFC 1191 PMTU plateau table, descending; see the comment above about the
 * last two values being for AMPRnet AX.25 paths. */
1420 static const unsigned short mtu_plateau[] =
1421 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
/* Return the largest plateau strictly below @old_mtu; the fallthrough
 * return (presumably 68, the IPv4 minimum) is elided in this excerpt. */
1423 static inline unsigned short guess_mtu(unsigned short old_mtu)
1427 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1428 if (old_mtu > mtu_plateau[i])
1429 return mtu_plateau[i];
/*
 * Process an incoming ICMP "fragmentation needed": find all cached routes
 * matching the offending header (keyed on {daddr, saddr|0, ifindex|0}) and
 * lower their RTAX_MTU metric. Returns the estimated PMTU, or @new_mtu if
 * no cache entry matched. @old_mtu is taken from the quoted header's
 * tot_len. NOTE(review): locks/labels and some lines are elided here.
 */
1433 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1434 unsigned short new_mtu,
1435 struct net_device *dev)
1438 unsigned short old_mtu = ntohs(iph->tot_len);
1440 int ikeys[2] = { dev->ifindex, 0 };
1441 __be32 skeys[2] = { iph->saddr, 0, };
1442 __be32 daddr = iph->daddr;
1443 unsigned short est_mtu = 0;
/* PMTU discovery globally disabled: nothing to update. */
1445 if (ipv4_config.no_pmtu_disc)
/* Try both wildcard and exact source/ifindex hash keys. */
1448 for (k = 0; k < 2; k++) {
1449 for (i = 0; i < 2; i++) {
1450 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1451 atomic_read(&rt_genid));
1454 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1455 rth = rcu_dereference(rth->u.dst.rt_next)) {
1456 unsigned short mtu = new_mtu;
/* Skip non-matching, locked-MTU, foreign-netns or stale-generation entries. */
1458 if (rth->fl.fl4_dst != daddr ||
1459 rth->fl.fl4_src != skeys[i] ||
1460 rth->rt_dst != daddr ||
1461 rth->rt_src != iph->saddr ||
1462 rth->fl.oif != ikeys[k] ||
1464 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1465 !net_eq(dev_net(rth->u.dst.dev), net) ||
1466 rth->rt_genid != atomic_read(&rt_genid))
/* Bogus or useless advertised MTU (below IPv4 minimum 68, or not
 * actually smaller): fall back to the plateau table. */
1469 if (new_mtu < 68 || new_mtu >= old_mtu) {
1471 /* BSD 4.2 compatibility hack :-( */
1473 old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
1474 old_mtu >= 68 + (iph->ihl << 2))
1475 old_mtu -= iph->ihl << 2;
1477 mtu = guess_mtu(old_mtu);
1479 if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
1480 if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
1481 dst_confirm(&rth->u.dst);
/* Never go below ip_rt_min_pmtu; lock the metric so later
 * bogus ICMPs cannot shrink it further. */
1482 if (mtu < ip_rt_min_pmtu) {
1483 mtu = ip_rt_min_pmtu;
1484 rth->u.dst.metrics[RTAX_LOCK-1] |=
1487 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1488 dst_set_expires(&rth->u.dst,
1497 return est_mtu ? : new_mtu;
/*
 * dst_ops->update_pmtu: shrink this dst's MTU metric to @mtu if that is a
 * real reduction, the metric is not locked, and @mtu >= 68 (IPv4 minimum).
 * Values below ip_rt_min_pmtu are clamped and the MTU metric is locked.
 * Notifies netevent listeners so e.g. routing daemons see the change.
 */
1500 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1502 if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
1503 !(dst_metric_locked(dst, RTAX_MTU))) {
1504 if (mtu < ip_rt_min_pmtu) {
1505 mtu = ip_rt_min_pmtu;
1506 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1508 dst->metrics[RTAX_MTU-1] = mtu;
1509 dst_set_expires(dst, ip_rt_mtu_expires);
1510 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
/* dst_ops->check hook; body elided in this excerpt (IPv4 routes carry no
 * validity cookie, so presumably this just returns NULL -- TODO confirm). */
1514 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
/* dst_ops->destroy: drop the references a struct rtable holds (inet_peer
 * and in_device); the release calls themselves are elided in this excerpt. */
1519 static void ipv4_dst_destroy(struct dst_entry *dst)
1521 struct rtable *rt = (struct rtable *) dst;
1522 struct inet_peer *peer = rt->peer;
1523 struct in_device *idev = rt->idev;
/*
 * dst_ops->ifdown: when @dev goes away, retarget the route's cached
 * in_device reference at the namespace loopback device so the dst can
 * outlive the interface. The put of the old idev is elided in this excerpt.
 */
1536 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1539 struct rtable *rt = (struct rtable *) dst;
1540 struct in_device *idev = rt->idev;
1541 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1542 struct in_device *loopback_idev =
1543 in_dev_get(dev_net(dev)->loopback_dev);
1544 if (loopback_idev) {
1545 rt->idev = loopback_idev;
/* Link-failure handler: report host-unreachable to the sender and expire
 * the cached route immediately (skb->dst lookup line elided in excerpt). */
1551 static void ipv4_link_failure(struct sk_buff *skb)
1555 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1559 dst_set_expires(&rt->u.dst, 0);
/* Catch-all dst handler for paths that must never be taken (e.g. output on
 * an input-only route): log the flow and drop. Return/kfree elided here. */
1562 static int ip_rt_bug(struct sk_buff *skb)
1564 printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1565 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1566 skb->dev ? skb->dev->name : "?");
1572 We do not cache source address of outgoing interface,
1573 because it is used only by IP RR, TS and SRR options,
1574 so that it out of fast path.
1576 BTW remember: "addr" is allowed to be not aligned
/*
 * Write the preferred source address for @rt into @addr (4 bytes, possibly
 * unaligned -- hence memcpy). Output routes use the cached source (elided
 * branch); input routes re-do the FIB lookup for PREFSRC, falling back to
 * inet_select_addr() toward the gateway.
 */
1580 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1583 struct fib_result res;
1585 if (rt->fl.iif == 0)
1587 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1588 src = FIB_RES_PREFSRC(res);
1591 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1593 memcpy(addr, &src, 4);
1596 #ifdef CONFIG_NET_CLS_ROUTE
/* Merge @tag into the dst's traffic-classifier id: the low 16 bits
 * (destination realm) and high 16 bits (source realm) are each set only
 * if not already assigned. */
1597 static void set_class_tag(struct rtable *rt, u32 tag)
1599 if (!(rt->u.dst.tclassid & 0xFFFF))
1600 rt->u.dst.tclassid |= tag & 0xFFFF;
1601 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1602 rt->u.dst.tclassid |= tag & 0xFFFF0000;
/*
 * Finalize a freshly built cache entry from the FIB lookup result: copy
 * gateway and metrics from the fib_info, fill in default MTU/TTL/ADVMSS
 * metrics, apply classifier tags, and set rt_type.
 * NOTE(review): the "if (fi)" guard and some closing braces are elided in
 * this excerpt; L817 appears to be the no-fib_info fallback MTU path.
 */
1606 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1608 struct fib_info *fi = res->fi;
/* Only adopt the FIB gateway when it is directly reachable (link scope). */
1611 if (FIB_RES_GW(*res) &&
1612 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1613 rt->rt_gateway = FIB_RES_GW(*res);
1614 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1615 sizeof(rt->u.dst.metrics));
1616 if (fi->fib_mtu == 0) {
1617 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
/* Locked MTU on an indirect (gatewayed) route: cap at the classic 576. */
1618 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1619 rt->rt_gateway != rt->rt_dst &&
1620 rt->u.dst.dev->mtu > 576)
1621 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1623 #ifdef CONFIG_NET_CLS_ROUTE
1624 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1627 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
/* Fill defaults for any metric the FIB did not provide, and clamp. */
1629 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1630 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1631 if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
1632 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1633 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1634 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1636 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1637 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1639 #ifdef CONFIG_NET_CLS_ROUTE
1640 #ifdef CONFIG_IP_MULTIPLE_TABLES
1641 set_class_tag(rt, fib_rules_tclass(res));
1643 set_class_tag(rt, itag);
1645 rt->rt_type = res->type;
/*
 * Build and cache an input route for a multicast packet received on @dev.
 * @our: nonzero when the host itself is a member of the group, in which
 * case the route also delivers locally (RTCF_LOCAL/ip_local_deliver).
 * Returns the rt_intern_hash() result; error paths/labels are elided in
 * this excerpt.
 */
1648 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1649 u8 tos, struct net_device *dev, int our)
1654 struct in_device *in_dev = in_dev_get(dev);
1657 /* Primary sanity checks. */
/* Martian sources for multicast: mcast/broadcast/loopback source, or a
 * non-IP frame, can never be valid here. */
1662 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1663 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
/* 0.0.0.0 source is tolerated only for link-local groups (e.g. IGMP
 * from a not-yet-configured host). */
1666 if (ipv4_is_zeronet(saddr)) {
1667 if (!ipv4_is_local_multicast(daddr))
1669 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1670 } else if (fib_validate_source(saddr, 0, tos, 0,
1671 dev, &spec_dst, &itag) < 0)
1674 rth = dst_alloc(&ipv4_dst_ops);
/* Multicast input routes must never be used for output. */
1678 rth->u.dst.output= ip_rt_bug;
1680 atomic_set(&rth->u.dst.__refcnt, 1);
1681 rth->u.dst.flags= DST_HOST;
1682 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1683 rth->u.dst.flags |= DST_NOPOLICY;
1684 rth->fl.fl4_dst = daddr;
1685 rth->rt_dst = daddr;
1686 rth->fl.fl4_tos = tos;
1687 rth->fl.mark = skb->mark;
1688 rth->fl.fl4_src = saddr;
1689 rth->rt_src = saddr;
1690 #ifdef CONFIG_NET_CLS_ROUTE
1691 rth->u.dst.tclassid = itag;
1694 rth->fl.iif = dev->ifindex;
1695 rth->u.dst.dev = init_net.loopback_dev;
1696 dev_hold(rth->u.dst.dev);
1697 rth->idev = in_dev_get(rth->u.dst.dev);
1699 rth->rt_gateway = daddr;
1700 rth->rt_spec_dst= spec_dst;
1701 rth->rt_genid = atomic_read(&rt_genid);
1702 rth->rt_flags = RTCF_MULTICAST;
1703 rth->rt_type = RTN_MULTICAST;
/* Group member on this host: deliver locally as well. */
1705 rth->u.dst.input= ip_local_deliver;
1706 rth->rt_flags |= RTCF_LOCAL;
1709 #ifdef CONFIG_IP_MROUTE
/* Non-link-local group with multicast forwarding enabled: route via
 * the multicast routing engine instead. */
1710 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1711 rth->u.dst.input = ip_mr_input;
1713 RT_CACHE_STAT_INC(in_slow_mc);
1716 hash = rt_hash(daddr, saddr, dev->ifindex, atomic_read(&rt_genid));
1717 return rt_intern_hash(hash, rth, &skb->rtable);
/*
 * Account and (when log_martians is on, ratelimited) log a packet with a
 * martian source address. Per RFC 1812 the MAC header is dumped too, since
 * it is the only hint to where the bogus packet came from.
 * NOTE(review): the daddr/saddr parameter lines and the per-byte printk
 * body are elided in this excerpt.
 */
1729 static void ip_handle_martian_source(struct net_device *dev,
1730 struct in_device *in_dev,
1731 struct sk_buff *skb,
1735 RT_CACHE_STAT_INC(in_martian_src);
1736 #ifdef CONFIG_IP_ROUTE_VERBOSE
1737 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1739 * RFC1812 recommendation, if source is martian,
1740 * the only hint is MAC header.
1742 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1743 NIPQUAD_FMT", on dev %s\n",
1744 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1745 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1747 const unsigned char *p = skb_mac_header(skb);
1748 printk(KERN_WARNING "ll header: ");
1749 for (i = 0; i < dev->hard_header_len; i++, p++) {
1751 if (i < (dev->hard_header_len - 1))
/*
 * Build (but do not hash) a forwarding cache entry for an input route whose
 * FIB lookup resolved to @res. On success *@result holds the new rtable.
 * Validates the source address, decides whether to advise a redirect
 * (RTCF_DOREDIRECT) on shared media, and rejects non-IP (ARP/proxy-arp)
 * cases that would create an invalid route.
 * NOTE(review): error labels, a DNAT/proxy check and returns are elided in
 * this excerpt.
 */
1760 static int __mkroute_input(struct sk_buff *skb,
1761 struct fib_result *res,
1762 struct in_device *in_dev,
1763 __be32 daddr, __be32 saddr, u32 tos,
1764 struct rtable **result)
1769 struct in_device *out_dev;
1774 /* get a working reference to the output device */
1775 out_dev = in_dev_get(FIB_RES_DEV(*res));
1776 if (out_dev == NULL) {
1777 if (net_ratelimit())
1778 printk(KERN_CRIT "Bug in ip_route_input" \
1779 "_slow(). Please, report\n");
1784 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1785 in_dev->dev, &spec_dst, &itag);
1787 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1795 flags |= RTCF_DIRECTSRC;
/* Packet would go back out the interface it arrived on: advise the
 * sender to use the gateway directly (if media/addressing allow it). */
1797 if (out_dev == in_dev && err &&
1798 (IN_DEV_SHARED_MEDIA(out_dev) ||
1799 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1800 flags |= RTCF_DOREDIRECT;
1802 if (skb->protocol != htons(ETH_P_IP)) {
1803 /* Not IP (i.e. ARP). Do not create route, if it is
1804 * invalid for proxy arp. DNAT routes are always valid.
1806 if (out_dev == in_dev) {
1813 rth = dst_alloc(&ipv4_dst_ops);
1819 atomic_set(&rth->u.dst.__refcnt, 1);
1820 rth->u.dst.flags= DST_HOST;
1821 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1822 rth->u.dst.flags |= DST_NOPOLICY;
1823 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1824 rth->u.dst.flags |= DST_NOXFRM;
1825 rth->fl.fl4_dst = daddr;
1826 rth->rt_dst = daddr;
1827 rth->fl.fl4_tos = tos;
1828 rth->fl.mark = skb->mark;
1829 rth->fl.fl4_src = saddr;
1830 rth->rt_src = saddr;
1831 rth->rt_gateway = daddr;
1833 rth->fl.iif = in_dev->dev->ifindex;
1834 rth->u.dst.dev = (out_dev)->dev;
1835 dev_hold(rth->u.dst.dev);
1836 rth->idev = in_dev_get(rth->u.dst.dev);
1838 rth->rt_spec_dst= spec_dst;
/* Forwarding path: standard input/output handlers. */
1840 rth->u.dst.input = ip_forward;
1841 rth->u.dst.output = ip_output;
1842 rth->rt_genid = atomic_read(&rt_genid);
1844 rt_set_nexthop(rth, res, itag);
1846 rth->rt_flags = flags;
1851 /* release the working reference to the output device */
1852 in_dev_put(out_dev);
/*
 * Wrapper around __mkroute_input(): optionally pick a nexthop among
 * multipath alternatives, build the cache entry, then hash it into the
 * route cache (attaching it to skb->rtable via rt_intern_hash).
 */
1856 static int ip_mkroute_input(struct sk_buff *skb,
1857 struct fib_result *res,
1858 const struct flowi *fl,
1859 struct in_device *in_dev,
1860 __be32 daddr, __be32 saddr, u32 tos)
1862 struct rtable* rth = NULL;
1866 #ifdef CONFIG_IP_ROUTE_MULTIPATH
/* Multiple nexthops and no forced oif: let the FIB choose one. */
1867 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1868 fib_select_multipath(fl, res);
1871 /* create a routing cache entry */
1872 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1876 /* put it into the cache */
1877 hash = rt_hash(daddr, saddr, fl->iif, atomic_read(&rt_genid));
1878 return rt_intern_hash(hash, rth, &skb->rtable);
1882 * NOTE. We drop all the packets that has local source
1883 * addresses, because every properly looped back packet
1884 * must have correct destination already attached by output routine.
1886 * Such approach solves two big problems:
1887 * 1. Not simplex devices are handled properly.
1888 * 2. IP spoofing attempts are filtered with 100% of guarantee.
/*
 * Route-cache-miss path for input packets: sanity-check the addresses
 * (martian filtering), do the FIB lookup, and build either a local,
 * broadcast, forwarding or unreachable cache entry. See the block comment
 * above about dropping packets with local source addresses.
 * NOTE(review): many labels (brd, local_input, no_route, e_*), locals and
 * goto targets are elided in this excerpt; comments follow visible code only.
 */
1891 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1892 u8 tos, struct net_device *dev)
1894 struct fib_result res;
1895 struct in_device *in_dev = in_dev_get(dev);
1896 struct flowi fl = { .nl_u = { .ip4_u =
1900 .scope = RT_SCOPE_UNIVERSE,
1903 .iif = dev->ifindex };
1906 struct rtable * rth;
1911 struct net * net = dev_net(dev);
1913 /* IP on this device is disabled. */
1918 /* Check for the most weird martians, which can be not detected
/* Source must be a unicast, non-loopback address. */
1922 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1923 ipv4_is_loopback(saddr))
1924 goto martian_source;
1926 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1929 /* Accept zero addresses only to limited broadcast;
1930 * I even do not know to fix it or not. Waiting for complains :-)
1932 if (ipv4_is_zeronet(saddr))
1933 goto martian_source;
1935 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1936 ipv4_is_loopback(daddr))
1937 goto martian_destination;
1940 * Now we are ready to route packet.
1942 if ((err = fib_lookup(net, &fl, &res)) != 0) {
1943 if (!IN_DEV_FORWARD(in_dev))
1949 RT_CACHE_STAT_INC(in_slow_tot);
1951 if (res.type == RTN_BROADCAST)
/* Destination is one of our own addresses: validate the source against
 * the loopback device and deliver locally. */
1954 if (res.type == RTN_LOCAL) {
1956 result = fib_validate_source(saddr, daddr, tos,
1957 net->loopback_dev->ifindex,
1958 dev, &spec_dst, &itag);
1960 goto martian_source;
1962 flags |= RTCF_DIRECTSRC;
1967 if (!IN_DEV_FORWARD(in_dev))
1969 if (res.type != RTN_UNICAST)
1970 goto martian_destination;
/* Forwarding case: build and cache the route. */
1972 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
/* Broadcast handling (brd label elided): only real IP may be broadcast. */
1980 if (skb->protocol != htons(ETH_P_IP))
1983 if (ipv4_is_zeronet(saddr))
1984 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1986 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1989 goto martian_source;
1991 flags |= RTCF_DIRECTSRC;
1993 flags |= RTCF_BROADCAST;
1994 res.type = RTN_BROADCAST;
1995 RT_CACHE_STAT_INC(in_brd);
/* local_input (label elided): build a locally-delivered cache entry. */
1998 rth = dst_alloc(&ipv4_dst_ops);
2002 rth->u.dst.output= ip_rt_bug;
2003 rth->rt_genid = atomic_read(&rt_genid);
2005 atomic_set(&rth->u.dst.__refcnt, 1);
2006 rth->u.dst.flags= DST_HOST;
2007 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2008 rth->u.dst.flags |= DST_NOPOLICY;
2009 rth->fl.fl4_dst = daddr;
2010 rth->rt_dst = daddr;
2011 rth->fl.fl4_tos = tos;
2012 rth->fl.mark = skb->mark;
2013 rth->fl.fl4_src = saddr;
2014 rth->rt_src = saddr;
2015 #ifdef CONFIG_NET_CLS_ROUTE
2016 rth->u.dst.tclassid = itag;
2019 rth->fl.iif = dev->ifindex;
2020 rth->u.dst.dev = net->loopback_dev;
2021 dev_hold(rth->u.dst.dev);
2022 rth->idev = in_dev_get(rth->u.dst.dev);
2023 rth->rt_gateway = daddr;
2024 rth->rt_spec_dst= spec_dst;
2025 rth->u.dst.input= ip_local_deliver;
2026 rth->rt_flags = flags|RTCF_LOCAL;
/* No route found: cache an entry that generates ICMP errors on input. */
2027 if (res.type == RTN_UNREACHABLE) {
2028 rth->u.dst.input= ip_error;
2029 rth->u.dst.error= -err;
2030 rth->rt_flags &= ~RTCF_LOCAL;
2032 rth->rt_type = res.type;
2033 hash = rt_hash(daddr, saddr, fl.iif, atomic_read(&rt_genid));
2034 err = rt_intern_hash(hash, rth, &skb->rtable);
/* no_route (label elided): fall through to the unreachable entry above. */
2038 RT_CACHE_STAT_INC(in_no_route);
2039 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2040 res.type = RTN_UNREACHABLE;
2046 * Do not cache martian addresses: they should be logged (RFC1812)
2048 martian_destination:
2049 RT_CACHE_STAT_INC(in_martian_dst);
2050 #ifdef CONFIG_IP_ROUTE_VERBOSE
2051 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2052 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2053 NIPQUAD_FMT ", dev %s\n",
2054 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2058 err = -EHOSTUNREACH;
/* martian_source (label elided): account and log, then bail out. */
2070 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
/*
 * Input route lookup: probe the route cache first (under rcu_read_lock_bh,
 * elided here); on a hit, bump usage and attach the dst to the skb. On a
 * miss, handle multicast specially (see the long comment below), otherwise
 * fall through to ip_route_input_slow().
 */
2074 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2075 u8 tos, struct net_device *dev)
2077 struct rtable * rth;
2079 int iif = dev->ifindex;
2083 tos &= IPTOS_RT_MASK;
2084 hash = rt_hash(daddr, saddr, iif, atomic_read(&rt_genid));
2087 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2088 rth = rcu_dereference(rth->u.dst.rt_next)) {
/* XOR/OR trick compares all key fields with a single branch. */
2089 if (((rth->fl.fl4_dst ^ daddr) |
2090 (rth->fl.fl4_src ^ saddr) |
2091 (rth->fl.iif ^ iif) |
2093 (rth->fl.fl4_tos ^ tos)) == 0 &&
2094 rth->fl.mark == skb->mark &&
2095 net_eq(dev_net(rth->u.dst.dev), net) &&
2096 rth->rt_genid == atomic_read(&rt_genid)) {
2097 dst_use(&rth->u.dst, jiffies);
2098 RT_CACHE_STAT_INC(in_hit);
2103 RT_CACHE_STAT_INC(in_hlist_search);
2107 /* Multicast recognition logic is moved from route cache to here.
2108 The problem was that too many Ethernet cards have broken/missing
2109 hardware multicast filters :-( As result the host on multicasting
2110 network acquires a lot of useless route cache entries, sort of
2111 SDR messages from all the world. Now we try to get rid of them.
2112 Really, provided software IP multicast filter is organized
2113 reasonably (at least, hashed), it does not result in a slowdown
2114 comparing with route cache reject entries.
2115 Note, that multicast routers are not affected, because
2116 route cache entry is created eventually.
2118 if (ipv4_is_multicast(daddr)) {
2119 struct in_device *in_dev;
2122 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2123 int our = ip_check_mc(in_dev, daddr, saddr,
2124 ip_hdr(skb)->protocol);
/* Route it if we are a member ("our") or a multicast forwarder. */
2126 #ifdef CONFIG_IP_MROUTE
2127 || (!ipv4_is_local_multicast(daddr) &&
2128 IN_DEV_MFORWARD(in_dev))
2132 return ip_route_input_mc(skb, daddr, saddr,
2139 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
/*
 * Build (but do not hash) an output route cache entry for flow @fl
 * resolved onto @dev_out. Classifies the destination (broadcast /
 * multicast / local), wires up the appropriate input/output handlers, and
 * finalizes metrics via rt_set_nexthop().
 * NOTE(review): several error paths, the flags parameter line and returns
 * are elided in this excerpt.
 */
2142 static int __mkroute_output(struct rtable **result,
2143 struct fib_result *res,
2144 const struct flowi *fl,
2145 const struct flowi *oldflp,
2146 struct net_device *dev_out,
2150 struct in_device *in_dev;
2151 u32 tos = RT_FL_TOS(oldflp);
/* Loopback source is only valid when sending on the loopback device. */
2154 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2157 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2158 res->type = RTN_BROADCAST;
2159 else if (ipv4_is_multicast(fl->fl4_dst))
2160 res->type = RTN_MULTICAST;
2161 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2164 if (dev_out->flags & IFF_LOOPBACK)
2165 flags |= RTCF_LOCAL;
2167 /* get work reference to inet device */
2168 in_dev = in_dev_get(dev_out);
2172 if (res->type == RTN_BROADCAST) {
2173 flags |= RTCF_BROADCAST | RTCF_LOCAL;
/* Broadcast needs no nexthop info; drop the fib reference. */
2175 fib_info_put(res->fi);
2178 } else if (res->type == RTN_MULTICAST) {
2179 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2180 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2182 flags &= ~RTCF_LOCAL;
2183 /* If multicast route do not exist use
2184 default one, but do not gateway in this case.
2187 if (res->fi && res->prefixlen < 4) {
2188 fib_info_put(res->fi);
2194 rth = dst_alloc(&ipv4_dst_ops);
2200 atomic_set(&rth->u.dst.__refcnt, 1);
2201 rth->u.dst.flags= DST_HOST;
2202 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2203 rth->u.dst.flags |= DST_NOXFRM;
2204 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2205 rth->u.dst.flags |= DST_NOPOLICY;
/* Cache key mirrors the caller's original flow, not the resolved one. */
2207 rth->fl.fl4_dst = oldflp->fl4_dst;
2208 rth->fl.fl4_tos = tos;
2209 rth->fl.fl4_src = oldflp->fl4_src;
2210 rth->fl.oif = oldflp->oif;
2211 rth->fl.mark = oldflp->mark;
2212 rth->rt_dst = fl->fl4_dst;
2213 rth->rt_src = fl->fl4_src;
2214 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2215 /* get references to the devices that are to be hold by the routing
2217 rth->u.dst.dev = dev_out;
2219 rth->idev = in_dev_get(dev_out);
2220 rth->rt_gateway = fl->fl4_dst;
2221 rth->rt_spec_dst= fl->fl4_src;
2223 rth->u.dst.output=ip_output;
2224 rth->rt_genid = atomic_read(&rt_genid);
2226 RT_CACHE_STAT_INC(out_slow_tot);
2228 if (flags & RTCF_LOCAL) {
2229 rth->u.dst.input = ip_local_deliver;
2230 rth->rt_spec_dst = fl->fl4_dst;
2232 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2233 rth->rt_spec_dst = fl->fl4_src;
/* Local + non-loopback: loop a copy back via ip_mc_output. */
2234 if (flags & RTCF_LOCAL &&
2235 !(dev_out->flags & IFF_LOOPBACK)) {
2236 rth->u.dst.output = ip_mc_output;
2237 RT_CACHE_STAT_INC(out_slow_mc);
2239 #ifdef CONFIG_IP_MROUTE
2240 if (res->type == RTN_MULTICAST) {
2241 if (IN_DEV_MFORWARD(in_dev) &&
2242 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2243 rth->u.dst.input = ip_mr_input;
2244 rth->u.dst.output = ip_mc_output;
2250 rt_set_nexthop(rth, res, 0);
2252 rth->rt_flags = flags;
2256 /* release work reference to inet device */
/*
 * Wrapper: build the output cache entry with __mkroute_output() and, on
 * success, insert it into the route cache keyed by the caller's original
 * flow, returning the interned route via *@rp.
 */
2262 static int ip_mkroute_output(struct rtable **rp,
2263 struct fib_result *res,
2264 const struct flowi *fl,
2265 const struct flowi *oldflp,
2266 struct net_device *dev_out,
2269 struct rtable *rth = NULL;
2270 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2273 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2274 atomic_read(&rt_genid));
2275 err = rt_intern_hash(hash, rth, rp);
2282 * Major route resolver routine.
/*
 * Major route resolver (cache-miss path for output): validate an explicit
 * source, honor an explicit oif, special-case multicast/broadcast and
 * local destinations, run the FIB lookup, pick multipath/default routes,
 * and hand the result to ip_mkroute_output().
 * NOTE(review): many labels (out, make_route, e_*), device refcount drops
 * and some else-branches are elided in this excerpt.
 */
2285 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2286 const struct flowi *oldflp)
2288 u32 tos = RT_FL_TOS(oldflp);
2289 struct flowi fl = { .nl_u = { .ip4_u =
2290 { .daddr = oldflp->fl4_dst,
2291 .saddr = oldflp->fl4_src,
2292 .tos = tos & IPTOS_RT_MASK,
2293 .scope = ((tos & RTO_ONLINK) ?
2297 .mark = oldflp->mark,
2298 .iif = net->loopback_dev->ifindex,
2299 .oif = oldflp->oif };
2300 struct fib_result res;
2302 struct net_device *dev_out = NULL;
2308 #ifdef CONFIG_IP_MULTIPLE_TABLES
/* Caller supplied a source address: it must be a plausible unicast and
 * actually assigned to this host. */
2312 if (oldflp->fl4_src) {
2314 if (ipv4_is_multicast(oldflp->fl4_src) ||
2315 ipv4_is_lbcast(oldflp->fl4_src) ||
2316 ipv4_is_zeronet(oldflp->fl4_src))
2319 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2320 dev_out = ip_dev_find(net, oldflp->fl4_src);
2321 if (dev_out == NULL)
2324 /* I removed check for oif == dev_out->oif here.
2325 It was wrong for two reasons:
2326 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2327 is assigned to multiple interfaces.
2328 2. Moreover, we are allowed to send packets with saddr
2329 of another iface. --ANK
2332 if (oldflp->oif == 0
2333 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2334 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2335 /* Special hack: user can direct multicasts
2336 and limited broadcast via necessary interface
2337 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2338 This hack is not just for fun, it allows
2339 vic,vat and friends to work.
2340 They bind socket to loopback, set ttl to zero
2341 and expect that it will work.
2342 From the viewpoint of routing cache they are broken,
2343 because we are not allowed to build multicast path
2344 with loopback source addr (look, routing cache
2345 cannot know, that ttl is zero, so that packet
2346 will not leave this host and route is valid).
2347 Luckily, this hack is good workaround.
2350 fl.oif = dev_out->ifindex;
/* Explicit output interface requested. */
2360 dev_out = dev_get_by_index(net, oldflp->oif);
2362 if (dev_out == NULL)
2365 /* RACE: Check return value of inet_select_addr instead. */
2366 if (__in_dev_get_rtnl(dev_out) == NULL) {
2368 goto out; /* Wrong error code */
2371 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2372 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2374 fl.fl4_src = inet_select_addr(dev_out, 0,
2379 if (ipv4_is_multicast(oldflp->fl4_dst))
2380 fl.fl4_src = inet_select_addr(dev_out, 0,
2382 else if (!oldflp->fl4_dst)
2383 fl.fl4_src = inet_select_addr(dev_out, 0,
/* No destination at all: loop the packet back to ourselves. */
2389 fl.fl4_dst = fl.fl4_src;
2391 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2394 dev_out = net->loopback_dev;
2396 fl.oif = net->loopback_dev->ifindex;
2397 res.type = RTN_LOCAL;
2398 flags |= RTCF_LOCAL;
2402 if (fib_lookup(net, &fl, &res)) {
2405 /* Apparently, routing tables are wrong. Assume,
2406 that the destination is on link.
2409 Because we are allowed to send to iface
2410 even if it has NO routes and NO assigned
2411 addresses. When oif is specified, routing
2412 tables are looked up with only one purpose:
2413 to catch if destination is gatewayed, rather than
2414 direct. Moreover, if MSG_DONTROUTE is set,
2415 we send packet, ignoring both routing tables
2416 and ifaddr state. --ANK
2419 We could make it even if oif is unknown,
2420 likely IPv6, but we do not.
2423 if (fl.fl4_src == 0)
2424 fl.fl4_src = inet_select_addr(dev_out, 0,
2426 res.type = RTN_UNICAST;
/* Destination is one of our own addresses: route via loopback. */
2436 if (res.type == RTN_LOCAL) {
2438 fl.fl4_src = fl.fl4_dst;
2441 dev_out = net->loopback_dev;
2443 fl.oif = dev_out->ifindex;
2445 fib_info_put(res.fi);
2447 flags |= RTCF_LOCAL;
2451 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2452 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2453 fib_select_multipath(&fl, &res);
/* Zero-length prefix match on a unicast route: consult default routes. */
2456 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2457 fib_select_default(net, &fl, &res);
2460 fl.fl4_src = FIB_RES_PREFSRC(res);
2464 dev_out = FIB_RES_DEV(res);
2466 fl.oif = dev_out->ifindex;
2470 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
/*
 * Output route lookup without xfrm policy resolution: probe the route
 * cache under rcu_read_lock_bh() (lock acquisition elided in excerpt);
 * on a hit, return the cached route via *@rp, otherwise fall back to
 * ip_route_output_slow().
 */
2480 int __ip_route_output_key(struct net *net, struct rtable **rp,
2481 const struct flowi *flp)
2486 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif,
2487 atomic_read(&rt_genid));
2490 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2491 rth = rcu_dereference(rth->u.dst.rt_next)) {
2492 if (rth->fl.fl4_dst == flp->fl4_dst &&
2493 rth->fl.fl4_src == flp->fl4_src &&
2495 rth->fl.oif == flp->oif &&
2496 rth->fl.mark == flp->mark &&
/* TOS comparison ignores bits outside IPTOS_RT_MASK|RTO_ONLINK. */
2497 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2498 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2499 net_eq(dev_net(rth->u.dst.dev), net) &&
2500 rth->rt_genid == atomic_read(&rt_genid)) {
2501 dst_use(&rth->u.dst, jiffies);
2502 RT_CACHE_STAT_INC(out_hit);
2503 rcu_read_unlock_bh();
2507 RT_CACHE_STAT_INC(out_hlist_search);
2509 rcu_read_unlock_bh();
2511 return ip_route_output_slow(net, rp, flp);
2514 EXPORT_SYMBOL_GPL(__ip_route_output_key);
/* Blackhole dst ops: used for routes returned when xfrm policy resolution
 * says the traffic must not flow. PMTU updates are deliberately a no-op. */
2516 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2520 static struct dst_ops ipv4_dst_blackhole_ops = {
2522 .protocol = __constant_htons(ETH_P_IP),
2523 .destroy = ipv4_dst_destroy,
2524 .check = ipv4_dst_check,
2525 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2526 .entry_size = sizeof(struct rtable),
2527 .entries = ATOMIC_INIT(0),
/*
 * Replace *@rp with a blackhole clone of itself: same routing metadata,
 * but input/output both discard packets. Used when __xfrm_lookup() returns
 * -EREMOTE. Always releases the original route; returns -ENOMEM if the
 * clone could not be allocated.
 */
2531 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
2533 struct rtable *ort = *rp;
2534 struct rtable *rt = (struct rtable *)
2535 dst_alloc(&ipv4_dst_blackhole_ops);
2538 struct dst_entry *new = &rt->u.dst;
2540 atomic_set(&new->__refcnt, 1);
/* Both directions silently drop. */
2542 new->input = dst_discard;
2543 new->output = dst_discard;
2544 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2546 new->dev = ort->u.dst.dev;
2552 rt->idev = ort->idev;
2554 in_dev_hold(rt->idev);
2555 rt->rt_genid = atomic_read(&rt_genid);
2556 rt->rt_flags = ort->rt_flags;
2557 rt->rt_type = ort->rt_type;
2558 rt->rt_dst = ort->rt_dst;
2559 rt->rt_src = ort->rt_src;
2560 rt->rt_iif = ort->rt_iif;
2561 rt->rt_gateway = ort->rt_gateway;
2562 rt->rt_spec_dst = ort->rt_spec_dst;
2563 rt->peer = ort->peer;
2565 atomic_inc(&rt->peer->refcnt);
/* Drop the original; caller now owns the blackhole (if any). */
2570 dst_release(&(*rp)->u.dst);
2572 return (rt ? 0 : -ENOMEM);
/*
 * Output route lookup with xfrm (IPsec) policy resolution. After the plain
 * key lookup, fills any wildcard flow addresses from the resolved route
 * and runs __xfrm_lookup(); an -EREMOTE result is converted into a
 * blackhole route so the caller still gets a usable (discarding) dst.
 * @flags nonzero means the xfrm lookup may sleep waiting for SAs.
 */
2575 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2576 struct sock *sk, int flags)
2580 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2585 flp->fl4_src = (*rp)->rt_src;
2587 flp->fl4_dst = (*rp)->rt_dst;
2588 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2589 flags ? XFRM_LOOKUP_WAIT : 0);
2590 if (err == -EREMOTE)
2591 err = ipv4_dst_blackhole(rp, flp);
2599 EXPORT_SYMBOL_GPL(ip_route_output_flow);
/* Convenience wrapper: output route lookup with no socket context and
 * non-blocking xfrm resolution. */
2601 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2603 return ip_route_output_flow(net, rp, flp, NULL, 0);
/*
 * Serialize skb->rtable into a netlink RTM message appended to @skb.
 * Returns the nlmsg_end() result, or -EMSGSIZE-style failure via the
 * nla_put_failure path. @nowait is passed to ipmr_get_route() for the
 * unresolved-multicast case.
 * NOTE(review): some branches (iif handling, error-assignment lines) are
 * elided in this excerpt.
 */
2606 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2607 int nowait, unsigned int flags)
2609 struct rtable *rt = skb->rtable;
2611 struct nlmsghdr *nlh;
2613 u32 id = 0, ts = 0, tsage = 0, error;
2615 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2619 r = nlmsg_data(nlh);
2620 r->rtm_family = AF_INET;
2621 r->rtm_dst_len = 32;
2623 r->rtm_tos = rt->fl.fl4_tos;
2624 r->rtm_table = RT_TABLE_MAIN;
2625 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2626 r->rtm_type = rt->rt_type;
2627 r->rtm_scope = RT_SCOPE_UNIVERSE;
2628 r->rtm_protocol = RTPROT_UNSPEC;
/* Cache entries are always reported as cloned routes. */
2629 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2630 if (rt->rt_flags & RTCF_NOTIFY)
2631 r->rtm_flags |= RTM_F_NOTIFY;
2633 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2635 if (rt->fl.fl4_src) {
2636 r->rtm_src_len = 32;
2637 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2640 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2641 #ifdef CONFIG_NET_CLS_ROUTE
2642 if (rt->u.dst.tclassid)
2643 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2646 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2647 else if (rt->rt_src != rt->fl.fl4_src)
2648 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2650 if (rt->rt_dst != rt->rt_gateway)
2651 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2653 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2654 goto nla_put_failure;
2656 error = rt->u.dst.error;
2657 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
/* Peer info feeds the cacheinfo block: IP id counter and TCP timestamps. */
2659 id = rt->peer->ip_id_count;
2660 if (rt->peer->tcp_ts_stamp) {
2661 ts = rt->peer->tcp_ts;
2662 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2667 #ifdef CONFIG_IP_MROUTE
2668 __be32 dst = rt->rt_dst;
/* Unresolved multicast may need the mroute engine to fill the answer. */
2670 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2671 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2672 int err = ipmr_get_route(skb, r, nowait);
2677 goto nla_put_failure;
2679 if (err == -EMSGSIZE)
2680 goto nla_put_failure;
2686 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2689 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2690 expires, error) < 0)
2691 goto nla_put_failure;
2693 return nlmsg_end(skb, nlh);
2696 nlmsg_cancel(skb, nlh);
/*
 * RTM_GETROUTE handler: parse the request, perform either an input-side
 * (RTA_IIF given: simulate reception via ip_route_input) or output-side
 * (ip_route_output_key) lookup, serialize the result with rt_fill_info()
 * and unicast it back to the requester.
 * NOTE(review): error labels (errout, errout_free) and some cleanup lines
 * are elided in this excerpt.
 */
2700 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2702 struct net *net = sock_net(in_skb->sk);
2704 struct nlattr *tb[RTA_MAX+1];
2705 struct rtable *rt = NULL;
2710 struct sk_buff *skb;
2712 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2716 rtm = nlmsg_data(nlh);
2718 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2724 /* Reserve room for dummy headers, this skb can pass
2725 through good chunk of routing engine.
2727 skb_reset_mac_header(skb);
2728 skb_reset_network_header(skb);
2730 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2731 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2732 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2734 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2735 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2736 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
/* Input-side query: run the packet through the input routing path. */
2739 struct net_device *dev;
2741 dev = __dev_get_by_index(net, iif);
2747 skb->protocol = htons(ETH_P_IP);
2750 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2754 if (err == 0 && rt->u.dst.error)
2755 err = -rt->u.dst.error;
/* Output-side query: build a flow from the attributes and resolve it. */
2762 .tos = rtm->rtm_tos,
2765 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2767 err = ip_route_output_key(net, &rt, &fl);
2774 if (rtm->rtm_flags & RTM_F_NOTIFY)
2775 rt->rt_flags |= RTCF_NOTIFY;
2777 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2778 RTM_NEWROUTE, 0, 0);
2782 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * Netlink dump callback: walk every route-cache hash bucket and emit one
 * RTM_NEWROUTE per live entry, resuming from cb->args[] on multi-part
 * dumps. Entries from other namespaces or stale generations are skipped.
 * NOTE(review): the cb->args bookkeeping at entry/exit is partly elided.
 */
2791 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2798 net = sock_net(skb->sk);
2803 s_idx = idx = cb->args[1];
2804 for (h = s_h; h <= rt_hash_mask; h++) {
2806 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2807 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2808 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2810 if (rt->rt_genid != atomic_read(&rt_genid))
/* rt_fill_info() reads skb->rtable via skb->dst; hold it while filling. */
2812 skb->dst = dst_clone(&rt->u.dst);
2813 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2814 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2815 1, NLM_F_MULTI) <= 0) {
2816 dst_release(xchg(&skb->dst, NULL));
2817 rcu_read_unlock_bh();
2820 dst_release(xchg(&skb->dst, NULL));
2822 rcu_read_unlock_bh();
/* Multicast configuration changed on @in_dev: flush that namespace's
 * route cache immediately so stale multicast entries disappear. */
2832 void ip_rt_multicast_event(struct in_device *in_dev)
2834 rt_cache_flush(dev_net(in_dev->dev), 0);
2837 #ifdef CONFIG_SYSCTL
/*
 * proc handler for /proc/sys/net/ipv4/route/flush (write-only): parse the
 * written delay through proc_dointvec under a mutex (the shared ctl->data
 * swap is why the mutex exists) and flush the owning namespace's cache.
 * NOTE(review): flush_delay declaration and the read-path early return are
 * elided in this excerpt.
 */
2838 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2839 struct file *filp, void __user *buffer,
2840 size_t *lenp, loff_t *ppos)
2845 static DEFINE_MUTEX(flush_mutex);
2847 mutex_lock(&flush_mutex);
2848 ctl->data = &flush_delay;
2849 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2851 mutex_unlock(&flush_mutex);
2853 net = (struct net *)ctl->extra1;
2854 rt_cache_flush(net, flush_delay);
/*
 * sysctl(2) binary-interface strategy for the flush knob: read one int
 * (the flush delay) from the new value and flush the namespace's route
 * cache. Rejects writes that are not exactly sizeof(int).
 */
2861 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2864 void __user *oldval,
2865 size_t __user *oldlenp,
2866 void __user *newval,
2871 if (newlen != sizeof(int))
2873 if (get_user(delay, (int __user *)newval))
2875 net = (struct net *)table->extra1;
2876 rt_cache_flush(net, delay);
/*
 * Global (non-per-namespace) sysctl knobs under /proc/sys/net/ipv4/route/.
 * Each entry binds a procname to the backing ip_rt_* tunable; jiffies-valued
 * entries use proc_dointvec_jiffies/_ms_jiffies plus the matching binary
 * sysctl strategy so values are exposed in seconds/milliseconds.
 * NOTE(review): the extract drops the per-entry braces, .mode fields and
 * the terminating empty entry of the array.
 */
2880 ctl_table ipv4_route_table[] = {
/* Garbage-collection trigger threshold for the dst cache. */
2882 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2883 .procname = "gc_thresh",
2884 .data = &ipv4_dst_ops.gc_thresh,
2885 .maxlen = sizeof(int),
2887 .proc_handler = &proc_dointvec,
/* Hard cap on the number of cached routes. */
2890 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2891 .procname = "max_size",
2892 .data = &ip_rt_max_size,
2893 .maxlen = sizeof(int),
2895 .proc_handler = &proc_dointvec,
2898 /* Deprecated. Use gc_min_interval_ms */
2900 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2901 .procname = "gc_min_interval",
2902 .data = &ip_rt_gc_min_interval,
2903 .maxlen = sizeof(int),
2905 .proc_handler = &proc_dointvec_jiffies,
2906 .strategy = &sysctl_jiffies,
/* Millisecond view onto the same ip_rt_gc_min_interval variable. */
2909 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2910 .procname = "gc_min_interval_ms",
2911 .data = &ip_rt_gc_min_interval,
2912 .maxlen = sizeof(int),
2914 .proc_handler = &proc_dointvec_ms_jiffies,
2915 .strategy = &sysctl_ms_jiffies,
2918 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2919 .procname = "gc_timeout",
2920 .data = &ip_rt_gc_timeout,
2921 .maxlen = sizeof(int),
2923 .proc_handler = &proc_dointvec_jiffies,
2924 .strategy = &sysctl_jiffies,
2927 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2928 .procname = "gc_interval",
2929 .data = &ip_rt_gc_interval,
2930 .maxlen = sizeof(int),
2932 .proc_handler = &proc_dointvec_jiffies,
2933 .strategy = &sysctl_jiffies,
/* ICMP-redirect rate limiting (token-bucket style load/number/silence). */
2936 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2937 .procname = "redirect_load",
2938 .data = &ip_rt_redirect_load,
2939 .maxlen = sizeof(int),
2941 .proc_handler = &proc_dointvec,
2944 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2945 .procname = "redirect_number",
2946 .data = &ip_rt_redirect_number,
2947 .maxlen = sizeof(int),
2949 .proc_handler = &proc_dointvec,
2952 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2953 .procname = "redirect_silence",
2954 .data = &ip_rt_redirect_silence,
2955 .maxlen = sizeof(int),
2957 .proc_handler = &proc_dointvec,
/* ICMP error generation rate limiting. */
2960 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2961 .procname = "error_cost",
2962 .data = &ip_rt_error_cost,
2963 .maxlen = sizeof(int),
2965 .proc_handler = &proc_dointvec,
2968 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2969 .procname = "error_burst",
2970 .data = &ip_rt_error_burst,
2971 .maxlen = sizeof(int),
2973 .proc_handler = &proc_dointvec,
2976 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2977 .procname = "gc_elasticity",
2978 .data = &ip_rt_gc_elasticity,
2979 .maxlen = sizeof(int),
2981 .proc_handler = &proc_dointvec,
/* Path-MTU discovery tunables. */
2984 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2985 .procname = "mtu_expires",
2986 .data = &ip_rt_mtu_expires,
2987 .maxlen = sizeof(int),
2989 .proc_handler = &proc_dointvec_jiffies,
2990 .strategy = &sysctl_jiffies,
2993 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2994 .procname = "min_pmtu",
2995 .data = &ip_rt_min_pmtu,
2996 .maxlen = sizeof(int),
2998 .proc_handler = &proc_dointvec,
3001 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3002 .procname = "min_adv_mss",
3003 .data = &ip_rt_min_advmss,
3004 .maxlen = sizeof(int),
3006 .proc_handler = &proc_dointvec,
/* Interval at which the hash secret is regenerated (cache rebuild). */
3009 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3010 .procname = "secret_interval",
3011 .data = &ip_rt_secret_interval,
3012 .maxlen = sizeof(int),
3014 .proc_handler = &proc_dointvec_jiffies,
3015 .strategy = &sysctl_jiffies,
/*
 * Directory path net/ipv4/route under which the per-namespace flush
 * table below is registered.  NOTE(review): terminating empty entry
 * and closing brace are missing from this extract.
 */
3020 static __net_initdata struct ctl_path ipv4_route_path[] = {
3021 { .procname = "net", .ctl_name = CTL_NET, },
3022 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3023 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
/*
 * Template for the per-namespace "flush" sysctl.  sysctl_route_net_init()
 * duplicates this table for non-init namespaces and stores the owning
 * struct net in extra1, which the flush handlers read back.
 * NOTE(review): .mode, extra1 initialization line and the array
 * terminator are not visible in this extract.
 */
3028 static struct ctl_table ipv4_route_flush_table[] = {
3030 .ctl_name = NET_IPV4_ROUTE_FLUSH,
3031 .procname = "flush",
3032 .maxlen = sizeof(int),
3034 .proc_handler = &ipv4_sysctl_rtcache_flush,
3035 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
/*
 * Per-namespace sysctl setup: register net/ipv4/route/flush for @net.
 * init_net uses the static template directly; other namespaces get a
 * kmemdup()'d private copy whose extra1 is pointed at their struct net.
 *
 * NOTE(review): the extract omits the kmemdup() NULL check, the error
 * labels that free the duplicate on registration failure, and the
 * return statements.
 */
3040 static __net_init int sysctl_route_net_init(struct net *net)
3042 struct ctl_table *tbl;
3044 tbl = ipv4_route_flush_table;
3045 if (net != &init_net) {
3046 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
/* Hand the flush handlers their owning namespace via extra1. */
3050 tbl[0].extra1 = net;
3052 net->ipv4.route_hdr =
3053 register_net_sysctl_table(net, ipv4_route_path, tbl);
3054 if (net->ipv4.route_hdr == NULL)
/* Only a duplicated (non-template) table would need freeing here. */
3059 if (tbl != ipv4_route_flush_table)
/*
 * Per-namespace sysctl teardown: unregister the header and free the
 * duplicated table.  The BUG_ON asserts we never free the shared static
 * template (init_net is torn down last / not at all in this scheme).
 * NOTE(review): the kfree(tbl) call is not visible in this extract.
 */
3065 static __net_exit void sysctl_route_net_exit(struct net *net)
3067 struct ctl_table *tbl;
3069 tbl = net->ipv4.route_hdr->ctl_table_arg;
3070 unregister_net_sysctl_table(net->ipv4.route_hdr);
3071 BUG_ON(tbl == ipv4_route_flush_table);
/* pernet hooks tying the flush sysctl to namespace create/destroy. */
3075 static __net_initdata struct pernet_operations sysctl_route_ops = {
3076 .init = sysctl_route_net_init,
3077 .exit = sysctl_route_net_exit,
/*
 * Arm the per-namespace timer that periodically rebuilds the route-cache
 * hash secret (defends hash-collision attacks).  First expiry is randomized
 * into [interval, 2*interval) so namespaces don't all fire in lockstep.
 * Deferrable: firing may be delayed to the next non-idle tick.
 */
3082 static __net_init int rt_secret_timer_init(struct net *net)
3084 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3085 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3086 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3088 net->ipv4.rt_secret_timer.expires =
3089 jiffies + net_random() % ip_rt_secret_interval +
3090 ip_rt_secret_interval;
3091 add_timer(&net->ipv4.rt_secret_timer);
/* Synchronously stop the secret-rebuild timer on namespace teardown. */
3095 static __net_exit void rt_secret_timer_exit(struct net *net)
3097 del_timer_sync(&net->ipv4.rt_secret_timer);
/* pernet hooks for the secret-rebuild timer lifecycle. */
3100 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3101 .init = rt_secret_timer_init,
3102 .exit = rt_secret_timer_exit,
/*
 * Per-CPU route classifier accounting buffer, allocated in ip_rt_init()
 * (256 entries per CPU — presumably indexed by realm/class id; confirm
 * against the CONFIG_NET_CLS_ROUTE users).
 */
3106 #ifdef CONFIG_NET_CLS_ROUTE
3107 struct ip_rt_acct *ip_rt_acct __read_mostly;
3108 #endif /* CONFIG_NET_CLS_ROUTE */
/*
 * Boot-time override for the route hash size: "rhash_entries=N" on the
 * kernel command line, consumed by alloc_large_system_hash() in
 * ip_rt_init().  NOTE(review): the "return 1" success line of the parser
 * is not visible in this extract.
 */
3110 static __initdata unsigned long rhash_entries;
3111 static int __init set_rhash_entries(char *str)
3115 rhash_entries = simple_strtoul(str, &str, 0);
3118 __setup("rhash_entries=", set_rhash_entries);
/*
 * Boot-time initialization of the IPv4 routing subsystem:
 * seed the cache generation id, allocate the dst slab and route hash
 * table, size GC thresholds from the hash size, start the periodic GC
 * work and the secret-rebuild timer, register /proc and netlink hooks.
 *
 * NOTE(review): the extract omits several lines — the ip_rt_acct NULL
 * check condition, most alloc_large_system_hash() arguments, the
 * devinet/ip_fib init calls and the final return.
 */
3120 int __init ip_rt_init(void)
/* Randomize the initial cache generation so stale entries can't match. */
3124 atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3125 (jiffies ^ (jiffies >> 7))));
3127 #ifdef CONFIG_NET_CLS_ROUTE
3128 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3130 panic("IP: failed to allocate ip_rt_acct\n");
/* SLAB_PANIC: no NULL check needed, boot fails hard on OOM. */
3133 ipv4_dst_ops.kmem_cachep =
3134 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3135 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3137 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
/* Hash table sized from available memory; rhash_entries= overrides. */
3139 rt_hash_table = (struct rt_hash_bucket *)
3140 alloc_large_system_hash("IP route cache",
3141 sizeof(struct rt_hash_bucket),
3143 (num_physpages >= 128 * 1024) ?
3149 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3150 rt_hash_lock_init();
/* Derive GC threshold and max cache size from the bucket count. */
3152 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3153 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3158 /* All the timers, started at system startup tend
3159 to synchronize. Perturb it a bit.
3161 schedule_delayed_work(&expires_work,
3162 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
/* Both failures are survivable, hence warn-and-continue. */
3164 if (register_pernet_subsys(&rt_secret_timer_ops))
3165 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3167 if (ip_rt_proc_init())
3168 printk(KERN_ERR "Unable to create route proc files\n");
/* RTM_GETROUTE netlink handler (no dump callback registered here). */
3173 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3175 #ifdef CONFIG_SYSCTL
3176 register_pernet_subsys(&sysctl_route_ops);
/* Symbols exported for use by loadable modules. */
3181 EXPORT_SYMBOL(__ip_select_ident);
3182 EXPORT_SYMBOL(ip_route_input);
3183 EXPORT_SYMBOL(ip_route_output_key);