net: add a noref bit on skb dst
[pandora-kernel.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
/*
 * Mask the fl4_tos member of the flow key pointed to by @oldflp down to
 * IPTOS_RT_MASK plus the RTO_ONLINK flag, as a u32.  The argument is
 * parenthesized so that any pointer expression can be passed safely.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

/* Default value of ip_rt_gc_timeout: 300 seconds expressed in jiffies. */
#define RT_GC_TIMEOUT (300*HZ)
118
/*
 * Route cache tunables.  Values built from HZ are durations in jiffies.
 */
static int ip_rt_max_size;
/* Timeout used by rt_check_expire() when ageing cache entries. */
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
/* Delay with which rt_worker_func() re-arms expires_work. */
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
/* Same literals as above: (HZ / 50) shifted left by (9 + 1). */
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
/* Lower bound for rt_chain_length_max when rt_check_expire() updates it. */
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
/* Recomputed by rt_check_expire() from the observed chain lengths. */
static int rt_chain_length_max __read_mostly	= 20;

/* Delayed work that runs rt_worker_func(). */
static struct delayed_work expires_work;
/* Value of jiffies at the previous rt_check_expire() run. */
static unsigned long expires_ljiffies;
136
137 /*
138  *      Interface to generic destination cache.
139  */
140
/* Forward declarations of the handlers installed in ipv4_dst_ops. */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
149
150
/*
 * dst_ops instance for IPv4 (AF_INET / ETH_P_IP).  It wires the handlers
 * declared earlier in this file, plus __ip_local_out, into the generic
 * destination cache; the entry counter starts at zero.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entries =		ATOMIC_INIT(0),
};
164
/* ECN_OR_COST(class) simply expands to TC_PRIO_<class>. */
#define ECN_OR_COST(class)	TC_PRIO_##class

/*
 * 16-entry table of TC_PRIO_* values.  Odd slots are spelled through
 * ECN_OR_COST(); even slots name the priority directly.
 * NOTE(review): presumably indexed by the TOS bits of a packet; the
 * lookup code is not in this part of the file - confirm against callers.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
185
186
187 /*
188  * Route cache.
189  */
190
191 /* The locking scheme is rather straight forward:
192  *
193  * 1) Read-Copy Update protects the buckets of the central route hash.
194  * 2) Only writers remove entries, and they hold the lock
195  *    as they look at rtable reference counts.
196  * 3) Only readers acquire references to rtable entries,
197  *    they do so with atomic increments and with the
198  *    lock held.
199  */
200
/* One slot of the route hash table: head of a chain linked via u.dst.rt_next. */
struct rt_hash_bucket {
	struct rtable	*chain;
};
204
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Address of the spinlock that guards hash slot @slot.  Several slots share
 * one lock: the slot number is reduced modulo the (power-of-two) table size.
 * The expansion is fully parenthesized so it is safe in any expression.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate the lock table and initialise every spinlock in it.
 * Panics if the allocation fails.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(*rt_hash_locks) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: the lock address is always NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
250
/* The route hash table; slots are indexed 0..rt_hash_mask. */
static struct rt_hash_bucket	*rt_hash_table __read_mostly;
/* Mask applied to hash values by rt_hash(); also the highest slot index. */
static unsigned			rt_hash_mask __read_mostly;
/* Shift count used by rt_check_expire() to scale its scan goal. */
static unsigned int		rt_hash_log  __read_mostly;

/* Per-CPU route cache statistics, reported by rt_cpu_seq_show(). */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Increment one counter of the current CPU's rt_cache_stat. */
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)
258
259 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
260                                    int genid)
261 {
262         return jhash_3words((__force u32)daddr, (__force u32)saddr,
263                             idx, genid)
264                 & rt_hash_mask;
265 }
266
267 static inline int rt_genid(struct net *net)
268 {
269         return atomic_read(&net->ipv4.rt_genid);
270 }
271
272 #ifdef CONFIG_PROC_FS
/* Private seq_file state for the "rt_cache" dump. */
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;	/* hash slot currently being walked (counts down) */
	int genid;	/* generation id sampled in rt_cache_seq_start() */
};
278
279 static struct rtable *rt_cache_get_first(struct seq_file *seq)
280 {
281         struct rt_cache_iter_state *st = seq->private;
282         struct rtable *r = NULL;
283
284         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285                 if (!rt_hash_table[st->bucket].chain)
286                         continue;
287                 rcu_read_lock_bh();
288                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
289                 while (r) {
290                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
291                             r->rt_genid == st->genid)
292                                 return r;
293                         r = rcu_dereference_bh(r->u.dst.rt_next);
294                 }
295                 rcu_read_unlock_bh();
296         }
297         return r;
298 }
299
300 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
301                                           struct rtable *r)
302 {
303         struct rt_cache_iter_state *st = seq->private;
304
305         r = r->u.dst.rt_next;
306         while (!r) {
307                 rcu_read_unlock_bh();
308                 do {
309                         if (--st->bucket < 0)
310                                 return NULL;
311                 } while (!rt_hash_table[st->bucket].chain);
312                 rcu_read_lock_bh();
313                 r = rt_hash_table[st->bucket].chain;
314         }
315         return rcu_dereference_bh(r);
316 }
317
318 static struct rtable *rt_cache_get_next(struct seq_file *seq,
319                                         struct rtable *r)
320 {
321         struct rt_cache_iter_state *st = seq->private;
322         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
323                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
324                         continue;
325                 if (r->rt_genid == st->genid)
326                         break;
327         }
328         return r;
329 }
330
331 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
332 {
333         struct rtable *r = rt_cache_get_first(seq);
334
335         if (r)
336                 while (pos && (r = rt_cache_get_next(seq, r)))
337                         --pos;
338         return pos ? NULL : r;
339 }
340
341 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
342 {
343         struct rt_cache_iter_state *st = seq->private;
344         if (*pos)
345                 return rt_cache_get_idx(seq, *pos - 1);
346         st->genid = rt_genid(seq_file_net(seq));
347         return SEQ_START_TOKEN;
348 }
349
350 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
351 {
352         struct rtable *r;
353
354         if (v == SEQ_START_TOKEN)
355                 r = rt_cache_get_first(seq);
356         else
357                 r = rt_cache_get_next(seq, v);
358         ++*pos;
359         return r;
360 }
361
362 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
363 {
364         if (v && v != SEQ_START_TOKEN)
365                 rcu_read_unlock_bh();
366 }
367
368 static int rt_cache_seq_show(struct seq_file *seq, void *v)
369 {
370         if (v == SEQ_START_TOKEN)
371                 seq_printf(seq, "%-127s\n",
372                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
373                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
374                            "HHUptod\tSpecDst");
375         else {
376                 struct rtable *r = v;
377                 int len;
378
379                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
380                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
381                         r->u.dst.dev ? r->u.dst.dev->name : "*",
382                         (__force u32)r->rt_dst,
383                         (__force u32)r->rt_gateway,
384                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
385                         r->u.dst.__use, 0, (__force u32)r->rt_src,
386                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
387                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
388                         dst_metric(&r->u.dst, RTAX_WINDOW),
389                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
390                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
391                         r->fl.fl4_tos,
392                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
393                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
394                                        dev_queue_xmit) : 0,
395                         r->rt_spec_dst, &len);
396
397                 seq_printf(seq, "%*s\n", 127 - len, "");
398         }
399         return 0;
400 }
401
/* Iterator callbacks for the "rt_cache" dump. */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
408
409 static int rt_cache_seq_open(struct inode *inode, struct file *file)
410 {
411         return seq_open_net(inode, file, &rt_cache_seq_ops,
412                         sizeof(struct rt_cache_iter_state));
413 }
414
/* File operations for the "rt_cache" dump (per-namespace seq_file). */
static const struct file_operations rt_cache_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt_cache_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
422
423
424 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
425 {
426         int cpu;
427
428         if (*pos == 0)
429                 return SEQ_START_TOKEN;
430
431         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
432                 if (!cpu_possible(cpu))
433                         continue;
434                 *pos = cpu+1;
435                 return &per_cpu(rt_cache_stat, cpu);
436         }
437         return NULL;
438 }
439
440 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
441 {
442         int cpu;
443
444         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
445                 if (!cpu_possible(cpu))
446                         continue;
447                 *pos = cpu+1;
448                 return &per_cpu(rt_cache_stat, cpu);
449         }
450         return NULL;
451
452 }
453
/* seq_file ->stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
458
459 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
460 {
461         struct rt_cache_stat *st = v;
462
463         if (v == SEQ_START_TOKEN) {
464                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
465                 return 0;
466         }
467
468         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
469                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
470                    atomic_read(&ipv4_dst_ops.entries),
471                    st->in_hit,
472                    st->in_slow_tot,
473                    st->in_slow_mc,
474                    st->in_no_route,
475                    st->in_brd,
476                    st->in_martian_dst,
477                    st->in_martian_src,
478
479                    st->out_hit,
480                    st->out_slow_tot,
481                    st->out_slow_mc,
482
483                    st->gc_total,
484                    st->gc_ignored,
485                    st->gc_goal_miss,
486                    st->gc_dst_overflow,
487                    st->in_hlist_search,
488                    st->out_hlist_search
489                 );
490         return 0;
491 }
492
/* Iterator callbacks for the per-CPU route cache statistics file. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
499
500
501 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
502 {
503         return seq_open(file, &rt_cpu_seq_ops);
504 }
505
/* File operations for the per-CPU route cache statistics file. */
static const struct file_operations rt_cpu_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt_cpu_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
513
514 #ifdef CONFIG_NET_CLS_ROUTE
515 static int rt_acct_proc_show(struct seq_file *m, void *v)
516 {
517         struct ip_rt_acct *dst, *src;
518         unsigned int i, j;
519
520         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
521         if (!dst)
522                 return -ENOMEM;
523
524         for_each_possible_cpu(i) {
525                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
526                 for (j = 0; j < 256; j++) {
527                         dst[j].o_bytes   += src[j].o_bytes;
528                         dst[j].o_packets += src[j].o_packets;
529                         dst[j].i_bytes   += src[j].i_bytes;
530                         dst[j].i_packets += src[j].i_packets;
531                 }
532         }
533
534         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
535         kfree(dst);
536         return 0;
537 }
538
539 static int rt_acct_proc_open(struct inode *inode, struct file *file)
540 {
541         return single_open(file, rt_acct_proc_show, NULL);
542 }
543
/* File operations for the "rt_acct" accounting dump. */
static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
551 #endif
552
553 static int __net_init ip_rt_do_proc_init(struct net *net)
554 {
555         struct proc_dir_entry *pde;
556
557         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
558                         &rt_cache_seq_fops);
559         if (!pde)
560                 goto err1;
561
562         pde = proc_create("rt_cache", S_IRUGO,
563                           net->proc_net_stat, &rt_cpu_seq_fops);
564         if (!pde)
565                 goto err2;
566
567 #ifdef CONFIG_NET_CLS_ROUTE
568         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
569         if (!pde)
570                 goto err3;
571 #endif
572         return 0;
573
574 #ifdef CONFIG_NET_CLS_ROUTE
575 err3:
576         remove_proc_entry("rt_cache", net->proc_net_stat);
577 #endif
578 err2:
579         remove_proc_entry("rt_cache", net->proc_net);
580 err1:
581         return -ENOMEM;
582 }
583
/* Per-namespace exit: remove the proc entries made by ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_NET_CLS_ROUTE
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
592
/* Per-namespace hooks that create and remove the route cache proc files. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
597
598 static int __init ip_rt_proc_init(void)
599 {
600         return register_pernet_subsys(&ip_rt_proc_ops);
601 }
602
603 #else
/* Stub used when CONFIG_PROC_FS is off: nothing to register, report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
608 #endif /* CONFIG_PROC_FS */
609
610 static inline void rt_free(struct rtable *rt)
611 {
612         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
613 }
614
/*
 * Drop a reference on @rt with ip_rt_put() and then queue the entry for
 * release.  The release step is delegated to rt_free() so that both
 * helpers always use the same deferred-free path.
 */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
620
621 static inline int rt_fast_clean(struct rtable *rth)
622 {
623         /* Kill broadcast/multicast entries very aggresively, if they
624            collide in hash table with more useful entries */
625         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
626                 rth->fl.iif && rth->u.dst.rt_next;
627 }
628
629 static inline int rt_valuable(struct rtable *rth)
630 {
631         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
632                 rth->u.dst.expires;
633 }
634
635 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
636 {
637         unsigned long age;
638         int ret = 0;
639
640         if (atomic_read(&rth->u.dst.__refcnt))
641                 goto out;
642
643         ret = 1;
644         if (rth->u.dst.expires &&
645             time_after_eq(jiffies, rth->u.dst.expires))
646                 goto out;
647
648         age = jiffies - rth->u.dst.lastuse;
649         ret = 0;
650         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
651             (age <= tmo2 && rt_valuable(rth)))
652                 goto out;
653         ret = 1;
654 out:    return ret;
655 }
656
657 /* Bits of score are:
658  * 31: very valuable
659  * 30: not quite useless
660  * 29..0: usage counter
661  */
662 static inline u32 rt_score(struct rtable *rt)
663 {
664         u32 score = jiffies - rt->u.dst.lastuse;
665
666         score = ~score & ~(3<<30);
667
668         if (rt_valuable(rt))
669                 score |= (1<<31);
670
671         if (!rt->fl.iif ||
672             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
673                 score |= (1<<30);
674
675         return score;
676 }
677
678 static inline bool rt_caching(const struct net *net)
679 {
680         return net->ipv4.current_rt_cache_rebuild_count <=
681                 net->ipv4.sysctl_rt_cache_rebuild_count;
682 }
683
684 static inline bool compare_hash_inputs(const struct flowi *fl1,
685                                         const struct flowi *fl2)
686 {
687         return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
688                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
689                 (fl1->iif ^ fl2->iif)) == 0);
690 }
691
692 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
693 {
694         return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
695                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
696                 (fl1->mark ^ fl2->mark) |
697                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
698                 (fl1->oif ^ fl2->oif) |
699                 (fl1->iif ^ fl2->iif)) == 0;
700 }
701
702 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
703 {
704         return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
705 }
706
707 static inline int rt_is_expired(struct rtable *rth)
708 {
709         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
710 }
711
712 /*
713  * Perform a full scan of hash table and free all entries.
714  * Can be called by a softirq or a process.
715  * In the later case, we want to be reschedule if necessary
716  */
717 static void rt_do_flush(int process_context)
718 {
719         unsigned int i;
720         struct rtable *rth, *next;
721         struct rtable * tail;
722
723         for (i = 0; i <= rt_hash_mask; i++) {
724                 if (process_context && need_resched())
725                         cond_resched();
726                 rth = rt_hash_table[i].chain;
727                 if (!rth)
728                         continue;
729
730                 spin_lock_bh(rt_hash_lock_addr(i));
731 #ifdef CONFIG_NET_NS
732                 {
733                 struct rtable ** prev, * p;
734
735                 rth = rt_hash_table[i].chain;
736
737                 /* defer releasing the head of the list after spin_unlock */
738                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
739                         if (!rt_is_expired(tail))
740                                 break;
741                 if (rth != tail)
742                         rt_hash_table[i].chain = tail;
743
744                 /* call rt_free on entries after the tail requiring flush */
745                 prev = &rt_hash_table[i].chain;
746                 for (p = *prev; p; p = next) {
747                         next = p->u.dst.rt_next;
748                         if (!rt_is_expired(p)) {
749                                 prev = &p->u.dst.rt_next;
750                         } else {
751                                 *prev = next;
752                                 rt_free(p);
753                         }
754                 }
755                 }
756 #else
757                 rth = rt_hash_table[i].chain;
758                 rt_hash_table[i].chain = NULL;
759                 tail = NULL;
760 #endif
761                 spin_unlock_bh(rt_hash_lock_addr(i));
762
763                 for (; rth != tail; rth = next) {
764                         next = rth->u.dst.rt_next;
765                         rt_free(rth);
766                 }
767         }
768 }
769
/*
 * While freeing expired entries, we compute the average chain length
 * and its standard deviation, using fixed-point arithmetic.
 * This gives an estimate for rt_chain_length_max:
 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
780
781 /*
782  * Given a hash chain and an item in this hash chain,
783  * find if a previous entry has the same hash_inputs
784  * (but differs on tos, mark or oif)
785  * Returns 0 if an alias is found.
786  * Returns ONE if rth has no alias before itself.
787  */
788 static int has_noalias(const struct rtable *head, const struct rtable *rth)
789 {
790         const struct rtable *aux = head;
791
792         while (aux != rth) {
793                 if (compare_hash_inputs(&aux->fl, &rth->fl))
794                         return 0;
795                 aux = aux->u.dst.rt_next;
796         }
797         return ONE;
798 }
799
800 static void rt_check_expire(void)
801 {
802         static unsigned int rover;
803         unsigned int i = rover, goal;
804         struct rtable *rth, **rthp;
805         unsigned long samples = 0;
806         unsigned long sum = 0, sum2 = 0;
807         unsigned long delta;
808         u64 mult;
809
810         delta = jiffies - expires_ljiffies;
811         expires_ljiffies = jiffies;
812         mult = ((u64)delta) << rt_hash_log;
813         if (ip_rt_gc_timeout > 1)
814                 do_div(mult, ip_rt_gc_timeout);
815         goal = (unsigned int)mult;
816         if (goal > rt_hash_mask)
817                 goal = rt_hash_mask + 1;
818         for (; goal > 0; goal--) {
819                 unsigned long tmo = ip_rt_gc_timeout;
820                 unsigned long length;
821
822                 i = (i + 1) & rt_hash_mask;
823                 rthp = &rt_hash_table[i].chain;
824
825                 if (need_resched())
826                         cond_resched();
827
828                 samples++;
829
830                 if (*rthp == NULL)
831                         continue;
832                 length = 0;
833                 spin_lock_bh(rt_hash_lock_addr(i));
834                 while ((rth = *rthp) != NULL) {
835                         prefetch(rth->u.dst.rt_next);
836                         if (rt_is_expired(rth)) {
837                                 *rthp = rth->u.dst.rt_next;
838                                 rt_free(rth);
839                                 continue;
840                         }
841                         if (rth->u.dst.expires) {
842                                 /* Entry is expired even if it is in use */
843                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
844 nofree:
845                                         tmo >>= 1;
846                                         rthp = &rth->u.dst.rt_next;
847                                         /*
848                                          * We only count entries on
849                                          * a chain with equal hash inputs once
850                                          * so that entries for different QOS
851                                          * levels, and other non-hash input
852                                          * attributes don't unfairly skew
853                                          * the length computation
854                                          */
855                                         length += has_noalias(rt_hash_table[i].chain, rth);
856                                         continue;
857                                 }
858                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
859                                 goto nofree;
860
861                         /* Cleanup aged off entries. */
862                         *rthp = rth->u.dst.rt_next;
863                         rt_free(rth);
864                 }
865                 spin_unlock_bh(rt_hash_lock_addr(i));
866                 sum += length;
867                 sum2 += length*length;
868         }
869         if (samples) {
870                 unsigned long avg = sum / samples;
871                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
872                 rt_chain_length_max = max_t(unsigned long,
873                                         ip_rt_gc_elasticity,
874                                         (avg + 4*sd) >> FRACT_BITS);
875         }
876         rover = i;
877 }
878
879 /*
880  * rt_worker_func() is run in process context.
881  * we call rt_check_expire() to scan part of the hash table
882  */
883 static void rt_worker_func(struct work_struct *work)
884 {
885         rt_check_expire();
886         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
887 }
888
889 /*
890  * Pertubation of rt_genid by a small quantity [1..256]
891  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
892  * many times (2^24) without giving recent rt_genid.
893  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
894  */
895 static void rt_cache_invalidate(struct net *net)
896 {
897         unsigned char shuffle;
898
899         get_random_bytes(&shuffle, sizeof(shuffle));
900         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
901 }
902
/*
 * Invalidate the route cache of @net and optionally flush it.
 *
 * delay < 0  : invalidate only (fast: entries will be deleted later)
 * delay >= 0 : invalidate and flush the cache right away (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	if (delay < 0)
		return;

	rt_do_flush(!in_softirq());
}
913
/*
 * Flush entries that earlier invalidations left behind in the cache.
 * Unlike rt_cache_flush(), this does not invalidate anything itself.
 */
void rt_cache_flush_batch(void)
{
	rt_do_flush(!in_softirq());
}
919
920 static void rt_emergency_hash_rebuild(struct net *net)
921 {
922         if (net_ratelimit())
923                 printk(KERN_WARNING "Route hash chain too long!\n");
924         rt_cache_invalidate(net);
925 }
926
927 /*
928    Short description of GC goals.
929
930    We want to build algorithm, which will keep routing cache
931    at some equilibrium point, when number of aged off entries
932    is kept approximately equal to newly generated ones.
933
934    Current expiration strength is variable "expire".
935    We try to adjust it dynamically, so that if networking
936    is idle expires is large enough to keep enough of warm entries,
937    and when load increases it reduces to limit cache size.
938  */
939
940 static int rt_garbage_collect(struct dst_ops *ops)
941 {
942         static unsigned long expire = RT_GC_TIMEOUT;
943         static unsigned long last_gc;
944         static int rover;
945         static int equilibrium;
946         struct rtable *rth, **rthp;
947         unsigned long now = jiffies;
948         int goal;
949
950         /*
951          * Garbage collection is pretty expensive,
952          * do not make it too frequently.
953          */
954
955         RT_CACHE_STAT_INC(gc_total);
956
957         if (now - last_gc < ip_rt_gc_min_interval &&
958             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
959                 RT_CACHE_STAT_INC(gc_ignored);
960                 goto out;
961         }
962
963         /* Calculate number of entries, which we want to expire now. */
964         goal = atomic_read(&ipv4_dst_ops.entries) -
965                 (ip_rt_gc_elasticity << rt_hash_log);
966         if (goal <= 0) {
967                 if (equilibrium < ipv4_dst_ops.gc_thresh)
968                         equilibrium = ipv4_dst_ops.gc_thresh;
969                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
970                 if (goal > 0) {
971                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
972                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
973                 }
974         } else {
975                 /* We are in dangerous area. Try to reduce cache really
976                  * aggressively.
977                  */
978                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
979                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
980         }
981
982         if (now - last_gc >= ip_rt_gc_min_interval)
983                 last_gc = now;
984
985         if (goal <= 0) {
986                 equilibrium += goal;
987                 goto work_done;
988         }
989
990         do {
991                 int i, k;
992
993                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
994                         unsigned long tmo = expire;
995
996                         k = (k + 1) & rt_hash_mask;
997                         rthp = &rt_hash_table[k].chain;
998                         spin_lock_bh(rt_hash_lock_addr(k));
999                         while ((rth = *rthp) != NULL) {
1000                                 if (!rt_is_expired(rth) &&
1001                                         !rt_may_expire(rth, tmo, expire)) {
1002                                         tmo >>= 1;
1003                                         rthp = &rth->u.dst.rt_next;
1004                                         continue;
1005                                 }
1006                                 *rthp = rth->u.dst.rt_next;
1007                                 rt_free(rth);
1008                                 goal--;
1009                         }
1010                         spin_unlock_bh(rt_hash_lock_addr(k));
1011                         if (goal <= 0)
1012                                 break;
1013                 }
1014                 rover = k;
1015
1016                 if (goal <= 0)
1017                         goto work_done;
1018
1019                 /* Goal is not achieved. We stop process if:
1020
1021                    - if expire reduced to zero. Otherwise, expire is halfed.
1022                    - if table is not full.
1023                    - if we are called from interrupt.
1024                    - jiffies check is just fallback/debug loop breaker.
1025                      We will not spin here for long time in any case.
1026                  */
1027
1028                 RT_CACHE_STAT_INC(gc_goal_miss);
1029
1030                 if (expire == 0)
1031                         break;
1032
1033                 expire >>= 1;
1034 #if RT_CACHE_DEBUG >= 2
1035                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1036                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
1037 #endif
1038
1039                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1040                         goto out;
1041         } while (!in_softirq() && time_before_eq(jiffies, now));
1042
1043         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1044                 goto out;
1045         if (net_ratelimit())
1046                 printk(KERN_WARNING "dst cache overflow\n");
1047         RT_CACHE_STAT_INC(gc_dst_overflow);
1048         return 1;
1049
1050 work_done:
1051         expire += ip_rt_gc_min_interval;
1052         if (expire > ip_rt_gc_timeout ||
1053             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1054                 expire = ip_rt_gc_timeout;
1055 #if RT_CACHE_DEBUG >= 2
1056         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1057                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
1058 #endif
1059 out:    return 0;
1060 }
1061
1062 /*
1063  * Returns number of entries in a hash chain that have different hash_inputs
1064  */
1065 static int slow_chain_length(const struct rtable *head)
1066 {
1067         int length = 0;
1068         const struct rtable *rth = head;
1069
1070         while (rth) {
1071                 length += has_noalias(head, rth);
1072                 rth = rth->u.dst.rt_next;
1073         }
1074         return length >> FRACT_BITS;
1075 }
1076
1077 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1078                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1079 {
1080         struct rtable   *rth, **rthp;
1081         unsigned long   now;
1082         struct rtable *cand, **candp;
1083         u32             min_score;
1084         int             chain_length;
1085         int attempts = !in_softirq();
1086
1087 restart:
1088         chain_length = 0;
1089         min_score = ~(u32)0;
1090         cand = NULL;
1091         candp = NULL;
1092         now = jiffies;
1093
1094         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1095                 /*
1096                  * If we're not caching, just tell the caller we
1097                  * were successful and don't touch the route.  The
1098                  * caller hold the sole reference to the cache entry, and
1099                  * it will be released when the caller is done with it.
1100                  * If we drop it here, the callers have no way to resolve routes
1101                  * when we're not caching.  Instead, just point *rp at rt, so
1102                  * the caller gets a single use out of the route
1103                  * Note that we do rt_free on this new route entry, so that
1104                  * once its refcount hits zero, we are still able to reap it
1105                  * (Thanks Alexey)
1106                  * Note also the rt_free uses call_rcu.  We don't actually
1107                  * need rcu protection here, this is just our path to get
1108                  * on the route gc list.
1109                  */
1110
1111                 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1112                         int err = arp_bind_neighbour(&rt->u.dst);
1113                         if (err) {
1114                                 if (net_ratelimit())
1115                                         printk(KERN_WARNING
1116                                             "Neighbour table failure & not caching routes.\n");
1117                                 rt_drop(rt);
1118                                 return err;
1119                         }
1120                 }
1121
1122                 rt_free(rt);
1123                 goto skip_hashing;
1124         }
1125
1126         rthp = &rt_hash_table[hash].chain;
1127
1128         spin_lock_bh(rt_hash_lock_addr(hash));
1129         while ((rth = *rthp) != NULL) {
1130                 if (rt_is_expired(rth)) {
1131                         *rthp = rth->u.dst.rt_next;
1132                         rt_free(rth);
1133                         continue;
1134                 }
1135                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1136                         /* Put it first */
1137                         *rthp = rth->u.dst.rt_next;
1138                         /*
1139                          * Since lookup is lockfree, the deletion
1140                          * must be visible to another weakly ordered CPU before
1141                          * the insertion at the start of the hash chain.
1142                          */
1143                         rcu_assign_pointer(rth->u.dst.rt_next,
1144                                            rt_hash_table[hash].chain);
1145                         /*
1146                          * Since lookup is lockfree, the update writes
1147                          * must be ordered for consistency on SMP.
1148                          */
1149                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1150
1151                         dst_use(&rth->u.dst, now);
1152                         spin_unlock_bh(rt_hash_lock_addr(hash));
1153
1154                         rt_drop(rt);
1155                         if (rp)
1156                                 *rp = rth;
1157                         else
1158                                 skb_dst_set(skb, &rth->u.dst);
1159                         return 0;
1160                 }
1161
1162                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1163                         u32 score = rt_score(rth);
1164
1165                         if (score <= min_score) {
1166                                 cand = rth;
1167                                 candp = rthp;
1168                                 min_score = score;
1169                         }
1170                 }
1171
1172                 chain_length++;
1173
1174                 rthp = &rth->u.dst.rt_next;
1175         }
1176
1177         if (cand) {
1178                 /* ip_rt_gc_elasticity used to be average length of chain
1179                  * length, when exceeded gc becomes really aggressive.
1180                  *
1181                  * The second limit is less certain. At the moment it allows
1182                  * only 2 entries per bucket. We will see.
1183                  */
1184                 if (chain_length > ip_rt_gc_elasticity) {
1185                         *candp = cand->u.dst.rt_next;
1186                         rt_free(cand);
1187                 }
1188         } else {
1189                 if (chain_length > rt_chain_length_max &&
1190                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1191                         struct net *net = dev_net(rt->u.dst.dev);
1192                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1193                         if (!rt_caching(net)) {
1194                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1195                                         rt->u.dst.dev->name, num);
1196                         }
1197                         rt_emergency_hash_rebuild(net);
1198                         spin_unlock_bh(rt_hash_lock_addr(hash));
1199
1200                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1201                                         ifindex, rt_genid(net));
1202                         goto restart;
1203                 }
1204         }
1205
1206         /* Try to bind route to arp only if it is output
1207            route or unicast forwarding path.
1208          */
1209         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1210                 int err = arp_bind_neighbour(&rt->u.dst);
1211                 if (err) {
1212                         spin_unlock_bh(rt_hash_lock_addr(hash));
1213
1214                         if (err != -ENOBUFS) {
1215                                 rt_drop(rt);
1216                                 return err;
1217                         }
1218
1219                         /* Neighbour tables are full and nothing
1220                            can be released. Try to shrink route cache,
1221                            it is most likely it holds some neighbour records.
1222                          */
1223                         if (attempts-- > 0) {
1224                                 int saved_elasticity = ip_rt_gc_elasticity;
1225                                 int saved_int = ip_rt_gc_min_interval;
1226                                 ip_rt_gc_elasticity     = 1;
1227                                 ip_rt_gc_min_interval   = 0;
1228                                 rt_garbage_collect(&ipv4_dst_ops);
1229                                 ip_rt_gc_min_interval   = saved_int;
1230                                 ip_rt_gc_elasticity     = saved_elasticity;
1231                                 goto restart;
1232                         }
1233
1234                         if (net_ratelimit())
1235                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1236                         rt_drop(rt);
1237                         return -ENOBUFS;
1238                 }
1239         }
1240
1241         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1242
1243 #if RT_CACHE_DEBUG >= 2
1244         if (rt->u.dst.rt_next) {
1245                 struct rtable *trt;
1246                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1247                        hash, &rt->rt_dst);
1248                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1249                         printk(" . %pI4", &trt->rt_dst);
1250                 printk("\n");
1251         }
1252 #endif
1253         /*
1254          * Since lookup is lockfree, we must make sure
1255          * previous writes to rt are comitted to memory
1256          * before making rt visible to other CPUS.
1257          */
1258         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1259
1260         spin_unlock_bh(rt_hash_lock_addr(hash));
1261
1262 skip_hashing:
1263         if (rp)
1264                 *rp = rt;
1265         else
1266                 skb_dst_set(skb, &rt->u.dst);
1267         return 0;
1268 }
1269
1270 void rt_bind_peer(struct rtable *rt, int create)
1271 {
1272         static DEFINE_SPINLOCK(rt_peer_lock);
1273         struct inet_peer *peer;
1274
1275         peer = inet_getpeer(rt->rt_dst, create);
1276
1277         spin_lock_bh(&rt_peer_lock);
1278         if (rt->peer == NULL) {
1279                 rt->peer = peer;
1280                 peer = NULL;
1281         }
1282         spin_unlock_bh(&rt_peer_lock);
1283         if (peer)
1284                 inet_putpeer(peer);
1285 }
1286
1287 /*
1288  * Peer allocation may fail only in serious out-of-memory conditions.  However
1289  * we still can generate some output.
1290  * Random ID selection looks a bit dangerous because we have no chances to
1291  * select ID being unique in a reasonable period of time.
1292  * But broken packet identifier may be better than no packet at all.
1293  */
1294 static void ip_select_fb_ident(struct iphdr *iph)
1295 {
1296         static DEFINE_SPINLOCK(ip_fb_id_lock);
1297         static u32 ip_fallback_id;
1298         u32 salt;
1299
1300         spin_lock_bh(&ip_fb_id_lock);
1301         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1302         iph->id = htons(salt & 0xFFFF);
1303         ip_fallback_id = salt;
1304         spin_unlock_bh(&ip_fb_id_lock);
1305 }
1306
1307 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1308 {
1309         struct rtable *rt = (struct rtable *) dst;
1310
1311         if (rt) {
1312                 if (rt->peer == NULL)
1313                         rt_bind_peer(rt, 1);
1314
1315                 /* If peer is attached to destination, it is never detached,
1316                    so that we need not to grab a lock to dereference it.
1317                  */
1318                 if (rt->peer) {
1319                         iph->id = htons(inet_getid(rt->peer, more));
1320                         return;
1321                 }
1322         } else
1323                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1324                        __builtin_return_address(0));
1325
1326         ip_select_fb_ident(iph);
1327 }
1328
1329 static void rt_del(unsigned hash, struct rtable *rt)
1330 {
1331         struct rtable **rthp, *aux;
1332
1333         rthp = &rt_hash_table[hash].chain;
1334         spin_lock_bh(rt_hash_lock_addr(hash));
1335         ip_rt_put(rt);
1336         while ((aux = *rthp) != NULL) {
1337                 if (aux == rt || rt_is_expired(aux)) {
1338                         *rthp = aux->u.dst.rt_next;
1339                         rt_free(aux);
1340                         continue;
1341                 }
1342                 rthp = &aux->u.dst.rt_next;
1343         }
1344         spin_unlock_bh(rt_hash_lock_addr(hash));
1345 }
1346
1347 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1348                     __be32 saddr, struct net_device *dev)
1349 {
1350         int i, k;
1351         struct in_device *in_dev = in_dev_get(dev);
1352         struct rtable *rth, **rthp;
1353         __be32  skeys[2] = { saddr, 0 };
1354         int  ikeys[2] = { dev->ifindex, 0 };
1355         struct netevent_redirect netevent;
1356         struct net *net;
1357
1358         if (!in_dev)
1359                 return;
1360
1361         net = dev_net(dev);
1362         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1363             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1364             ipv4_is_zeronet(new_gw))
1365                 goto reject_redirect;
1366
1367         if (!rt_caching(net))
1368                 goto reject_redirect;
1369
1370         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1371                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1372                         goto reject_redirect;
1373                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1374                         goto reject_redirect;
1375         } else {
1376                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1377                         goto reject_redirect;
1378         }
1379
1380         for (i = 0; i < 2; i++) {
1381                 for (k = 0; k < 2; k++) {
1382                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1383                                                 rt_genid(net));
1384
1385                         rthp=&rt_hash_table[hash].chain;
1386
1387                         rcu_read_lock();
1388                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1389                                 struct rtable *rt;
1390
1391                                 if (rth->fl.fl4_dst != daddr ||
1392                                     rth->fl.fl4_src != skeys[i] ||
1393                                     rth->fl.oif != ikeys[k] ||
1394                                     rth->fl.iif != 0 ||
1395                                     rt_is_expired(rth) ||
1396                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1397                                         rthp = &rth->u.dst.rt_next;
1398                                         continue;
1399                                 }
1400
1401                                 if (rth->rt_dst != daddr ||
1402                                     rth->rt_src != saddr ||
1403                                     rth->u.dst.error ||
1404                                     rth->rt_gateway != old_gw ||
1405                                     rth->u.dst.dev != dev)
1406                                         break;
1407
1408                                 dst_hold(&rth->u.dst);
1409                                 rcu_read_unlock();
1410
1411                                 rt = dst_alloc(&ipv4_dst_ops);
1412                                 if (rt == NULL) {
1413                                         ip_rt_put(rth);
1414                                         in_dev_put(in_dev);
1415                                         return;
1416                                 }
1417
1418                                 /* Copy all the information. */
1419                                 *rt = *rth;
1420                                 rt->u.dst.__use         = 1;
1421                                 atomic_set(&rt->u.dst.__refcnt, 1);
1422                                 rt->u.dst.child         = NULL;
1423                                 if (rt->u.dst.dev)
1424                                         dev_hold(rt->u.dst.dev);
1425                                 if (rt->idev)
1426                                         in_dev_hold(rt->idev);
1427                                 rt->u.dst.obsolete      = -1;
1428                                 rt->u.dst.lastuse       = jiffies;
1429                                 rt->u.dst.path          = &rt->u.dst;
1430                                 rt->u.dst.neighbour     = NULL;
1431                                 rt->u.dst.hh            = NULL;
1432 #ifdef CONFIG_XFRM
1433                                 rt->u.dst.xfrm          = NULL;
1434 #endif
1435                                 rt->rt_genid            = rt_genid(net);
1436                                 rt->rt_flags            |= RTCF_REDIRECTED;
1437
1438                                 /* Gateway is different ... */
1439                                 rt->rt_gateway          = new_gw;
1440
1441                                 /* Redirect received -> path was valid */
1442                                 dst_confirm(&rth->u.dst);
1443
1444                                 if (rt->peer)
1445                                         atomic_inc(&rt->peer->refcnt);
1446
1447                                 if (arp_bind_neighbour(&rt->u.dst) ||
1448                                     !(rt->u.dst.neighbour->nud_state &
1449                                             NUD_VALID)) {
1450                                         if (rt->u.dst.neighbour)
1451                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1452                                         ip_rt_put(rth);
1453                                         rt_drop(rt);
1454                                         goto do_next;
1455                                 }
1456
1457                                 netevent.old = &rth->u.dst;
1458                                 netevent.new = &rt->u.dst;
1459                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1460                                                         &netevent);
1461
1462                                 rt_del(hash, rth);
1463                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1464                                         ip_rt_put(rt);
1465                                 goto do_next;
1466                         }
1467                         rcu_read_unlock();
1468                 do_next:
1469                         ;
1470                 }
1471         }
1472         in_dev_put(in_dev);
1473         return;
1474
1475 reject_redirect:
1476 #ifdef CONFIG_IP_ROUTE_VERBOSE
1477         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1478                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1479                         "  Advised path = %pI4 -> %pI4\n",
1480                        &old_gw, dev->name, &new_gw,
1481                        &saddr, &daddr);
1482 #endif
1483         in_dev_put(in_dev);
1484 }
1485
1486 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1487 {
1488         struct rtable *rt = (struct rtable *)dst;
1489         struct dst_entry *ret = dst;
1490
1491         if (rt) {
1492                 if (dst->obsolete > 0) {
1493                         ip_rt_put(rt);
1494                         ret = NULL;
1495                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1496                            (rt->u.dst.expires &&
1497                             time_after_eq(jiffies, rt->u.dst.expires))) {
1498                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1499                                                 rt->fl.oif,
1500                                                 rt_genid(dev_net(dst->dev)));
1501 #if RT_CACHE_DEBUG >= 1
1502                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1503                                 &rt->rt_dst, rt->fl.fl4_tos);
1504 #endif
1505                         rt_del(hash, rt);
1506                         ret = NULL;
1507                 }
1508         }
1509         return ret;
1510 }
1511
1512 /*
1513  * Algorithm:
1514  *      1. The first ip_rt_redirect_number redirects are sent
1515  *         with exponential backoff, then we stop sending them at all,
1516  *         assuming that the host ignores our redirects.
1517  *      2. If we did not see packets requiring redirects
1518  *         during ip_rt_redirect_silence, we assume that the host
1519  *         forgot redirected route and start to send redirects again.
1520  *
1521  * This algorithm is much cheaper and more intelligent than dumb load limiting
1522  * in icmp.c.
1523  *
1524  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1525  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1526  */
1527
1528 void ip_rt_send_redirect(struct sk_buff *skb)
1529 {
1530         struct rtable *rt = skb_rtable(skb);
1531         struct in_device *in_dev;
1532         int log_martians;
1533
1534         rcu_read_lock();
1535         in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1536         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1537                 rcu_read_unlock();
1538                 return;
1539         }
1540         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1541         rcu_read_unlock();
1542
1543         /* No redirected packets during ip_rt_redirect_silence;
1544          * reset the algorithm.
1545          */
1546         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1547                 rt->u.dst.rate_tokens = 0;
1548
1549         /* Too many ignored redirects; do not send anything
1550          * set u.dst.rate_last to the last seen redirected packet.
1551          */
1552         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1553                 rt->u.dst.rate_last = jiffies;
1554                 return;
1555         }
1556
1557         /* Check for load limit; set rate_last to the latest sent
1558          * redirect.
1559          */
1560         if (rt->u.dst.rate_tokens == 0 ||
1561             time_after(jiffies,
1562                        (rt->u.dst.rate_last +
1563                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1564                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1565                 rt->u.dst.rate_last = jiffies;
1566                 ++rt->u.dst.rate_tokens;
1567 #ifdef CONFIG_IP_ROUTE_VERBOSE
1568                 if (log_martians &&
1569                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1570                     net_ratelimit())
1571                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1572                                 &rt->rt_src, rt->rt_iif,
1573                                 &rt->rt_dst, &rt->rt_gateway);
1574 #endif
1575         }
1576 }
1577
1578 static int ip_error(struct sk_buff *skb)
1579 {
1580         struct rtable *rt = skb_rtable(skb);
1581         unsigned long now;
1582         int code;
1583
1584         switch (rt->u.dst.error) {
1585                 case EINVAL:
1586                 default:
1587                         goto out;
1588                 case EHOSTUNREACH:
1589                         code = ICMP_HOST_UNREACH;
1590                         break;
1591                 case ENETUNREACH:
1592                         code = ICMP_NET_UNREACH;
1593                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1594                                         IPSTATS_MIB_INNOROUTES);
1595                         break;
1596                 case EACCES:
1597                         code = ICMP_PKT_FILTERED;
1598                         break;
1599         }
1600
1601         now = jiffies;
1602         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1603         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1604                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1605         rt->u.dst.rate_last = now;
1606         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1607                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1608                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1609         }
1610
1611 out:    kfree_skb(skb);
1612         return 0;
1613 }
1614
/*
 *	MTU plateau table, largest value first.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the first (largest) plateau value that is strictly smaller than
 * @old_mtu, or 68 when no table entry is smaller.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *step = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; step < end; step++) {
		if (*step < old_mtu)
			return *step;
	}

	return 68;
}
1632
1633 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1634                                  unsigned short new_mtu,
1635                                  struct net_device *dev)
1636 {
1637         int i, k;
1638         unsigned short old_mtu = ntohs(iph->tot_len);
1639         struct rtable *rth;
1640         int  ikeys[2] = { dev->ifindex, 0 };
1641         __be32  skeys[2] = { iph->saddr, 0, };
1642         __be32  daddr = iph->daddr;
1643         unsigned short est_mtu = 0;
1644
1645         for (k = 0; k < 2; k++) {
1646                 for (i = 0; i < 2; i++) {
1647                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1648                                                 rt_genid(net));
1649
1650                         rcu_read_lock();
1651                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1652                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1653                                 unsigned short mtu = new_mtu;
1654
1655                                 if (rth->fl.fl4_dst != daddr ||
1656                                     rth->fl.fl4_src != skeys[i] ||
1657                                     rth->rt_dst != daddr ||
1658                                     rth->rt_src != iph->saddr ||
1659                                     rth->fl.oif != ikeys[k] ||
1660                                     rth->fl.iif != 0 ||
1661                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1662                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1663                                     rt_is_expired(rth))
1664                                         continue;
1665
1666                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1667
1668                                         /* BSD 4.2 compatibility hack :-( */
1669                                         if (mtu == 0 &&
1670                                             old_mtu >= dst_mtu(&rth->u.dst) &&
1671                                             old_mtu >= 68 + (iph->ihl << 2))
1672                                                 old_mtu -= iph->ihl << 2;
1673
1674                                         mtu = guess_mtu(old_mtu);
1675                                 }
1676                                 if (mtu <= dst_mtu(&rth->u.dst)) {
1677                                         if (mtu < dst_mtu(&rth->u.dst)) {
1678                                                 dst_confirm(&rth->u.dst);
1679                                                 if (mtu < ip_rt_min_pmtu) {
1680                                                         mtu = ip_rt_min_pmtu;
1681                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1682                                                                 (1 << RTAX_MTU);
1683                                                 }
1684                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1685                                                 dst_set_expires(&rth->u.dst,
1686                                                         ip_rt_mtu_expires);
1687                                         }
1688                                         est_mtu = mtu;
1689                                 }
1690                         }
1691                         rcu_read_unlock();
1692                 }
1693         }
1694         return est_mtu ? : new_mtu;
1695 }
1696
1697 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1698 {
1699         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1700             !(dst_metric_locked(dst, RTAX_MTU))) {
1701                 if (mtu < ip_rt_min_pmtu) {
1702                         mtu = ip_rt_min_pmtu;
1703                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1704                 }
1705                 dst->metrics[RTAX_MTU-1] = mtu;
1706                 dst_set_expires(dst, ip_rt_mtu_expires);
1707                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1708         }
1709 }
1710
1711 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1712 {
1713         if (rt_is_expired((struct rtable *)dst))
1714                 return NULL;
1715         return dst;
1716 }
1717
1718 static void ipv4_dst_destroy(struct dst_entry *dst)
1719 {
1720         struct rtable *rt = (struct rtable *) dst;
1721         struct inet_peer *peer = rt->peer;
1722         struct in_device *idev = rt->idev;
1723
1724         if (peer) {
1725                 rt->peer = NULL;
1726                 inet_putpeer(peer);
1727         }
1728
1729         if (idev) {
1730                 rt->idev = NULL;
1731                 in_dev_put(idev);
1732         }
1733 }
1734
1735 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1736                             int how)
1737 {
1738         struct rtable *rt = (struct rtable *) dst;
1739         struct in_device *idev = rt->idev;
1740         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1741                 struct in_device *loopback_idev =
1742                         in_dev_get(dev_net(dev)->loopback_dev);
1743                 if (loopback_idev) {
1744                         rt->idev = loopback_idev;
1745                         in_dev_put(idev);
1746                 }
1747         }
1748 }
1749
1750 static void ipv4_link_failure(struct sk_buff *skb)
1751 {
1752         struct rtable *rt;
1753
1754         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1755
1756         rt = skb_rtable(skb);
1757         if (rt)
1758                 dst_set_expires(&rt->u.dst, 0);
1759 }
1760
1761 static int ip_rt_bug(struct sk_buff *skb)
1762 {
1763         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1764                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1765                 skb->dev ? skb->dev->name : "?");
1766         kfree_skb(skb);
1767         return 0;
1768 }
1769
1770 /*
1771    We do not cache source address of outgoing interface,
1772    because it is used only by IP RR, TS and SRR options,
1773    so that it out of fast path.
1774
1775    BTW remember: "addr" is allowed to be not aligned
1776    in IP options!
1777  */
1778
1779 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1780 {
1781         __be32 src;
1782         struct fib_result res;
1783
1784         if (rt->fl.iif == 0)
1785                 src = rt->rt_src;
1786         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1787                 src = FIB_RES_PREFSRC(res);
1788                 fib_res_put(&res);
1789         } else
1790                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1791                                         RT_SCOPE_UNIVERSE);
1792         memcpy(addr, &src, 4);
1793 }
1794
#ifdef CONFIG_NET_CLS_ROUTE
/*
 *      Fill in whichever 16-bit half of the route's tclassid is still
 *      zero from the corresponding half of tag; halves that are already
 *      set are left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        struct dst_entry *dst = &rt->u.dst;

        if ((dst->tclassid & 0xFFFF) == 0)
                dst->tclassid |= tag & 0xFFFF;
        if ((dst->tclassid & 0xFFFF0000) == 0)
                dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1804
1805 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1806 {
1807         struct fib_info *fi = res->fi;
1808
1809         if (fi) {
1810                 if (FIB_RES_GW(*res) &&
1811                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1812                         rt->rt_gateway = FIB_RES_GW(*res);
1813                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1814                        sizeof(rt->u.dst.metrics));
1815                 if (fi->fib_mtu == 0) {
1816                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1817                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1818                             rt->rt_gateway != rt->rt_dst &&
1819                             rt->u.dst.dev->mtu > 576)
1820                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1821                 }
1822 #ifdef CONFIG_NET_CLS_ROUTE
1823                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1824 #endif
1825         } else
1826                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1827
1828         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1829                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1830         if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1831                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1832         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1833                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1834                                        ip_rt_min_advmss);
1835         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1836                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1837
1838 #ifdef CONFIG_NET_CLS_ROUTE
1839 #ifdef CONFIG_IP_MULTIPLE_TABLES
1840         set_class_tag(rt, fib_rules_tclass(res));
1841 #endif
1842         set_class_tag(rt, itag);
1843 #endif
1844         rt->rt_type = res->type;
1845 }
1846
1847 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1848                                 u8 tos, struct net_device *dev, int our)
1849 {
1850         unsigned hash;
1851         struct rtable *rth;
1852         __be32 spec_dst;
1853         struct in_device *in_dev = in_dev_get(dev);
1854         u32 itag = 0;
1855
1856         /* Primary sanity checks. */
1857
1858         if (in_dev == NULL)
1859                 return -EINVAL;
1860
1861         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1862             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1863                 goto e_inval;
1864
1865         if (ipv4_is_zeronet(saddr)) {
1866                 if (!ipv4_is_local_multicast(daddr))
1867                         goto e_inval;
1868                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1869         } else if (fib_validate_source(saddr, 0, tos, 0,
1870                                         dev, &spec_dst, &itag, 0) < 0)
1871                 goto e_inval;
1872
1873         rth = dst_alloc(&ipv4_dst_ops);
1874         if (!rth)
1875                 goto e_nobufs;
1876
1877         rth->u.dst.output = ip_rt_bug;
1878         rth->u.dst.obsolete = -1;
1879
1880         atomic_set(&rth->u.dst.__refcnt, 1);
1881         rth->u.dst.flags= DST_HOST;
1882         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1883                 rth->u.dst.flags |= DST_NOPOLICY;
1884         rth->fl.fl4_dst = daddr;
1885         rth->rt_dst     = daddr;
1886         rth->fl.fl4_tos = tos;
1887         rth->fl.mark    = skb->mark;
1888         rth->fl.fl4_src = saddr;
1889         rth->rt_src     = saddr;
1890 #ifdef CONFIG_NET_CLS_ROUTE
1891         rth->u.dst.tclassid = itag;
1892 #endif
1893         rth->rt_iif     =
1894         rth->fl.iif     = dev->ifindex;
1895         rth->u.dst.dev  = init_net.loopback_dev;
1896         dev_hold(rth->u.dst.dev);
1897         rth->idev       = in_dev_get(rth->u.dst.dev);
1898         rth->fl.oif     = 0;
1899         rth->rt_gateway = daddr;
1900         rth->rt_spec_dst= spec_dst;
1901         rth->rt_genid   = rt_genid(dev_net(dev));
1902         rth->rt_flags   = RTCF_MULTICAST;
1903         rth->rt_type    = RTN_MULTICAST;
1904         if (our) {
1905                 rth->u.dst.input= ip_local_deliver;
1906                 rth->rt_flags |= RTCF_LOCAL;
1907         }
1908
1909 #ifdef CONFIG_IP_MROUTE
1910         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1911                 rth->u.dst.input = ip_mr_input;
1912 #endif
1913         RT_CACHE_STAT_INC(in_slow_mc);
1914
1915         in_dev_put(in_dev);
1916         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1917         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1918
1919 e_nobufs:
1920         in_dev_put(in_dev);
1921         return -ENOBUFS;
1922
1923 e_inval:
1924         in_dev_put(in_dev);
1925         return -EINVAL;
1926 }
1927
1928
1929 static void ip_handle_martian_source(struct net_device *dev,
1930                                      struct in_device *in_dev,
1931                                      struct sk_buff *skb,
1932                                      __be32 daddr,
1933                                      __be32 saddr)
1934 {
1935         RT_CACHE_STAT_INC(in_martian_src);
1936 #ifdef CONFIG_IP_ROUTE_VERBOSE
1937         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1938                 /*
1939                  *      RFC1812 recommendation, if source is martian,
1940                  *      the only hint is MAC header.
1941                  */
1942                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1943                         &daddr, &saddr, dev->name);
1944                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1945                         int i;
1946                         const unsigned char *p = skb_mac_header(skb);
1947                         printk(KERN_WARNING "ll header: ");
1948                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1949                                 printk("%02x", *p);
1950                                 if (i < (dev->hard_header_len - 1))
1951                                         printk(":");
1952                         }
1953                         printk("\n");
1954                 }
1955         }
1956 #endif
1957 }
1958
1959 static int __mkroute_input(struct sk_buff *skb,
1960                            struct fib_result *res,
1961                            struct in_device *in_dev,
1962                            __be32 daddr, __be32 saddr, u32 tos,
1963                            struct rtable **result)
1964 {
1965
1966         struct rtable *rth;
1967         int err;
1968         struct in_device *out_dev;
1969         unsigned flags = 0;
1970         __be32 spec_dst;
1971         u32 itag;
1972
1973         /* get a working reference to the output device */
1974         out_dev = in_dev_get(FIB_RES_DEV(*res));
1975         if (out_dev == NULL) {
1976                 if (net_ratelimit())
1977                         printk(KERN_CRIT "Bug in ip_route_input" \
1978                                "_slow(). Please, report\n");
1979                 return -EINVAL;
1980         }
1981
1982
1983         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1984                                   in_dev->dev, &spec_dst, &itag, skb->mark);
1985         if (err < 0) {
1986                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1987                                          saddr);
1988
1989                 err = -EINVAL;
1990                 goto cleanup;
1991         }
1992
1993         if (err)
1994                 flags |= RTCF_DIRECTSRC;
1995
1996         if (out_dev == in_dev && err &&
1997             (IN_DEV_SHARED_MEDIA(out_dev) ||
1998              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1999                 flags |= RTCF_DOREDIRECT;
2000
2001         if (skb->protocol != htons(ETH_P_IP)) {
2002                 /* Not IP (i.e. ARP). Do not create route, if it is
2003                  * invalid for proxy arp. DNAT routes are always valid.
2004                  *
2005                  * Proxy arp feature have been extended to allow, ARP
2006                  * replies back to the same interface, to support
2007                  * Private VLAN switch technologies. See arp.c.
2008                  */
2009                 if (out_dev == in_dev &&
2010                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2011                         err = -EINVAL;
2012                         goto cleanup;
2013                 }
2014         }
2015
2016
2017         rth = dst_alloc(&ipv4_dst_ops);
2018         if (!rth) {
2019                 err = -ENOBUFS;
2020                 goto cleanup;
2021         }
2022
2023         atomic_set(&rth->u.dst.__refcnt, 1);
2024         rth->u.dst.flags= DST_HOST;
2025         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2026                 rth->u.dst.flags |= DST_NOPOLICY;
2027         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2028                 rth->u.dst.flags |= DST_NOXFRM;
2029         rth->fl.fl4_dst = daddr;
2030         rth->rt_dst     = daddr;
2031         rth->fl.fl4_tos = tos;
2032         rth->fl.mark    = skb->mark;
2033         rth->fl.fl4_src = saddr;
2034         rth->rt_src     = saddr;
2035         rth->rt_gateway = daddr;
2036         rth->rt_iif     =
2037                 rth->fl.iif     = in_dev->dev->ifindex;
2038         rth->u.dst.dev  = (out_dev)->dev;
2039         dev_hold(rth->u.dst.dev);
2040         rth->idev       = in_dev_get(rth->u.dst.dev);
2041         rth->fl.oif     = 0;
2042         rth->rt_spec_dst= spec_dst;
2043
2044         rth->u.dst.obsolete = -1;
2045         rth->u.dst.input = ip_forward;
2046         rth->u.dst.output = ip_output;
2047         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2048
2049         rt_set_nexthop(rth, res, itag);
2050
2051         rth->rt_flags = flags;
2052
2053         *result = rth;
2054         err = 0;
2055  cleanup:
2056         /* release the working reference to the output device */
2057         in_dev_put(out_dev);
2058         return err;
2059 }
2060
2061 static int ip_mkroute_input(struct sk_buff *skb,
2062                             struct fib_result *res,
2063                             const struct flowi *fl,
2064                             struct in_device *in_dev,
2065                             __be32 daddr, __be32 saddr, u32 tos)
2066 {
2067         struct rtable* rth = NULL;
2068         int err;
2069         unsigned hash;
2070
2071 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2072         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2073                 fib_select_multipath(fl, res);
2074 #endif
2075
2076         /* create a routing cache entry */
2077         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2078         if (err)
2079                 return err;
2080
2081         /* put it into the cache */
2082         hash = rt_hash(daddr, saddr, fl->iif,
2083                        rt_genid(dev_net(rth->u.dst.dev)));
2084         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2085 }
2086
2087 /*
2088  *      NOTE. We drop all the packets that has local source
2089  *      addresses, because every properly looped back packet
2090  *      must have correct destination already attached by output routine.
2091  *
2092  *      Such approach solves two big problems:
2093  *      1. Not simplex devices are handled properly.
2094  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2095  */
2096
2097 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2098                                u8 tos, struct net_device *dev)
2099 {
2100         struct fib_result res;
2101         struct in_device *in_dev = in_dev_get(dev);
2102         struct flowi fl = { .nl_u = { .ip4_u =
2103                                       { .daddr = daddr,
2104                                         .saddr = saddr,
2105                                         .tos = tos,
2106                                         .scope = RT_SCOPE_UNIVERSE,
2107                                       } },
2108                             .mark = skb->mark,
2109                             .iif = dev->ifindex };
2110         unsigned        flags = 0;
2111         u32             itag = 0;
2112         struct rtable * rth;
2113         unsigned        hash;
2114         __be32          spec_dst;
2115         int             err = -EINVAL;
2116         int             free_res = 0;
2117         struct net    * net = dev_net(dev);
2118
2119         /* IP on this device is disabled. */
2120
2121         if (!in_dev)
2122                 goto out;
2123
2124         /* Check for the most weird martians, which can be not detected
2125            by fib_lookup.
2126          */
2127
2128         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2129             ipv4_is_loopback(saddr))
2130                 goto martian_source;
2131
2132         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2133                 goto brd_input;
2134
2135         /* Accept zero addresses only to limited broadcast;
2136          * I even do not know to fix it or not. Waiting for complains :-)
2137          */
2138         if (ipv4_is_zeronet(saddr))
2139                 goto martian_source;
2140
2141         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2142             ipv4_is_loopback(daddr))
2143                 goto martian_destination;
2144
2145         /*
2146          *      Now we are ready to route packet.
2147          */
2148         if ((err = fib_lookup(net, &fl, &res)) != 0) {
2149                 if (!IN_DEV_FORWARD(in_dev))
2150                         goto e_hostunreach;
2151                 goto no_route;
2152         }
2153         free_res = 1;
2154
2155         RT_CACHE_STAT_INC(in_slow_tot);
2156
2157         if (res.type == RTN_BROADCAST)
2158                 goto brd_input;
2159
2160         if (res.type == RTN_LOCAL) {
2161                 int result;
2162                 result = fib_validate_source(saddr, daddr, tos,
2163                                              net->loopback_dev->ifindex,
2164                                              dev, &spec_dst, &itag, skb->mark);
2165                 if (result < 0)
2166                         goto martian_source;
2167                 if (result)
2168                         flags |= RTCF_DIRECTSRC;
2169                 spec_dst = daddr;
2170                 goto local_input;
2171         }
2172
2173         if (!IN_DEV_FORWARD(in_dev))
2174                 goto e_hostunreach;
2175         if (res.type != RTN_UNICAST)
2176                 goto martian_destination;
2177
2178         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2179 done:
2180         in_dev_put(in_dev);
2181         if (free_res)
2182                 fib_res_put(&res);
2183 out:    return err;
2184
2185 brd_input:
2186         if (skb->protocol != htons(ETH_P_IP))
2187                 goto e_inval;
2188
2189         if (ipv4_is_zeronet(saddr))
2190                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2191         else {
2192                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2193                                           &itag, skb->mark);
2194                 if (err < 0)
2195                         goto martian_source;
2196                 if (err)
2197                         flags |= RTCF_DIRECTSRC;
2198         }
2199         flags |= RTCF_BROADCAST;
2200         res.type = RTN_BROADCAST;
2201         RT_CACHE_STAT_INC(in_brd);
2202
2203 local_input:
2204         rth = dst_alloc(&ipv4_dst_ops);
2205         if (!rth)
2206                 goto e_nobufs;
2207
2208         rth->u.dst.output= ip_rt_bug;
2209         rth->u.dst.obsolete = -1;
2210         rth->rt_genid = rt_genid(net);
2211
2212         atomic_set(&rth->u.dst.__refcnt, 1);
2213         rth->u.dst.flags= DST_HOST;
2214         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2215                 rth->u.dst.flags |= DST_NOPOLICY;
2216         rth->fl.fl4_dst = daddr;
2217         rth->rt_dst     = daddr;
2218         rth->fl.fl4_tos = tos;
2219         rth->fl.mark    = skb->mark;
2220         rth->fl.fl4_src = saddr;
2221         rth->rt_src     = saddr;
2222 #ifdef CONFIG_NET_CLS_ROUTE
2223         rth->u.dst.tclassid = itag;
2224 #endif
2225         rth->rt_iif     =
2226         rth->fl.iif     = dev->ifindex;
2227         rth->u.dst.dev  = net->loopback_dev;
2228         dev_hold(rth->u.dst.dev);
2229         rth->idev       = in_dev_get(rth->u.dst.dev);
2230         rth->rt_gateway = daddr;
2231         rth->rt_spec_dst= spec_dst;
2232         rth->u.dst.input= ip_local_deliver;
2233         rth->rt_flags   = flags|RTCF_LOCAL;
2234         if (res.type == RTN_UNREACHABLE) {
2235                 rth->u.dst.input= ip_error;
2236                 rth->u.dst.error= -err;
2237                 rth->rt_flags   &= ~RTCF_LOCAL;
2238         }
2239         rth->rt_type    = res.type;
2240         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2241         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2242         goto done;
2243
2244 no_route:
2245         RT_CACHE_STAT_INC(in_no_route);
2246         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2247         res.type = RTN_UNREACHABLE;
2248         if (err == -ESRCH)
2249                 err = -ENETUNREACH;
2250         goto local_input;
2251
2252         /*
2253          *      Do not cache martian addresses: they should be logged (RFC1812)
2254          */
2255 martian_destination:
2256         RT_CACHE_STAT_INC(in_martian_dst);
2257 #ifdef CONFIG_IP_ROUTE_VERBOSE
2258         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2259                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2260                         &daddr, &saddr, dev->name);
2261 #endif
2262
2263 e_hostunreach:
2264         err = -EHOSTUNREACH;
2265         goto done;
2266
2267 e_inval:
2268         err = -EINVAL;
2269         goto done;
2270
2271 e_nobufs:
2272         err = -ENOBUFS;
2273         goto done;
2274
2275 martian_source:
2276         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2277         goto e_inval;
2278 }
2279
2280 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2281                    u8 tos, struct net_device *dev)
2282 {
2283         struct rtable * rth;
2284         unsigned        hash;
2285         int iif = dev->ifindex;
2286         struct net *net;
2287
2288         net = dev_net(dev);
2289
2290         if (!rt_caching(net))
2291                 goto skip_cache;
2292
2293         tos &= IPTOS_RT_MASK;
2294         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2295
2296         rcu_read_lock();
2297         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2298              rth = rcu_dereference(rth->u.dst.rt_next)) {
2299                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2300                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2301                      (rth->fl.iif ^ iif) |
2302                      rth->fl.oif |
2303                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2304                     rth->fl.mark == skb->mark &&
2305                     net_eq(dev_net(rth->u.dst.dev), net) &&
2306                     !rt_is_expired(rth)) {
2307                         dst_use(&rth->u.dst, jiffies);
2308                         RT_CACHE_STAT_INC(in_hit);
2309                         rcu_read_unlock();
2310                         skb_dst_set(skb, &rth->u.dst);
2311                         return 0;
2312                 }
2313                 RT_CACHE_STAT_INC(in_hlist_search);
2314         }
2315         rcu_read_unlock();
2316
2317 skip_cache:
2318         /* Multicast recognition logic is moved from route cache to here.
2319            The problem was that too many Ethernet cards have broken/missing
2320            hardware multicast filters :-( As result the host on multicasting
2321            network acquires a lot of useless route cache entries, sort of
2322            SDR messages from all the world. Now we try to get rid of them.
2323            Really, provided software IP multicast filter is organized
2324            reasonably (at least, hashed), it does not result in a slowdown
2325            comparing with route cache reject entries.
2326            Note, that multicast routers are not affected, because
2327            route cache entry is created eventually.
2328          */
2329         if (ipv4_is_multicast(daddr)) {
2330                 struct in_device *in_dev;
2331
2332                 rcu_read_lock();
2333                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2334                         int our = ip_check_mc(in_dev, daddr, saddr,
2335                                 ip_hdr(skb)->protocol);
2336                         if (our
2337 #ifdef CONFIG_IP_MROUTE
2338                                 ||
2339                             (!ipv4_is_local_multicast(daddr) &&
2340                              IN_DEV_MFORWARD(in_dev))
2341 #endif
2342                            ) {
2343                                 rcu_read_unlock();
2344                                 return ip_route_input_mc(skb, daddr, saddr,
2345                                                          tos, dev, our);
2346                         }
2347                 }
2348                 rcu_read_unlock();
2349                 return -EINVAL;
2350         }
2351         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2352 }
2353
2354 static int __mkroute_output(struct rtable **result,
2355                             struct fib_result *res,
2356                             const struct flowi *fl,
2357                             const struct flowi *oldflp,
2358                             struct net_device *dev_out,
2359                             unsigned flags)
2360 {
2361         struct rtable *rth;
2362         struct in_device *in_dev;
2363         u32 tos = RT_FL_TOS(oldflp);
2364         int err = 0;
2365
2366         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2367                 return -EINVAL;
2368
2369         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2370                 res->type = RTN_BROADCAST;
2371         else if (ipv4_is_multicast(fl->fl4_dst))
2372                 res->type = RTN_MULTICAST;
2373         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2374                 return -EINVAL;
2375
2376         if (dev_out->flags & IFF_LOOPBACK)
2377                 flags |= RTCF_LOCAL;
2378
2379         /* get work reference to inet device */
2380         in_dev = in_dev_get(dev_out);
2381         if (!in_dev)
2382                 return -EINVAL;
2383
2384         if (res->type == RTN_BROADCAST) {
2385                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2386                 if (res->fi) {
2387                         fib_info_put(res->fi);
2388                         res->fi = NULL;
2389                 }
2390         } else if (res->type == RTN_MULTICAST) {
2391                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2392                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2393                                  oldflp->proto))
2394                         flags &= ~RTCF_LOCAL;
2395                 /* If multicast route do not exist use
2396                    default one, but do not gateway in this case.
2397                    Yes, it is hack.
2398                  */
2399                 if (res->fi && res->prefixlen < 4) {
2400                         fib_info_put(res->fi);
2401                         res->fi = NULL;
2402                 }
2403         }
2404
2405
2406         rth = dst_alloc(&ipv4_dst_ops);
2407         if (!rth) {
2408                 err = -ENOBUFS;
2409                 goto cleanup;
2410         }
2411
2412         atomic_set(&rth->u.dst.__refcnt, 1);
2413         rth->u.dst.flags= DST_HOST;
2414         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2415                 rth->u.dst.flags |= DST_NOXFRM;
2416         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2417                 rth->u.dst.flags |= DST_NOPOLICY;
2418
2419         rth->fl.fl4_dst = oldflp->fl4_dst;
2420         rth->fl.fl4_tos = tos;
2421         rth->fl.fl4_src = oldflp->fl4_src;
2422         rth->fl.oif     = oldflp->oif;
2423         rth->fl.mark    = oldflp->mark;
2424         rth->rt_dst     = fl->fl4_dst;
2425         rth->rt_src     = fl->fl4_src;
2426         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2427         /* get references to the devices that are to be hold by the routing
2428            cache entry */
2429         rth->u.dst.dev  = dev_out;
2430         dev_hold(dev_out);
2431         rth->idev       = in_dev_get(dev_out);
2432         rth->rt_gateway = fl->fl4_dst;
2433         rth->rt_spec_dst= fl->fl4_src;
2434
2435         rth->u.dst.output=ip_output;
2436         rth->u.dst.obsolete = -1;
2437         rth->rt_genid = rt_genid(dev_net(dev_out));
2438
2439         RT_CACHE_STAT_INC(out_slow_tot);
2440
2441         if (flags & RTCF_LOCAL) {
2442                 rth->u.dst.input = ip_local_deliver;
2443                 rth->rt_spec_dst = fl->fl4_dst;
2444         }
2445         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2446                 rth->rt_spec_dst = fl->fl4_src;
2447                 if (flags & RTCF_LOCAL &&
2448                     !(dev_out->flags & IFF_LOOPBACK)) {
2449                         rth->u.dst.output = ip_mc_output;
2450                         RT_CACHE_STAT_INC(out_slow_mc);
2451                 }
2452 #ifdef CONFIG_IP_MROUTE
2453                 if (res->type == RTN_MULTICAST) {
2454                         if (IN_DEV_MFORWARD(in_dev) &&
2455                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2456                                 rth->u.dst.input = ip_mr_input;
2457                                 rth->u.dst.output = ip_mc_output;
2458                         }
2459                 }
2460 #endif
2461         }
2462
2463         rt_set_nexthop(rth, res, 0);
2464
2465         rth->rt_flags = flags;
2466
2467         *result = rth;
2468  cleanup:
2469         /* release work reference to inet device */
2470         in_dev_put(in_dev);
2471
2472         return err;
2473 }
2474
2475 static int ip_mkroute_output(struct rtable **rp,
2476                              struct fib_result *res,
2477                              const struct flowi *fl,
2478                              const struct flowi *oldflp,
2479                              struct net_device *dev_out,
2480                              unsigned flags)
2481 {
2482         struct rtable *rth = NULL;
2483         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2484         unsigned hash;
2485         if (err == 0) {
2486                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2487                                rt_genid(dev_net(dev_out)));
2488                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2489         }
2490
2491         return err;
2492 }
2493
2494 /*
2495  * Major route resolver routine.
2496  */
2497
/*
 * ip_route_output_slow - resolve an output route without using the cache.
 * @net:    network namespace the lookup is performed in
 * @rp:     where the resulting route is stored (filled by ip_mkroute_output())
 * @oldflp: the flow exactly as supplied by the caller
 *
 * A private lookup key 'fl' is derived from @oldflp (TOS masked with
 * IPTOS_RT_MASK, scope narrowed to RT_SCOPE_LINK when RTO_ONLINK is set,
 * iif forced to the loopback device).  The function then chooses the
 * output device and source address in this order:
 *
 *   1. a caller-supplied source address is sanity-checked;
 *   2. a caller-supplied output interface is looked up and validated;
 *   3. an empty destination is turned into a loopback route;
 *   4. otherwise the FIB is consulted; a local result is redirected to
 *      loopback, anything else takes device/prefsrc from the FIB result.
 *
 * Every path that reaches 'make_route' holds one reference on dev_out
 * (dropped again after ip_mkroute_output()), and 'free_res' records
 * whether 'res' came from a successful fib_lookup() and must be released.
 *
 * Returns 0 on success or a negative errno: -EINVAL for an unusable
 * source address, -ENODEV for an unusable output interface,
 * -ENETUNREACH when the FIB lookup fails and no oif was given, or
 * whatever ip_mkroute_output() reports.
 */
2498 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2499                                 const struct flowi *oldflp)
2500 {
2501         u32 tos = RT_FL_TOS(oldflp);
2502         struct flowi fl = { .nl_u = { .ip4_u =
2503                                       { .daddr = oldflp->fl4_dst,
2504                                         .saddr = oldflp->fl4_src,
2505                                         .tos = tos & IPTOS_RT_MASK,
2506                                         .scope = ((tos & RTO_ONLINK) ?
2507                                                   RT_SCOPE_LINK :
2508                                                   RT_SCOPE_UNIVERSE),
2509                                       } },
2510                             .mark = oldflp->mark,
2511                             .iif = net->loopback_dev->ifindex,
2512                             .oif = oldflp->oif };
2513         struct fib_result res;
2514         unsigned flags = 0;
2515         struct net_device *dev_out = NULL;
2516         int free_res = 0;
2517         int err;
2518
2519
2520         res.fi          = NULL;
2521 #ifdef CONFIG_IP_MULTIPLE_TABLES
2522         res.r           = NULL;
2523 #endif
2524
             /* Step 1: the caller pinned a source address - validate it. */
2525         if (oldflp->fl4_src) {
                     /* -EINVAL covers every 'goto out' inside this branch. */
2526                 err = -EINVAL;
2527                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2528                     ipv4_is_lbcast(oldflp->fl4_src) ||
2529                     ipv4_is_zeronet(oldflp->fl4_src))
2530                         goto out;
2531
2532                 /* I removed check for oif == dev_out->oif here.
2533                    It was wrong for two reasons:
2534                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2535                       is assigned to multiple interfaces.
2536                    2. Moreover, we are allowed to send packets with saddr
2537                       of another iface. --ANK
2538                  */
2539
2540                 if (oldflp->oif == 0 &&
2541                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2542                      oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2543                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2544                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2545                         if (dev_out == NULL)
2546                                 goto out;
2547
2548                         /* Special hack: user can direct multicasts
2549                            and limited broadcast via necessary interface
2550                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2551                            This hack is not just for fun, it allows
2552                            vic,vat and friends to work.
2553                            They bind socket to loopback, set ttl to zero
2554                            and expect that it will work.
2555                            From the viewpoint of routing cache they are broken,
2556                            because we are not allowed to build multicast path
2557                            with loopback source addr (look, routing cache
2558                            cannot know, that ttl is zero, so that packet
2559                            will not leave this host and route is valid).
2560                            Luckily, this hack is good workaround.
2561                          */
2562
2563                         fl.oif = dev_out->ifindex;
2564                         goto make_route;
2565                 }
2566
2567                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2568                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2569                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2570                         if (dev_out == NULL)
2571                                 goto out;
                             /* The lookup only served as a "is this address
                                local?" check; the device itself is not kept. */
2572                         dev_put(dev_out);
2573                         dev_out = NULL;
2574                 }
2575         }
2576
2577
             /* Step 2: the caller pinned an output interface. */
2578         if (oldflp->oif) {
2579                 dev_out = dev_get_by_index(net, oldflp->oif);
                     /* -ENODEV is used both for "no such device" and for
                        "device has no inet configuration" below. */
2580                 err = -ENODEV;
2581                 if (dev_out == NULL)
2582                         goto out;
2583
2584                 /* RACE: Check return value of inet_select_addr instead. */
2585                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2586                         dev_put(dev_out);
2587                         goto out;       /* Wrong error code */
2588                 }
2589
2590                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2591                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2592                         if (!fl.fl4_src)
2593                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2594                                                               RT_SCOPE_LINK);
2595                         goto make_route;
2596                 }
2597                 if (!fl.fl4_src) {
2598                         if (ipv4_is_multicast(oldflp->fl4_dst))
2599                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2600                                                               fl.fl4_scope);
2601                         else if (!oldflp->fl4_dst)
2602                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2603                                                               RT_SCOPE_HOST);
2604                 }
2605         }
2606
             /* Step 3: no destination - route to ourselves via loopback,
                falling back to INADDR_LOOPBACK when no source is known. */
2607         if (!fl.fl4_dst) {
2608                 fl.fl4_dst = fl.fl4_src;
2609                 if (!fl.fl4_dst)
2610                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2611                 if (dev_out)
2612                         dev_put(dev_out);
2613                 dev_out = net->loopback_dev;
2614                 dev_hold(dev_out);
2615                 fl.oif = net->loopback_dev->ifindex;
2616                 res.type = RTN_LOCAL;
2617                 flags |= RTCF_LOCAL;
2618                 goto make_route;
2619         }
2620
             /* Step 4: consult the FIB; a non-zero return means no match. */
2621         if (fib_lookup(net, &fl, &res)) {
2622                 res.fi = NULL;
2623                 if (oldflp->oif) {
2624                         /* Apparently, routing tables are wrong. Assume,
2625                            that the destination is on link.
2626
2627                            WHY? DW.
2628                            Because we are allowed to send to iface
2629                            even if it has NO routes and NO assigned
2630                            addresses. When oif is specified, routing
2631                            tables are looked up with only one purpose:
2632                            to catch if destination is gatewayed, rather than
2633                            direct. Moreover, if MSG_DONTROUTE is set,
2634                            we send packet, ignoring both routing tables
2635                            and ifaddr state. --ANK
2636
2637
2638                            We could make it even if oif is unknown,
2639                            likely IPv6, but we do not.
2640                          */
2641
2642                         if (fl.fl4_src == 0)
2643                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2644                                                               RT_SCOPE_LINK);
2645                         res.type = RTN_UNICAST;
2646                         goto make_route;
2647                 }
2648                 if (dev_out)
2649                         dev_put(dev_out);
2650                 err = -ENETUNREACH;
2651                 goto out;
2652         }
             /* From here on 'res' came from a successful lookup and is
                released with fib_res_put() after make_route. */
2653         free_res = 1;
2654
             /* Destination is one of our own addresses: deliver through
                loopback and drop the FIB info held in res.fi. */
2655         if (res.type == RTN_LOCAL) {
2656                 if (!fl.fl4_src)
2657                         fl.fl4_src = fl.fl4_dst;
2658                 if (dev_out)
2659                         dev_put(dev_out);
2660                 dev_out = net->loopback_dev;
2661                 dev_hold(dev_out);
2662                 fl.oif = dev_out->ifindex;
2663                 if (res.fi)
2664                         fib_info_put(res.fi);
2665                 res.fi = NULL;
2666                 flags |= RTCF_LOCAL;
2667                 goto make_route;
2668         }
2669
             /* Note: with CONFIG_IP_ROUTE_MULTIPATH the 'else' below binds
                to the default-route check that follows the #endif. */
2670 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2671         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2672                 fib_select_multipath(&fl, &res);
2673         else
2674 #endif
2675         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2676                 fib_select_default(net, &fl, &res);
2677
2678         if (!fl.fl4_src)
2679                 fl.fl4_src = FIB_RES_PREFSRC(res);
2680
             /* Replace any device chosen earlier with the one the FIB
                result points at, taking a fresh reference on it. */
2681         if (dev_out)
2682                 dev_put(dev_out);
2683         dev_out = FIB_RES_DEV(res);
2684         dev_hold(dev_out);
2685         fl.oif = dev_out->ifindex;
2686
2687
2688 make_route:
2689         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2690
2691
2692         if (free_res)
2693                 fib_res_put(&res);
2694         if (dev_out)
2695                 dev_put(dev_out);
2696 out:    return err;
2697 }
2698
2699 int __ip_route_output_key(struct net *net, struct rtable **rp,
2700                           const struct flowi *flp)
2701 {
2702         unsigned hash;
2703         struct rtable *rth;
2704
2705         if (!rt_caching(net))
2706                 goto slow_output;
2707
2708         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2709
2710         rcu_read_lock_bh();
2711         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2712                 rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
2713                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2714                     rth->fl.fl4_src == flp->fl4_src &&
2715                     rth->fl.iif == 0 &&
2716                     rth->fl.oif == flp->oif &&
2717                     rth->fl.mark == flp->mark &&
2718                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2719                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2720                     net_eq(dev_net(rth->u.dst.dev), net) &&
2721                     !rt_is_expired(rth)) {
2722                         dst_use(&rth->u.dst, jiffies);
2723                         RT_CACHE_STAT_INC(out_hit);
2724                         rcu_read_unlock_bh();
2725                         *rp = rth;
2726                         return 0;
2727                 }
2728                 RT_CACHE_STAT_INC(out_hlist_search);
2729         }
2730         rcu_read_unlock_bh();
2731
2732 slow_output:
2733         return ip_route_output_slow(net, rp, flp);
2734 }
2735
2736 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2737
2738 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2739 {
2740 }
2741
2742 static struct dst_ops ipv4_dst_blackhole_ops = {
2743         .family                 =       AF_INET,
2744         .protocol               =       cpu_to_be16(ETH_P_IP),
2745         .destroy                =       ipv4_dst_destroy,
2746         .check                  =       ipv4_dst_check,
2747         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2748         .entries                =       ATOMIC_INIT(0),
2749 };
2750
2751
2752 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2753 {
2754         struct rtable *ort = *rp;
2755         struct rtable *rt = (struct rtable *)
2756                 dst_alloc(&ipv4_dst_blackhole_ops);
2757
2758         if (rt) {
2759                 struct dst_entry *new = &rt->u.dst;
2760
2761                 atomic_set(&new->__refcnt, 1);
2762                 new->__use = 1;
2763                 new->input = dst_discard;
2764                 new->output = dst_discard;
2765                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2766
2767                 new->dev = ort->u.dst.dev;
2768                 if (new->dev)
2769                         dev_hold(new->dev);
2770
2771                 rt->fl = ort->fl;
2772
2773                 rt->idev = ort->idev;
2774                 if (rt->idev)
2775                         in_dev_hold(rt->idev);
2776                 rt->rt_genid = rt_genid(net);
2777                 rt->rt_flags = ort->rt_flags;
2778                 rt->rt_type = ort->rt_type;
2779                 rt->rt_dst = ort->rt_dst;
2780                 rt->rt_src = ort->rt_src;
2781                 rt->rt_iif = ort->rt_iif;
2782                 rt->rt_gateway = ort->rt_gateway;
2783                 rt->rt_spec_dst = ort->rt_spec_dst;
2784                 rt->peer = ort->peer;
2785                 if (rt->peer)
2786                         atomic_inc(&rt->peer->refcnt);
2787
2788                 dst_free(new);
2789         }
2790
2791         dst_release(&(*rp)->u.dst);
2792         *rp = rt;
2793         return (rt ? 0 : -ENOMEM);
2794 }
2795
2796 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2797                          struct sock *sk, int flags)
2798 {
2799         int err;
2800
2801         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2802                 return err;
2803
2804         if (flp->proto) {
2805                 if (!flp->fl4_src)
2806                         flp->fl4_src = (*rp)->rt_src;
2807                 if (!flp->fl4_dst)
2808                         flp->fl4_dst = (*rp)->rt_dst;
2809                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2810                                     flags ? XFRM_LOOKUP_WAIT : 0);
2811                 if (err == -EREMOTE)
2812                         err = ipv4_dst_blackhole(net, rp, flp);
2813
2814                 return err;
2815         }
2816
2817         return 0;
2818 }
2819
2820 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2821
2822 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2823 {
2824         return ip_route_output_flow(net, rp, flp, NULL, 0);
2825 }
2826
/*
 * rt_fill_info - append a netlink route message describing skb's route.
 * @net:    namespace, used for the multicast-forwarding check
 * @skb:    message buffer; the route is taken from skb_rtable(skb)
 * @pid:    netlink pid placed in the message header
 * @seq:    netlink sequence number placed in the message header
 * @event:  netlink message type
 * @nowait: passed through to ipmr_get_route(); also selects how its
 *          failures are handled (see below)
 * @flags:  netlink message flags
 *
 * Emits an rtmsg header followed by RTA_TABLE, RTA_DST, and - depending
 * on the route - RTA_SRC, RTA_OIF, RTA_FLOW, RTA_PREFSRC, RTA_GATEWAY,
 * the metrics, RTA_IIF (or multicast route data) and the cache info.
 *
 * NOTE(review): the NLA_PUT*() helpers are expected to jump to the
 * nla_put_failure label when an attribute does not fit; their
 * definition is not visible in this file section.
 *
 * Returns the value of nlmsg_end() on success, 0 when ipmr_get_route()
 * returned 0 and @nowait is clear, or -EMSGSIZE when the message could
 * not be built (the partial message is cancelled first, except when
 * nlmsg_put() itself failed).
 */
2827 static int rt_fill_info(struct net *net,
2828                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2829                         int nowait, unsigned int flags)
2830 {
2831         struct rtable *rt = skb_rtable(skb);
2832         struct rtmsg *r;
2833         struct nlmsghdr *nlh;
2834         long expires;
2835         u32 id = 0, ts = 0, tsage = 0, error;
2836
2837         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2838         if (nlh == NULL)
2839                 return -EMSGSIZE;
2840
2841         r = nlmsg_data(nlh);
2842         r->rtm_family    = AF_INET;
2843         r->rtm_dst_len  = 32;
2844         r->rtm_src_len  = 0;
2845         r->rtm_tos      = rt->fl.fl4_tos;
2846         r->rtm_table    = RT_TABLE_MAIN;
2847         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2848         r->rtm_type     = rt->rt_type;
2849         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2850         r->rtm_protocol = RTPROT_UNSPEC;
             /* Only the upper bits of rt_flags are exported; the low 16
                bits are replaced by RTM_F_* flags. */
2851         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2852         if (rt->rt_flags & RTCF_NOTIFY)
2853                 r->rtm_flags |= RTM_F_NOTIFY;
2854
2855         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2856
2857         if (rt->fl.fl4_src) {
2858                 r->rtm_src_len = 32;
2859                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2860         }
2861         if (rt->u.dst.dev)
2862                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2863 #ifdef CONFIG_NET_CLS_ROUTE
2864         if (rt->u.dst.tclassid)
2865                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2866 #endif
             /* Input routes (fl.iif set) report rt_spec_dst; output routes
                report rt_src, but only when it differs from the key. */
2867         if (rt->fl.iif)
2868                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2869         else if (rt->rt_src != rt->fl.fl4_src)
2870                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2871
2872         if (rt->rt_dst != rt->rt_gateway)
2873                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2874
2875         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2876                 goto nla_put_failure;
2877
             /* Gather the values later handed to rtnl_put_cacheinfo(). */
2878         error = rt->u.dst.error;
2879         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2880         if (rt->peer) {
2881                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2882                 if (rt->peer->tcp_ts_stamp) {
2883                         ts = rt->peer->tcp_ts;
2884                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2885                 }
2886         }
2887
2888         if (rt->fl.iif) {
2889 #ifdef CONFIG_IP_MROUTE
2890                 __be32 dst = rt->rt_dst;
2891
2892                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2893                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2894                         int err = ipmr_get_route(net, skb, r, nowait);
                             /* err <= 0: with !nowait, 0 ends the function
                                successfully and anything else is treated as
                                a fill failure; with nowait, only -EMSGSIZE
                                aborts and other codes are reported through
                                the cacheinfo 'error' field. */
2895                         if (err <= 0) {
2896                                 if (!nowait) {
2897                                         if (err == 0)
2898                                                 return 0;
2899                                         goto nla_put_failure;
2900                                 } else {
2901                                         if (err == -EMSGSIZE)
2902                                                 goto nla_put_failure;
2903                                         error = err;
2904                                 }
2905                         }
2906                 } else
2907 #endif
2908                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2909         }
2910
2911         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2912                                expires, error) < 0)
2913                 goto nla_put_failure;
2914
2915         return nlmsg_end(skb, nlh);
2916
2917 nla_put_failure:
2918         nlmsg_cancel(skb, nlh);
2919         return -EMSGSIZE;
2920 }
2921
/*
 * inet_rtm_getroute - answer a netlink route query.
 * @in_skb: the request message buffer (its socket supplies the namespace
 *          and its NETLINK_CB pid is the reply destination)
 * @nlh:    the request's netlink header
 * @arg:    unused
 *
 * Parses RTA_SRC, RTA_DST, RTA_IIF and RTA_OIF from the request,
 * allocates a reply skb, and resolves the route:
 *   - with RTA_IIF, as if a packet had arrived on that interface
 *     (ip_route_input() on a dummy skb);
 *   - otherwise as an output route (ip_route_output_key()).
 * The resulting route is attached to the reply skb, described with
 * rt_fill_info() and sent back with rtnl_unicast().
 *
 * Returns the result of rtnl_unicast() on success, or a non-positive
 * value when parsing, allocation (-ENOBUFS), device lookup (-ENODEV),
 * route resolution or message filling did not succeed; in those cases
 * after allocation the reply skb is freed.
 */
2922 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2923 {
2924         struct net *net = sock_net(in_skb->sk);
2925         struct rtmsg *rtm;
2926         struct nlattr *tb[RTA_MAX+1];
2927         struct rtable *rt = NULL;
2928         __be32 dst = 0;
2929         __be32 src = 0;
2930         u32 iif;
2931         int err;
2932         struct sk_buff *skb;
2933
2934         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2935         if (err < 0)
2936                 goto errout;
2937
2938         rtm = nlmsg_data(nlh);
2939
2940         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2941         if (skb == NULL) {
2942                 err = -ENOBUFS;
2943                 goto errout;
2944         }
2945
2946         /* Reserve room for dummy headers, this skb can pass
2947            through good chunk of routing engine.
2948          */
2949         skb_reset_mac_header(skb);
2950         skb_reset_network_header(skb);
2951
2952         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2953         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2954         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2955
             /* Absent attributes default to 0. */
2956         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2957         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2958         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2959
2960         if (iif) {
2961                 struct net_device *dev;
2962
2963                 dev = __dev_get_by_index(net, iif);
2964                 if (dev == NULL) {
2965                         err = -ENODEV;
2966                         goto errout_free;
2967                 }
2968
2969                 skb->protocol   = htons(ETH_P_IP);
2970                 skb->dev        = dev;
                     /* The input lookup is performed with bottom halves
                        disabled. */
2971                 local_bh_disable();
2972                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2973                 local_bh_enable();
2974
                     /* A route that carries an error of its own is reported
                        as a failure with that error negated. */
2975                 rt = skb_rtable(skb);
2976                 if (err == 0 && rt->u.dst.error)
2977                         err = -rt->u.dst.error;
2978         } else {
2979                 struct flowi fl = {
2980                         .nl_u = {
2981                                 .ip4_u = {
2982                                         .daddr = dst,
2983                                         .saddr = src,
2984                                         .tos = rtm->rtm_tos,
2985                                 },
2986                         },
2987                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2988                 };
2989                 err = ip_route_output_key(net, &rt, &fl);
2990         }
2991
2992         if (err)
2993                 goto errout_free;
2994
2995         skb_dst_set(skb, &rt->u.dst);
2996         if (rtm->rtm_flags & RTM_F_NOTIFY)
2997                 rt->rt_flags |= RTCF_NOTIFY;
2998
2999         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3000                            RTM_NEWROUTE, 0, 0);
             /* A non-positive result means there is nothing to send; the
                reply skb is freed and the value is returned as-is. */
3001         if (err <= 0)
3002                 goto errout_free;
3003
3004         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3005 errout:
3006         return err;
3007
3008 errout_free:
3009         kfree_skb(skb);
3010         goto errout;
3011 }
3012
3013 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3014 {
3015         struct rtable *rt;
3016         int h, s_h;
3017         int idx, s_idx;
3018         struct net *net;
3019
3020         net = sock_net(skb->sk);
3021
3022         s_h = cb->args[0];
3023         if (s_h < 0)
3024                 s_h = 0;
3025         s_idx = idx = cb->args[1];
3026         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3027                 if (!rt_hash_table[h].chain)
3028                         continue;
3029                 rcu_read_lock_bh();
3030                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3031                      rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
3032                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3033                                 continue;
3034                         if (rt_is_expired(rt))
3035                                 continue;
3036                         skb_dst_set_noref(skb, &rt->u.dst);
3037                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3038                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3039                                          1, NLM_F_MULTI) <= 0) {
3040                                 skb_dst_drop(skb);
3041                                 rcu_read_unlock_bh();
3042                                 goto done;
3043                         }
3044                         skb_dst_drop(skb);
3045                 }
3046                 rcu_read_unlock_bh();
3047         }
3048
3049 done:
3050         cb->args[0] = h;
3051         cb->args[1] = idx;
3052         return skb->len;
3053 }
3054
3055 void ip_rt_multicast_event(struct in_device *in_dev)
3056 {
3057         rt_cache_flush(dev_net(in_dev->dev), 0);
3058 }
3059
3060 #ifdef CONFIG_SYSCTL
3061 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3062                                         void __user *buffer,
3063                                         size_t *lenp, loff_t *ppos)
3064 {
3065         if (write) {
3066                 int flush_delay;
3067                 ctl_table ctl;
3068                 struct net *net;
3069
3070                 memcpy(&ctl, __ctl, sizeof(ctl));
3071                 ctl.data = &flush_delay;
3072                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3073
3074                 net = (struct net *)__ctl->extra1;
3075                 rt_cache_flush(net, flush_delay);
3076                 return 0;
3077         }
3078
3079         return -EINVAL;
3080 }
3081
3082 static ctl_table ipv4_route_table[] = {
3083         {
3084                 .procname       = "gc_thresh",
3085                 .data           = &ipv4_dst_ops.gc_thresh,
3086                 .maxlen         = sizeof(int),
3087                 .mode           = 0644,
3088                 .proc_handler   = proc_dointvec,
3089         },
3090         {
3091                 .procname       = "max_size",
3092                 .data           = &ip_rt_max_size,
3093                 .maxlen         = sizeof(int),
3094                 .mode           = 0644,
3095                 .proc_handler   = proc_dointvec,
3096         },
3097         {
3098                 /*  Deprecated. Use gc_min_interval_ms */
3099
3100                 .procname       = "gc_min_interval",
3101                 .data           = &ip_rt_gc_min_interval,
3102                 .maxlen         = sizeof(int),
3103                 .mode           = 0644,
3104                 .proc_handler   = proc_dointvec_jiffies,
3105         },
3106         {
3107                 .procname       = "gc_min_interval_ms",
3108                 .data           = &ip_rt_gc_min_interval,
3109                 .maxlen         = sizeof(int),
3110                 .mode           = 0644,
3111                 .proc_handler   = proc_dointvec_ms_jiffies,
3112         },
3113         {
3114                 .procname       = "gc_timeout",
3115                 .data           = &ip_rt_gc_timeout,
3116                 .maxlen         = sizeof(int),
3117                 .mode           = 0644,
3118                 .proc_handler   = proc_dointvec_jiffies,
3119         },
3120         {
3121                 .procname       = "gc_interval",
3122                 .data           = &ip_rt_gc_interval,
3123                 .maxlen         = sizeof(int),
3124                 .mode           = 0644,
3125                 .proc_handler   = proc_dointvec_jiffies,
3126         },
3127         {
3128                 .procname       = "redirect_load",
3129                 .data           = &ip_rt_redirect_load,
3130                 .maxlen         = sizeof(int),
3131                 .mode           = 0644,
3132                 .proc_handler   = proc_dointvec,
3133         },
3134         {
3135                 .procname       = "redirect_number",
3136                 .data           = &ip_rt_redirect_number,
3137                 .maxlen         = sizeof(int),
3138                 .mode           = 0644,
3139                 .proc_handler   = proc_dointvec,
3140         },
3141         {
3142                 .procname       = "redirect_silence",
3143                 .data           = &ip_rt_redirect_silence,
3144                 .maxlen         = sizeof(int),
3145                 .mode           = 0644,
3146                 .proc_handler   = proc_dointvec,
3147         },
3148         {
3149                 .procname       = "error_cost",
3150                 .data           = &ip_rt_error_cost,
3151                 .maxlen         = sizeof(int),
3152                 .mode           = 0644,
3153                 .proc_handler   = proc_dointvec,
3154         },
3155         {
3156                 .procname       = "error_burst",
3157                 .data           = &ip_rt_error_burst,
3158                 .maxlen         = sizeof(int),
3159                 .mode           = 0644,
3160                 .proc_handler   = proc_dointvec,
3161         },
3162         {
3163                 .procname       = "gc_elasticity",
3164                 .data           = &ip_rt_gc_elasticity,
3165                 .maxlen         = sizeof(int),
3166                 .mode           = 0644,
3167                 .proc_handler   = proc_dointvec,
3168         },
3169         {
3170                 .procname       = "mtu_expires",
3171                 .data           = &ip_rt_mtu_expires,
3172                 .maxlen         = sizeof(int),
3173                 .mode           = 0644,
3174                 .proc_handler   = proc_dointvec_jiffies,
3175         },
3176         {
3177                 .procname       = "min_pmtu",
3178                 .data           = &ip_rt_min_pmtu,
3179                 .maxlen         = sizeof(int),
3180                 .mode           = 0644,
3181                 .proc_handler   = proc_dointvec,
3182         },
3183         {
3184                 .procname       = "min_adv_mss",
3185                 .data           = &ip_rt_min_advmss,
3186                 .maxlen         = sizeof(int),
3187                 .mode           = 0644,
3188                 .proc_handler   = proc_dointvec,
3189         },
3190         { }
3191 };
3192
3193 static struct ctl_table empty[1];
3194
3195 static struct ctl_table ipv4_skeleton[] =
3196 {
3197         { .procname = "route", 
3198           .mode = 0555, .child = ipv4_route_table},
3199         { .procname = "neigh", 
3200           .mode = 0555, .child = empty},
3201         { }
3202 };
3203
3204 static __net_initdata struct ctl_path ipv4_path[] = {
3205         { .procname = "net", },
3206         { .procname = "ipv4", },
3207         { },
3208 };
3209
3210 static struct ctl_table ipv4_route_flush_table[] = {
3211         {
3212                 .procname       = "flush",
3213                 .maxlen         = sizeof(int),
3214                 .mode           = 0200,
3215                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3216         },
3217         { },
3218 };
3219
3220 static __net_initdata struct ctl_path ipv4_route_path[] = {
3221         { .procname = "net", },
3222         { .procname = "ipv4", },
3223         { .procname = "route", },
3224         { },
3225 };
3226
3227 static __net_init int sysctl_route_net_init(struct net *net)
3228 {
3229         struct ctl_table *tbl;
3230
3231         tbl = ipv4_route_flush_table;
3232         if (!net_eq(net, &init_net)) {
3233                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3234                 if (tbl == NULL)
3235                         goto err_dup;
3236         }
3237         tbl[0].extra1 = net;
3238
3239         net->ipv4.route_hdr =
3240                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3241         if (net->ipv4.route_hdr == NULL)
3242                 goto err_reg;
3243         return 0;
3244
3245 err_reg:
3246         if (tbl != ipv4_route_flush_table)
3247                 kfree(tbl);
3248 err_dup:
3249         return -ENOMEM;
3250 }
3251
3252 static __net_exit void sysctl_route_net_exit(struct net *net)
3253 {
3254         struct ctl_table *tbl;
3255
3256         tbl = net->ipv4.route_hdr->ctl_table_arg;
3257         unregister_net_sysctl_table(net->ipv4.route_hdr);
3258         BUG_ON(tbl == ipv4_route_flush_table);
3259         kfree(tbl);
3260 }
3261
3262 static __net_initdata struct pernet_operations sysctl_route_ops = {
3263         .init = sysctl_route_net_init,
3264         .exit = sysctl_route_net_exit,
3265 };
3266 #endif
3267
3268 static __net_init int rt_genid_init(struct net *net)
3269 {
3270         get_random_bytes(&net->ipv4.rt_genid,
3271                          sizeof(net->ipv4.rt_genid));
3272         return 0;
3273 }
3274
3275 static __net_initdata struct pernet_operations rt_genid_ops = {
3276         .init = rt_genid_init,
3277 };
3278
3279
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Per-CPU accounting area.  ip_rt_init() allocates it with room for
 * 256 struct ip_rt_acct entries and panics if the allocation fails.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3283
3284 static __initdata unsigned long rhash_entries;
3285 static int __init set_rhash_entries(char *str)
3286 {
3287         if (!str)
3288                 return 0;
3289         rhash_entries = simple_strtoul(str, &str, 0);
3290         return 1;
3291 }
3292 __setup("rhash_entries=", set_rhash_entries);
3293
3294 int __init ip_rt_init(void)
3295 {
3296         int rc = 0;
3297
3298 #ifdef CONFIG_NET_CLS_ROUTE
3299         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3300         if (!ip_rt_acct)
3301                 panic("IP: failed to allocate ip_rt_acct\n");
3302 #endif
3303
3304         ipv4_dst_ops.kmem_cachep =
3305                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3306                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3307
3308         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3309
3310         rt_hash_table = (struct rt_hash_bucket *)
3311                 alloc_large_system_hash("IP route cache",
3312                                         sizeof(struct rt_hash_bucket),
3313                                         rhash_entries,
3314                                         (totalram_pages >= 128 * 1024) ?
3315                                         15 : 17,
3316                                         0,
3317                                         &rt_hash_log,
3318                                         &rt_hash_mask,
3319                                         rhash_entries ? 0 : 512 * 1024);
3320         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3321         rt_hash_lock_init();
3322
3323         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3324         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3325
3326         devinet_init();
3327         ip_fib_init();
3328
3329         /* All the timers, started at system startup tend
3330            to synchronize. Perturb it a bit.
3331          */
3332         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3333         expires_ljiffies = jiffies;
3334         schedule_delayed_work(&expires_work,
3335                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3336
3337         if (ip_rt_proc_init())
3338                 printk(KERN_ERR "Unable to create route proc files\n");
3339 #ifdef CONFIG_XFRM
3340         xfrm_init();
3341         xfrm4_init(ip_rt_max_size);
3342 #endif
3343         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3344
3345 #ifdef CONFIG_SYSCTL
3346         register_pernet_subsys(&sysctl_route_ops);
3347 #endif
3348         register_pernet_subsys(&rt_genid_ops);
3349         return rc;
3350 }
3351
#ifdef CONFIG_SYSCTL
/*
 * Register the static ipv4_skeleton directories under ipv4_path.
 *
 * This separate entry point exists only because of the current ipv4
 * initialisation order; once that order is cleaned up it can go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif /* CONFIG_SYSCTL */
3362
3363 EXPORT_SYMBOL(__ip_select_ident);
3364 EXPORT_SYMBOL(ip_route_input);
3365 EXPORT_SYMBOL(ip_route_output_key);