VFS: Log the fact that we've given ELOOP rather than creating a loop
[pandora-kernel.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <net/atmclip.h>
112 #include <net/secure_seq.h>
113
114 #define RT_FL_TOS(oldflp4) \
115     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
116
117 #define IP_MAX_MTU      0xFFF0
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
123 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly  = 9;
125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly       = HZ;
128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly    = 8;
130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly       = 256;
133 static int rt_chain_length_max __read_mostly    = 20;
134
135 /*
136  *      Interface to generic destination cache.
137  */
138
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
142 static void              ipv4_dst_destroy(struct dst_entry *dst);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void              ipv4_link_failure(struct sk_buff *skb);
145 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
146 static int rt_garbage_collect(struct dst_ops *ops);
147
/*
 * Installed as ipv4_dst_ops.ifdown.  The body is intentionally empty:
 * no per-route work is performed here.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	/* nothing to do */
}
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155         struct rtable *rt = (struct rtable *) dst;
156         struct inet_peer *peer;
157         u32 *p = NULL;
158
159         if (!rt->peer)
160                 rt_bind_peer(rt, rt->rt_dst, 1);
161
162         peer = rt->peer;
163         if (peer) {
164                 u32 *old_p = __DST_METRICS_PTR(old);
165                 unsigned long prev, new;
166
167                 p = peer->metrics;
168                 if (inet_metrics_new(peer))
169                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
170
171                 new = (unsigned long) p;
172                 prev = cmpxchg(&dst->_metrics, old, new);
173
174                 if (prev != old) {
175                         p = __DST_METRICS_PTR(prev);
176                         if (prev & DST_METRICS_READ_ONLY)
177                                 p = NULL;
178                 } else {
179                         if (rt->fi) {
180                                 fib_info_put(rt->fi);
181                                 rt->fi = NULL;
182                         }
183                 }
184         }
185         return p;
186 }
187
188 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
189
190 static struct dst_ops ipv4_dst_ops = {
191         .family =               AF_INET,
192         .protocol =             cpu_to_be16(ETH_P_IP),
193         .gc =                   rt_garbage_collect,
194         .check =                ipv4_dst_check,
195         .default_advmss =       ipv4_default_advmss,
196         .default_mtu =          ipv4_default_mtu,
197         .cow_metrics =          ipv4_cow_metrics,
198         .destroy =              ipv4_dst_destroy,
199         .ifdown =               ipv4_dst_ifdown,
200         .negative_advice =      ipv4_negative_advice,
201         .link_failure =         ipv4_link_failure,
202         .update_pmtu =          ip_rt_update_pmtu,
203         .local_out =            __ip_local_out,
204         .neigh_lookup =         ipv4_neigh_lookup,
205 };
206
207 #define ECN_OR_COST(class)      TC_PRIO_##class
208
209 const __u8 ip_tos2prio[16] = {
210         TC_PRIO_BESTEFFORT,
211         ECN_OR_COST(BESTEFFORT),
212         TC_PRIO_BESTEFFORT,
213         ECN_OR_COST(BESTEFFORT),
214         TC_PRIO_BULK,
215         ECN_OR_COST(BULK),
216         TC_PRIO_BULK,
217         ECN_OR_COST(BULK),
218         TC_PRIO_INTERACTIVE,
219         ECN_OR_COST(INTERACTIVE),
220         TC_PRIO_INTERACTIVE,
221         ECN_OR_COST(INTERACTIVE),
222         TC_PRIO_INTERACTIVE_BULK,
223         ECN_OR_COST(INTERACTIVE_BULK),
224         TC_PRIO_INTERACTIVE_BULK,
225         ECN_OR_COST(INTERACTIVE_BULK)
226 };
227
228
229 /*
230  * Route cache.
231  */
232
233 /* The locking scheme is rather straightforward:
234  *
235  * 1) Read-Copy Update protects the buckets of the central route hash.
236  * 2) Only writers remove entries, and they hold the lock
237  *    as they look at rtable reference counts.
238  * 3) Only readers acquire references to rtable entries,
239  *    they do so with atomic increments and with the
240  *    lock held.
241  */
242
243 struct rt_hash_bucket {
244         struct rtable __rcu     *chain;
245 };
246
247 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
248         defined(CONFIG_PROVE_LOCKING)
249 /*
250  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
251  * The size of this table is a power of two and depends on the number of CPUS.
252  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
253  */
254 #ifdef CONFIG_LOCKDEP
255 # define RT_HASH_LOCK_SZ        256
256 #else
257 # if NR_CPUS >= 32
258 #  define RT_HASH_LOCK_SZ       4096
259 # elif NR_CPUS >= 16
260 #  define RT_HASH_LOCK_SZ       2048
261 # elif NR_CPUS >= 8
262 #  define RT_HASH_LOCK_SZ       1024
263 # elif NR_CPUS >= 4
264 #  define RT_HASH_LOCK_SZ       512
265 # else
266 #  define RT_HASH_LOCK_SZ       256
267 # endif
268 #endif
269
270 static spinlock_t       *rt_hash_locks;
/*
 * Map a hash slot to the spinlock guarding it.  RT_HASH_LOCK_SZ is a power
 * of two, so masking picks a valid table entry.  The expansion is fully
 * parenthesized so that postfix operators applied to the macro result bind
 * to the address rather than to the array element.
 */
# define rt_hash_lock_addr(slot) (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])
272
273 static __init void rt_hash_lock_init(void)
274 {
275         int i;
276
277         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
278                         GFP_KERNEL);
279         if (!rt_hash_locks)
280                 panic("IP: failed to allocate rt_hash_locks\n");
281
282         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
283                 spin_lock_init(&rt_hash_locks[i]);
284 }
285 #else
286 # define rt_hash_lock_addr(slot) NULL
287
/* No lock table exists in this configuration, so there is nothing to set up. */
static inline void rt_hash_lock_init(void)
{
	/* nothing to do */
}
291 #endif
292
293 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
294 static unsigned                 rt_hash_mask __read_mostly;
295 static unsigned int             rt_hash_log  __read_mostly;
296
297 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
298 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
299
300 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
301                                    int genid)
302 {
303         return jhash_3words((__force u32)daddr, (__force u32)saddr,
304                             idx, genid)
305                 & rt_hash_mask;
306 }
307
308 static inline int rt_genid(struct net *net)
309 {
310         return atomic_read(&net->ipv4.rt_genid);
311 }
312
313 #ifdef CONFIG_PROC_FS
314 struct rt_cache_iter_state {
315         struct seq_net_private p;
316         int bucket;
317         int genid;
318 };
319
320 static struct rtable *rt_cache_get_first(struct seq_file *seq)
321 {
322         struct rt_cache_iter_state *st = seq->private;
323         struct rtable *r = NULL;
324
325         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
326                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
327                         continue;
328                 rcu_read_lock_bh();
329                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
330                 while (r) {
331                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
332                             r->rt_genid == st->genid)
333                                 return r;
334                         r = rcu_dereference_bh(r->dst.rt_next);
335                 }
336                 rcu_read_unlock_bh();
337         }
338         return r;
339 }
340
341 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
342                                           struct rtable *r)
343 {
344         struct rt_cache_iter_state *st = seq->private;
345
346         r = rcu_dereference_bh(r->dst.rt_next);
347         while (!r) {
348                 rcu_read_unlock_bh();
349                 do {
350                         if (--st->bucket < 0)
351                                 return NULL;
352                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
353                 rcu_read_lock_bh();
354                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
355         }
356         return r;
357 }
358
359 static struct rtable *rt_cache_get_next(struct seq_file *seq,
360                                         struct rtable *r)
361 {
362         struct rt_cache_iter_state *st = seq->private;
363         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
364                 if (dev_net(r->dst.dev) != seq_file_net(seq))
365                         continue;
366                 if (r->rt_genid == st->genid)
367                         break;
368         }
369         return r;
370 }
371
372 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
373 {
374         struct rtable *r = rt_cache_get_first(seq);
375
376         if (r)
377                 while (pos && (r = rt_cache_get_next(seq, r)))
378                         --pos;
379         return pos ? NULL : r;
380 }
381
382 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
383 {
384         struct rt_cache_iter_state *st = seq->private;
385         if (*pos)
386                 return rt_cache_get_idx(seq, *pos - 1);
387         st->genid = rt_genid(seq_file_net(seq));
388         return SEQ_START_TOKEN;
389 }
390
391 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
392 {
393         struct rtable *r;
394
395         if (v == SEQ_START_TOKEN)
396                 r = rt_cache_get_first(seq);
397         else
398                 r = rt_cache_get_next(seq, v);
399         ++*pos;
400         return r;
401 }
402
403 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
404 {
405         if (v && v != SEQ_START_TOKEN)
406                 rcu_read_unlock_bh();
407 }
408
409 static int rt_cache_seq_show(struct seq_file *seq, void *v)
410 {
411         if (v == SEQ_START_TOKEN)
412                 seq_printf(seq, "%-127s\n",
413                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
414                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
415                            "HHUptod\tSpecDst");
416         else {
417                 struct rtable *r = v;
418                 struct neighbour *n;
419                 int len;
420
421                 n = dst_get_neighbour(&r->dst);
422                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
423                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
424                         r->dst.dev ? r->dst.dev->name : "*",
425                         (__force u32)r->rt_dst,
426                         (__force u32)r->rt_gateway,
427                         r->rt_flags, atomic_read(&r->dst.__refcnt),
428                         r->dst.__use, 0, (__force u32)r->rt_src,
429                         dst_metric_advmss(&r->dst) + 40,
430                         dst_metric(&r->dst, RTAX_WINDOW),
431                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
432                               dst_metric(&r->dst, RTAX_RTTVAR)),
433                         r->rt_key_tos,
434                         -1,
435                         (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
436                         r->rt_spec_dst, &len);
437
438                 seq_printf(seq, "%*s\n", 127 - len, "");
439         }
440         return 0;
441 }
442
443 static const struct seq_operations rt_cache_seq_ops = {
444         .start  = rt_cache_seq_start,
445         .next   = rt_cache_seq_next,
446         .stop   = rt_cache_seq_stop,
447         .show   = rt_cache_seq_show,
448 };
449
450 static int rt_cache_seq_open(struct inode *inode, struct file *file)
451 {
452         return seq_open_net(inode, file, &rt_cache_seq_ops,
453                         sizeof(struct rt_cache_iter_state));
454 }
455
456 static const struct file_operations rt_cache_seq_fops = {
457         .owner   = THIS_MODULE,
458         .open    = rt_cache_seq_open,
459         .read    = seq_read,
460         .llseek  = seq_lseek,
461         .release = seq_release_net,
462 };
463
464
465 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
466 {
467         int cpu;
468
469         if (*pos == 0)
470                 return SEQ_START_TOKEN;
471
472         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
473                 if (!cpu_possible(cpu))
474                         continue;
475                 *pos = cpu+1;
476                 return &per_cpu(rt_cache_stat, cpu);
477         }
478         return NULL;
479 }
480
481 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
482 {
483         int cpu;
484
485         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
486                 if (!cpu_possible(cpu))
487                         continue;
488                 *pos = cpu+1;
489                 return &per_cpu(rt_cache_stat, cpu);
490         }
491         return NULL;
492
493 }
494
/* seq_file ->stop for the per-CPU statistics file; there is nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* nothing to do */
}
499
500 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
501 {
502         struct rt_cache_stat *st = v;
503
504         if (v == SEQ_START_TOKEN) {
505                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
506                 return 0;
507         }
508
509         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
510                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
511                    dst_entries_get_slow(&ipv4_dst_ops),
512                    st->in_hit,
513                    st->in_slow_tot,
514                    st->in_slow_mc,
515                    st->in_no_route,
516                    st->in_brd,
517                    st->in_martian_dst,
518                    st->in_martian_src,
519
520                    st->out_hit,
521                    st->out_slow_tot,
522                    st->out_slow_mc,
523
524                    st->gc_total,
525                    st->gc_ignored,
526                    st->gc_goal_miss,
527                    st->gc_dst_overflow,
528                    st->in_hlist_search,
529                    st->out_hlist_search
530                 );
531         return 0;
532 }
533
534 static const struct seq_operations rt_cpu_seq_ops = {
535         .start  = rt_cpu_seq_start,
536         .next   = rt_cpu_seq_next,
537         .stop   = rt_cpu_seq_stop,
538         .show   = rt_cpu_seq_show,
539 };
540
541
542 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
543 {
544         return seq_open(file, &rt_cpu_seq_ops);
545 }
546
547 static const struct file_operations rt_cpu_seq_fops = {
548         .owner   = THIS_MODULE,
549         .open    = rt_cpu_seq_open,
550         .read    = seq_read,
551         .llseek  = seq_lseek,
552         .release = seq_release,
553 };
554
555 #ifdef CONFIG_IP_ROUTE_CLASSID
556 static int rt_acct_proc_show(struct seq_file *m, void *v)
557 {
558         struct ip_rt_acct *dst, *src;
559         unsigned int i, j;
560
561         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
562         if (!dst)
563                 return -ENOMEM;
564
565         for_each_possible_cpu(i) {
566                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
567                 for (j = 0; j < 256; j++) {
568                         dst[j].o_bytes   += src[j].o_bytes;
569                         dst[j].o_packets += src[j].o_packets;
570                         dst[j].i_bytes   += src[j].i_bytes;
571                         dst[j].i_packets += src[j].i_packets;
572                 }
573         }
574
575         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
576         kfree(dst);
577         return 0;
578 }
579
580 static int rt_acct_proc_open(struct inode *inode, struct file *file)
581 {
582         return single_open(file, rt_acct_proc_show, NULL);
583 }
584
585 static const struct file_operations rt_acct_proc_fops = {
586         .owner          = THIS_MODULE,
587         .open           = rt_acct_proc_open,
588         .read           = seq_read,
589         .llseek         = seq_lseek,
590         .release        = single_release,
591 };
592 #endif
593
594 static int __net_init ip_rt_do_proc_init(struct net *net)
595 {
596         struct proc_dir_entry *pde;
597
598         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
599                         &rt_cache_seq_fops);
600         if (!pde)
601                 goto err1;
602
603         pde = proc_create("rt_cache", S_IRUGO,
604                           net->proc_net_stat, &rt_cpu_seq_fops);
605         if (!pde)
606                 goto err2;
607
608 #ifdef CONFIG_IP_ROUTE_CLASSID
609         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
610         if (!pde)
611                 goto err3;
612 #endif
613         return 0;
614
615 #ifdef CONFIG_IP_ROUTE_CLASSID
616 err3:
617         remove_proc_entry("rt_cache", net->proc_net_stat);
618 #endif
619 err2:
620         remove_proc_entry("rt_cache", net->proc_net);
621 err1:
622         return -ENOMEM;
623 }
624
625 static void __net_exit ip_rt_do_proc_exit(struct net *net)
626 {
627         remove_proc_entry("rt_cache", net->proc_net_stat);
628         remove_proc_entry("rt_cache", net->proc_net);
629 #ifdef CONFIG_IP_ROUTE_CLASSID
630         remove_proc_entry("rt_acct", net->proc_net);
631 #endif
632 }
633
634 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
635         .init = ip_rt_do_proc_init,
636         .exit = ip_rt_do_proc_exit,
637 };
638
639 static int __init ip_rt_proc_init(void)
640 {
641         return register_pernet_subsys(&ip_rt_proc_ops);
642 }
643
644 #else
/* Stub used when CONFIG_PROC_FS is off: nothing to register, report success. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
649 #endif /* CONFIG_PROC_FS */
650
651 static inline void rt_free(struct rtable *rt)
652 {
653         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
654 }
655
656 static inline void rt_drop(struct rtable *rt)
657 {
658         ip_rt_put(rt);
659         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
660 }
661
662 static inline int rt_fast_clean(struct rtable *rth)
663 {
664         /* Kill broadcast/multicast entries very aggresively, if they
665            collide in hash table with more useful entries */
666         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
667                 rt_is_input_route(rth) && rth->dst.rt_next;
668 }
669
670 static inline int rt_valuable(struct rtable *rth)
671 {
672         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
673                 (rth->peer && rth->peer->pmtu_expires);
674 }
675
676 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
677 {
678         unsigned long age;
679         int ret = 0;
680
681         if (atomic_read(&rth->dst.__refcnt))
682                 goto out;
683
684         age = jiffies - rth->dst.lastuse;
685         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
686             (age <= tmo2 && rt_valuable(rth)))
687                 goto out;
688         ret = 1;
689 out:    return ret;
690 }
691
692 /* Bits of score are:
693  * 31: very valuable
694  * 30: not quite useless
695  * 29..0: usage counter
696  */
697 static inline u32 rt_score(struct rtable *rt)
698 {
699         u32 score = jiffies - rt->dst.lastuse;
700
701         score = ~score & ~(3<<30);
702
703         if (rt_valuable(rt))
704                 score |= (1<<31);
705
706         if (rt_is_output_route(rt) ||
707             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
708                 score |= (1<<30);
709
710         return score;
711 }
712
713 static inline bool rt_caching(const struct net *net)
714 {
715         return net->ipv4.current_rt_cache_rebuild_count <=
716                 net->ipv4.sysctl_rt_cache_rebuild_count;
717 }
718
719 static inline bool compare_hash_inputs(const struct rtable *rt1,
720                                        const struct rtable *rt2)
721 {
722         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
723                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
724                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
725 }
726
727 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
728 {
729         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
730                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
731                 (rt1->rt_mark ^ rt2->rt_mark) |
732                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
733                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
734                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
735 }
736
737 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
738 {
739         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
740 }
741
742 static inline int rt_is_expired(struct rtable *rth)
743 {
744         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
745 }
746
747 /*
748  * Perform a full scan of hash table and free all entries.
749  * Can be called by a softirq or a process.
750  * In the latter case, we want to reschedule if necessary
751  */
752 static void rt_do_flush(struct net *net, int process_context)
753 {
754         unsigned int i;
755         struct rtable *rth, *next;
756
757         for (i = 0; i <= rt_hash_mask; i++) {
758                 struct rtable __rcu **pprev;
759                 struct rtable *list;
760
761                 if (process_context && need_resched())
762                         cond_resched();
763                 rth = rcu_access_pointer(rt_hash_table[i].chain);
764                 if (!rth)
765                         continue;
766
767                 spin_lock_bh(rt_hash_lock_addr(i));
768
769                 list = NULL;
770                 pprev = &rt_hash_table[i].chain;
771                 rth = rcu_dereference_protected(*pprev,
772                         lockdep_is_held(rt_hash_lock_addr(i)));
773
774                 while (rth) {
775                         next = rcu_dereference_protected(rth->dst.rt_next,
776                                 lockdep_is_held(rt_hash_lock_addr(i)));
777
778                         if (!net ||
779                             net_eq(dev_net(rth->dst.dev), net)) {
780                                 rcu_assign_pointer(*pprev, next);
781                                 rcu_assign_pointer(rth->dst.rt_next, list);
782                                 list = rth;
783                         } else {
784                                 pprev = &rth->dst.rt_next;
785                         }
786                         rth = next;
787                 }
788
789                 spin_unlock_bh(rt_hash_lock_addr(i));
790
791                 for (; list; list = next) {
792                         next = rcu_dereference_protected(list->dst.rt_next, 1);
793                         rt_free(list);
794                 }
795         }
796 }
797
798 /*
799  * While freeing expired entries, we compute average chain length
800  * and standard deviation, using fixed-point arithmetic.
801  * This to have an estimation of rt_chain_length_max
802  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
803  * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
804  */
805
806 #define FRACT_BITS 3
807 #define ONE (1UL << FRACT_BITS)
808
809 /*
810  * Given a hash chain and an item in this hash chain,
811  * find if a previous entry has the same hash_inputs
812  * (but differs on tos, mark or oif)
813  * Returns 0 if an alias is found.
814  * Returns ONE if rth has no alias before itself.
815  */
816 static int has_noalias(const struct rtable *head, const struct rtable *rth)
817 {
818         const struct rtable *aux = head;
819
820         while (aux != rth) {
821                 if (compare_hash_inputs(aux, rth))
822                         return 0;
823                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
824         }
825         return ONE;
826 }
827
828 /*
829  * Perturbation of rt_genid by a small quantity [1..256]
830  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
831  * many times (2^24) without giving recent rt_genid.
832  * Jenkins hash is strong enough that little changes of rt_genid are OK.
833  */
834 static void rt_cache_invalidate(struct net *net)
835 {
836         unsigned char shuffle;
837
838         get_random_bytes(&shuffle, sizeof(shuffle));
839         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
840 }
841
842 /*
843  * delay < 0  : invalidate cache (fast : entries will be deleted later)
844  * delay >= 0 : invalidate & flush cache (can be long)
845  */
/*
 * Invalidate @net's route cache.  With @delay < 0 only the genid is
 * changed; with @delay >= 0 the hash table is also flushed right away.
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	if (delay < 0)
		return;
	rt_do_flush(net, !in_softirq());
}
852
853 /* Flush previous cache invalidated entries from the cache */
/*
 * Run rt_do_flush() for @net without changing the genid.  Rescheduling is
 * allowed only when not called from softirq context.
 */
void rt_cache_flush_batch(struct net *net)
{
	int process_context = !in_softirq();

	rt_do_flush(net, process_context);
}
858
859 static void rt_emergency_hash_rebuild(struct net *net)
860 {
861         if (net_ratelimit())
862                 printk(KERN_WARNING "Route hash chain too long!\n");
863         rt_cache_invalidate(net);
864 }
865
866 /*
867    Short description of GC goals.
868
869    We want to build an algorithm, which will keep routing cache
870    at some equilibrium point, when number of aged off entries
871    is kept approximately equal to newly generated ones.
872
873    Current expiration strength is variable "expire".
874    We try to adjust it dynamically, so that if networking
875    is idle expires is large enough to keep enough of warm entries,
876    and when load increases it reduces to limit cache size.
877  */
878
879 static int rt_garbage_collect(struct dst_ops *ops)
880 {
881         static unsigned long expire = RT_GC_TIMEOUT;
882         static unsigned long last_gc;
883         static int rover;
884         static int equilibrium;
885         struct rtable *rth;
886         struct rtable __rcu **rthp;
887         unsigned long now = jiffies;
888         int goal;
889         int entries = dst_entries_get_fast(&ipv4_dst_ops);
890
891         /*
892          * Garbage collection is pretty expensive,
893          * do not make it too frequently.
894          */
895
896         RT_CACHE_STAT_INC(gc_total);
897
898         if (now - last_gc < ip_rt_gc_min_interval &&
899             entries < ip_rt_max_size) {
900                 RT_CACHE_STAT_INC(gc_ignored);
901                 goto out;
902         }
903
904         entries = dst_entries_get_slow(&ipv4_dst_ops);
905         /* Calculate number of entries, which we want to expire now. */
906         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
907         if (goal <= 0) {
908                 if (equilibrium < ipv4_dst_ops.gc_thresh)
909                         equilibrium = ipv4_dst_ops.gc_thresh;
910                 goal = entries - equilibrium;
911                 if (goal > 0) {
912                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
913                         goal = entries - equilibrium;
914                 }
915         } else {
916                 /* We are in dangerous area. Try to reduce cache really
917                  * aggressively.
918                  */
919                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
920                 equilibrium = entries - goal;
921         }
922
923         if (now - last_gc >= ip_rt_gc_min_interval)
924                 last_gc = now;
925
926         if (goal <= 0) {
927                 equilibrium += goal;
928                 goto work_done;
929         }
930
931         do {
932                 int i, k;
933
934                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
935                         unsigned long tmo = expire;
936
937                         k = (k + 1) & rt_hash_mask;
938                         rthp = &rt_hash_table[k].chain;
939                         spin_lock_bh(rt_hash_lock_addr(k));
940                         while ((rth = rcu_dereference_protected(*rthp,
941                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
942                                 if (!rt_is_expired(rth) &&
943                                         !rt_may_expire(rth, tmo, expire)) {
944                                         tmo >>= 1;
945                                         rthp = &rth->dst.rt_next;
946                                         continue;
947                                 }
948                                 *rthp = rth->dst.rt_next;
949                                 rt_free(rth);
950                                 goal--;
951                         }
952                         spin_unlock_bh(rt_hash_lock_addr(k));
953                         if (goal <= 0)
954                                 break;
955                 }
956                 rover = k;
957
958                 if (goal <= 0)
959                         goto work_done;
960
961                 /* Goal is not achieved. We stop process if:
962
963                    - if expire reduced to zero. Otherwise, expire is halfed.
964                    - if table is not full.
965                    - if we are called from interrupt.
966                    - jiffies check is just fallback/debug loop breaker.
967                      We will not spin here for long time in any case.
968                  */
969
970                 RT_CACHE_STAT_INC(gc_goal_miss);
971
972                 if (expire == 0)
973                         break;
974
975                 expire >>= 1;
976
977                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
978                         goto out;
979         } while (!in_softirq() && time_before_eq(jiffies, now));
980
981         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
982                 goto out;
983         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
984                 goto out;
985         if (net_ratelimit())
986                 printk(KERN_WARNING "dst cache overflow\n");
987         RT_CACHE_STAT_INC(gc_dst_overflow);
988         return 1;
989
990 work_done:
991         expire += ip_rt_gc_min_interval;
992         if (expire > ip_rt_gc_timeout ||
993             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
994             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
995                 expire = ip_rt_gc_timeout;
996 out:    return 0;
997 }
998
999 /*
1000  * Returns number of entries in a hash chain that have different hash_inputs
1001  */
1002 static int slow_chain_length(const struct rtable *head)
1003 {
1004         int length = 0;
1005         const struct rtable *rth = head;
1006
1007         while (rth) {
1008                 length += has_noalias(head, rth);
1009                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1010         }
1011         return length >> FRACT_BITS;
1012 }
1013
1014 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1015 {
1016         struct neigh_table *tbl = &arp_tbl;
1017         static const __be32 inaddr_any = 0;
1018         struct net_device *dev = dst->dev;
1019         const __be32 *pkey = daddr;
1020         struct neighbour *n;
1021
1022 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1023         if (dev->type == ARPHRD_ATM)
1024                 tbl = clip_tbl_hook;
1025 #endif
1026         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1027                 pkey = &inaddr_any;
1028
1029         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1030         if (n)
1031                 return n;
1032         return neigh_create(tbl, pkey, dev);
1033 }
1034
1035 static int rt_bind_neighbour(struct rtable *rt)
1036 {
1037         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1038         if (IS_ERR(n))
1039                 return PTR_ERR(n);
1040         dst_set_neighbour(&rt->dst, n);
1041
1042         return 0;
1043 }
1044
1045 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1046                                      struct sk_buff *skb, int ifindex)
1047 {
1048         struct rtable   *rth, *cand;
1049         struct rtable __rcu **rthp, **candp;
1050         unsigned long   now;
1051         u32             min_score;
1052         int             chain_length;
1053         int attempts = !in_softirq();
1054
1055 restart:
1056         chain_length = 0;
1057         min_score = ~(u32)0;
1058         cand = NULL;
1059         candp = NULL;
1060         now = jiffies;
1061
1062         if (!rt_caching(dev_net(rt->dst.dev))) {
1063                 /*
1064                  * If we're not caching, just tell the caller we
1065                  * were successful and don't touch the route.  The
1066                  * caller hold the sole reference to the cache entry, and
1067                  * it will be released when the caller is done with it.
1068                  * If we drop it here, the callers have no way to resolve routes
1069                  * when we're not caching.  Instead, just point *rp at rt, so
1070                  * the caller gets a single use out of the route
1071                  * Note that we do rt_free on this new route entry, so that
1072                  * once its refcount hits zero, we are still able to reap it
1073                  * (Thanks Alexey)
1074                  * Note: To avoid expensive rcu stuff for this uncached dst,
1075                  * we set DST_NOCACHE so that dst_release() can free dst without
1076                  * waiting a grace period.
1077                  */
1078
1079                 rt->dst.flags |= DST_NOCACHE;
1080                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1081                         int err = rt_bind_neighbour(rt);
1082                         if (err) {
1083                                 if (net_ratelimit())
1084                                         printk(KERN_WARNING
1085                                             "Neighbour table failure & not caching routes.\n");
1086                                 ip_rt_put(rt);
1087                                 return ERR_PTR(err);
1088                         }
1089                 }
1090
1091                 goto skip_hashing;
1092         }
1093
1094         rthp = &rt_hash_table[hash].chain;
1095
1096         spin_lock_bh(rt_hash_lock_addr(hash));
1097         while ((rth = rcu_dereference_protected(*rthp,
1098                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1099                 if (rt_is_expired(rth)) {
1100                         *rthp = rth->dst.rt_next;
1101                         rt_free(rth);
1102                         continue;
1103                 }
1104                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1105                         /* Put it first */
1106                         *rthp = rth->dst.rt_next;
1107                         /*
1108                          * Since lookup is lockfree, the deletion
1109                          * must be visible to another weakly ordered CPU before
1110                          * the insertion at the start of the hash chain.
1111                          */
1112                         rcu_assign_pointer(rth->dst.rt_next,
1113                                            rt_hash_table[hash].chain);
1114                         /*
1115                          * Since lookup is lockfree, the update writes
1116                          * must be ordered for consistency on SMP.
1117                          */
1118                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1119
1120                         dst_use(&rth->dst, now);
1121                         spin_unlock_bh(rt_hash_lock_addr(hash));
1122
1123                         rt_drop(rt);
1124                         if (skb)
1125                                 skb_dst_set(skb, &rth->dst);
1126                         return rth;
1127                 }
1128
1129                 if (!atomic_read(&rth->dst.__refcnt)) {
1130                         u32 score = rt_score(rth);
1131
1132                         if (score <= min_score) {
1133                                 cand = rth;
1134                                 candp = rthp;
1135                                 min_score = score;
1136                         }
1137                 }
1138
1139                 chain_length++;
1140
1141                 rthp = &rth->dst.rt_next;
1142         }
1143
1144         if (cand) {
1145                 /* ip_rt_gc_elasticity used to be average length of chain
1146                  * length, when exceeded gc becomes really aggressive.
1147                  *
1148                  * The second limit is less certain. At the moment it allows
1149                  * only 2 entries per bucket. We will see.
1150                  */
1151                 if (chain_length > ip_rt_gc_elasticity) {
1152                         *candp = cand->dst.rt_next;
1153                         rt_free(cand);
1154                 }
1155         } else {
1156                 if (chain_length > rt_chain_length_max &&
1157                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1158                         struct net *net = dev_net(rt->dst.dev);
1159                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1160                         if (!rt_caching(net)) {
1161                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1162                                         rt->dst.dev->name, num);
1163                         }
1164                         rt_emergency_hash_rebuild(net);
1165                         spin_unlock_bh(rt_hash_lock_addr(hash));
1166
1167                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1168                                         ifindex, rt_genid(net));
1169                         goto restart;
1170                 }
1171         }
1172
1173         /* Try to bind route to arp only if it is output
1174            route or unicast forwarding path.
1175          */
1176         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1177                 int err = rt_bind_neighbour(rt);
1178                 if (err) {
1179                         spin_unlock_bh(rt_hash_lock_addr(hash));
1180
1181                         if (err != -ENOBUFS) {
1182                                 rt_drop(rt);
1183                                 return ERR_PTR(err);
1184                         }
1185
1186                         /* Neighbour tables are full and nothing
1187                            can be released. Try to shrink route cache,
1188                            it is most likely it holds some neighbour records.
1189                          */
1190                         if (attempts-- > 0) {
1191                                 int saved_elasticity = ip_rt_gc_elasticity;
1192                                 int saved_int = ip_rt_gc_min_interval;
1193                                 ip_rt_gc_elasticity     = 1;
1194                                 ip_rt_gc_min_interval   = 0;
1195                                 rt_garbage_collect(&ipv4_dst_ops);
1196                                 ip_rt_gc_min_interval   = saved_int;
1197                                 ip_rt_gc_elasticity     = saved_elasticity;
1198                                 goto restart;
1199                         }
1200
1201                         if (net_ratelimit())
1202                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1203                         rt_drop(rt);
1204                         return ERR_PTR(-ENOBUFS);
1205                 }
1206         }
1207
1208         rt->dst.rt_next = rt_hash_table[hash].chain;
1209
1210         /*
1211          * Since lookup is lockfree, we must make sure
1212          * previous writes to rt are committed to memory
1213          * before making rt visible to other CPUS.
1214          */
1215         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1216
1217         spin_unlock_bh(rt_hash_lock_addr(hash));
1218
1219 skip_hashing:
1220         if (skb)
1221                 skb_dst_set(skb, &rt->dst);
1222         return rt;
1223 }
1224
1225 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1226
1227 static u32 rt_peer_genid(void)
1228 {
1229         return atomic_read(&__rt_peer_genid);
1230 }
1231
1232 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1233 {
1234         struct inet_peer *peer;
1235
1236         peer = inet_getpeer_v4(daddr, create);
1237
1238         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1239                 inet_putpeer(peer);
1240         else
1241                 rt->rt_peer_genid = rt_peer_genid();
1242 }
1243
1244 /*
1245  * Peer allocation may fail only in serious out-of-memory conditions.  However
1246  * we still can generate some output.
1247  * Random ID selection looks a bit dangerous because we have no chances to
1248  * select ID being unique in a reasonable period of time.
1249  * But broken packet identifier may be better than no packet at all.
1250  */
1251 static void ip_select_fb_ident(struct iphdr *iph)
1252 {
1253         static DEFINE_SPINLOCK(ip_fb_id_lock);
1254         static u32 ip_fallback_id;
1255         u32 salt;
1256
1257         spin_lock_bh(&ip_fb_id_lock);
1258         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1259         iph->id = htons(salt & 0xFFFF);
1260         ip_fallback_id = salt;
1261         spin_unlock_bh(&ip_fb_id_lock);
1262 }
1263
1264 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1265 {
1266         struct rtable *rt = (struct rtable *) dst;
1267
1268         if (rt) {
1269                 if (rt->peer == NULL)
1270                         rt_bind_peer(rt, rt->rt_dst, 1);
1271
1272                 /* If peer is attached to destination, it is never detached,
1273                    so that we need not to grab a lock to dereference it.
1274                  */
1275                 if (rt->peer) {
1276                         iph->id = htons(inet_getid(rt->peer, more));
1277                         return;
1278                 }
1279         } else
1280                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1281                        __builtin_return_address(0));
1282
1283         ip_select_fb_ident(iph);
1284 }
1285 EXPORT_SYMBOL(__ip_select_ident);
1286
1287 static void rt_del(unsigned hash, struct rtable *rt)
1288 {
1289         struct rtable __rcu **rthp;
1290         struct rtable *aux;
1291
1292         rthp = &rt_hash_table[hash].chain;
1293         spin_lock_bh(rt_hash_lock_addr(hash));
1294         ip_rt_put(rt);
1295         while ((aux = rcu_dereference_protected(*rthp,
1296                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1297                 if (aux == rt || rt_is_expired(aux)) {
1298                         *rthp = aux->dst.rt_next;
1299                         rt_free(aux);
1300                         continue;
1301                 }
1302                 rthp = &aux->dst.rt_next;
1303         }
1304         spin_unlock_bh(rt_hash_lock_addr(hash));
1305 }
1306
1307 /* called in rcu_read_lock() section */
1308 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1309                     __be32 saddr, struct net_device *dev)
1310 {
1311         int s, i;
1312         struct in_device *in_dev = __in_dev_get_rcu(dev);
1313         struct rtable *rt;
1314         __be32 skeys[2] = { saddr, 0 };
1315         int    ikeys[2] = { dev->ifindex, 0 };
1316         struct flowi4 fl4;
1317         struct inet_peer *peer;
1318         struct net *net;
1319
1320         if (!in_dev)
1321                 return;
1322
1323         net = dev_net(dev);
1324         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1325             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1326             ipv4_is_zeronet(new_gw))
1327                 goto reject_redirect;
1328
1329         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1330                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1331                         goto reject_redirect;
1332                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1333                         goto reject_redirect;
1334         } else {
1335                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1336                         goto reject_redirect;
1337         }
1338
1339         memset(&fl4, 0, sizeof(fl4));
1340         fl4.daddr = daddr;
1341         for (s = 0; s < 2; s++) {
1342                 for (i = 0; i < 2; i++) {
1343                         fl4.flowi4_oif = ikeys[i];
1344                         fl4.saddr = skeys[s];
1345                         rt = __ip_route_output_key(net, &fl4);
1346                         if (IS_ERR(rt))
1347                                 continue;
1348
1349                         if (rt->dst.error || rt->dst.dev != dev ||
1350                             rt->rt_gateway != old_gw) {
1351                                 ip_rt_put(rt);
1352                                 continue;
1353                         }
1354
1355                         if (!rt->peer)
1356                                 rt_bind_peer(rt, rt->rt_dst, 1);
1357
1358                         peer = rt->peer;
1359                         if (peer) {
1360                                 peer->redirect_learned.a4 = new_gw;
1361                                 atomic_inc(&__rt_peer_genid);
1362                         }
1363
1364                         ip_rt_put(rt);
1365                         return;
1366                 }
1367         }
1368         return;
1369
1370 reject_redirect:
1371 #ifdef CONFIG_IP_ROUTE_VERBOSE
1372         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1373                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1374                         "  Advised path = %pI4 -> %pI4\n",
1375                        &old_gw, dev->name, &new_gw,
1376                        &saddr, &daddr);
1377 #endif
1378         ;
1379 }
1380
1381 static bool peer_pmtu_expired(struct inet_peer *peer)
1382 {
1383         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1384
1385         return orig &&
1386                time_after_eq(jiffies, orig) &&
1387                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1388 }
1389
1390 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1391 {
1392         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1393
1394         return orig &&
1395                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1396 }
1397
1398 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1399 {
1400         struct rtable *rt = (struct rtable *)dst;
1401         struct dst_entry *ret = dst;
1402
1403         if (rt) {
1404                 if (dst->obsolete > 0) {
1405                         ip_rt_put(rt);
1406                         ret = NULL;
1407                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1408                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1409                                                 rt->rt_oif,
1410                                                 rt_genid(dev_net(dst->dev)));
1411                         rt_del(hash, rt);
1412                         ret = NULL;
1413                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1414                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1415                 }
1416         }
1417         return ret;
1418 }
1419
1420 /*
1421  * Algorithm:
1422  *      1. The first ip_rt_redirect_number redirects are sent
1423  *         with exponential backoff, then we stop sending them at all,
1424  *         assuming that the host ignores our redirects.
1425  *      2. If we did not see packets requiring redirects
1426  *         during ip_rt_redirect_silence, we assume that the host
1427  *         forgot redirected route and start to send redirects again.
1428  *
1429  * This algorithm is much cheaper and more intelligent than dumb load limiting
1430  * in icmp.c.
1431  *
1432  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1433  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1434  */
1435
1436 void ip_rt_send_redirect(struct sk_buff *skb)
1437 {
1438         struct rtable *rt = skb_rtable(skb);
1439         struct in_device *in_dev;
1440         struct inet_peer *peer;
1441         int log_martians;
1442
1443         rcu_read_lock();
1444         in_dev = __in_dev_get_rcu(rt->dst.dev);
1445         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1446                 rcu_read_unlock();
1447                 return;
1448         }
1449         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1450         rcu_read_unlock();
1451
1452         if (!rt->peer)
1453                 rt_bind_peer(rt, rt->rt_dst, 1);
1454         peer = rt->peer;
1455         if (!peer) {
1456                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1457                 return;
1458         }
1459
1460         /* No redirected packets during ip_rt_redirect_silence;
1461          * reset the algorithm.
1462          */
1463         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1464                 peer->rate_tokens = 0;
1465
1466         /* Too many ignored redirects; do not send anything
1467          * set dst.rate_last to the last seen redirected packet.
1468          */
1469         if (peer->rate_tokens >= ip_rt_redirect_number) {
1470                 peer->rate_last = jiffies;
1471                 return;
1472         }
1473
1474         /* Check for load limit; set rate_last to the latest sent
1475          * redirect.
1476          */
1477         if (peer->rate_tokens == 0 ||
1478             time_after(jiffies,
1479                        (peer->rate_last +
1480                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1481                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1482                 peer->rate_last = jiffies;
1483                 ++peer->rate_tokens;
1484 #ifdef CONFIG_IP_ROUTE_VERBOSE
1485                 if (log_martians &&
1486                     peer->rate_tokens == ip_rt_redirect_number &&
1487                     net_ratelimit())
1488                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1489                                &ip_hdr(skb)->saddr, rt->rt_iif,
1490                                 &rt->rt_dst, &rt->rt_gateway);
1491 #endif
1492         }
1493 }
1494
1495 static int ip_error(struct sk_buff *skb)
1496 {
1497         struct rtable *rt = skb_rtable(skb);
1498         struct inet_peer *peer;
1499         unsigned long now;
1500         bool send;
1501         int code;
1502
1503         switch (rt->dst.error) {
1504         case EINVAL:
1505         default:
1506                 goto out;
1507         case EHOSTUNREACH:
1508                 code = ICMP_HOST_UNREACH;
1509                 break;
1510         case ENETUNREACH:
1511                 code = ICMP_NET_UNREACH;
1512                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1513                                 IPSTATS_MIB_INNOROUTES);
1514                 break;
1515         case EACCES:
1516                 code = ICMP_PKT_FILTERED;
1517                 break;
1518         }
1519
1520         if (!rt->peer)
1521                 rt_bind_peer(rt, rt->rt_dst, 1);
1522         peer = rt->peer;
1523
1524         send = true;
1525         if (peer) {
1526                 now = jiffies;
1527                 peer->rate_tokens += now - peer->rate_last;
1528                 if (peer->rate_tokens > ip_rt_error_burst)
1529                         peer->rate_tokens = ip_rt_error_burst;
1530                 peer->rate_last = now;
1531                 if (peer->rate_tokens >= ip_rt_error_cost)
1532                         peer->rate_tokens -= ip_rt_error_cost;
1533                 else
1534                         send = false;
1535         }
1536         if (send)
1537                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1538
1539 out:    kfree_skb(skb);
1540         return 0;
1541 }
1542
/*
 *	MTU plateau values, in descending order, used to guess a path MTU.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; plateau < end; plateau++) {
		if (*plateau < old_mtu)
			return *plateau;
	}

	return 68;
}
1560
1561 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1562                                  unsigned short new_mtu,
1563                                  struct net_device *dev)
1564 {
1565         unsigned short old_mtu = ntohs(iph->tot_len);
1566         unsigned short est_mtu = 0;
1567         struct inet_peer *peer;
1568
1569         peer = inet_getpeer_v4(iph->daddr, 1);
1570         if (peer) {
1571                 unsigned short mtu = new_mtu;
1572
1573                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1574                         /* BSD 4.2 derived systems incorrectly adjust
1575                          * tot_len by the IP header length, and report
1576                          * a zero MTU in the ICMP message.
1577                          */
1578                         if (mtu == 0 &&
1579                             old_mtu >= 68 + (iph->ihl << 2))
1580                                 old_mtu -= iph->ihl << 2;
1581                         mtu = guess_mtu(old_mtu);
1582                 }
1583
1584                 if (mtu < ip_rt_min_pmtu)
1585                         mtu = ip_rt_min_pmtu;
1586                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1587                         unsigned long pmtu_expires;
1588
1589                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1590                         if (!pmtu_expires)
1591                                 pmtu_expires = 1UL;
1592
1593                         est_mtu = mtu;
1594                         peer->pmtu_learned = mtu;
1595                         peer->pmtu_expires = pmtu_expires;
1596                         atomic_inc(&__rt_peer_genid);
1597                 }
1598
1599                 inet_putpeer(peer);
1600         }
1601         return est_mtu ? : new_mtu;
1602 }
1603
1604 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1605 {
1606         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1607
1608         if (!expires)
1609                 return;
1610         if (time_before(jiffies, expires)) {
1611                 u32 orig_dst_mtu = dst_mtu(dst);
1612                 if (peer->pmtu_learned < orig_dst_mtu) {
1613                         if (!peer->pmtu_orig)
1614                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1615                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1616                 }
1617         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1618                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1619 }
1620
1621 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1622 {
1623         struct rtable *rt = (struct rtable *) dst;
1624         struct inet_peer *peer;
1625
1626         dst_confirm(dst);
1627
1628         if (!rt->peer)
1629                 rt_bind_peer(rt, rt->rt_dst, 1);
1630         peer = rt->peer;
1631         if (peer) {
1632                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1633
1634                 if (mtu < ip_rt_min_pmtu)
1635                         mtu = ip_rt_min_pmtu;
1636                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1637
1638                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1639                         if (!pmtu_expires)
1640                                 pmtu_expires = 1UL;
1641
1642                         peer->pmtu_learned = mtu;
1643                         peer->pmtu_expires = pmtu_expires;
1644
1645                         atomic_inc(&__rt_peer_genid);
1646                         rt->rt_peer_genid = rt_peer_genid();
1647                 }
1648                 check_peer_pmtu(dst, peer);
1649         }
1650 }
1651
1652 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1653 {
1654         struct rtable *rt = (struct rtable *) dst;
1655         __be32 orig_gw = rt->rt_gateway;
1656         struct neighbour *n, *old_n;
1657
1658         dst_confirm(&rt->dst);
1659
1660         rt->rt_gateway = peer->redirect_learned.a4;
1661
1662         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1663         if (IS_ERR(n))
1664                 return PTR_ERR(n);
1665         old_n = xchg(&rt->dst._neighbour, n);
1666         if (old_n)
1667                 neigh_release(old_n);
1668         if (!n || !(n->nud_state & NUD_VALID)) {
1669                 if (n)
1670                         neigh_event_send(n, NULL);
1671                 rt->rt_gateway = orig_gw;
1672                 return -EAGAIN;
1673         } else {
1674                 rt->rt_flags |= RTCF_REDIRECTED;
1675                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1676         }
1677         return 0;
1678 }
1679
1680 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1681 {
1682         struct rtable *rt = (struct rtable *) dst;
1683
1684         if (rt_is_expired(rt))
1685                 return NULL;
1686         if (rt->rt_peer_genid != rt_peer_genid()) {
1687                 struct inet_peer *peer;
1688
1689                 if (!rt->peer)
1690                         rt_bind_peer(rt, rt->rt_dst, 0);
1691
1692                 peer = rt->peer;
1693                 if (peer) {
1694                         check_peer_pmtu(dst, peer);
1695
1696                         if (peer->redirect_learned.a4 &&
1697                             peer->redirect_learned.a4 != rt->rt_gateway) {
1698                                 if (check_peer_redir(dst, peer))
1699                                         return NULL;
1700                         }
1701                 }
1702
1703                 rt->rt_peer_genid = rt_peer_genid();
1704         }
1705         return dst;
1706 }
1707
1708 static void ipv4_dst_destroy(struct dst_entry *dst)
1709 {
1710         struct rtable *rt = (struct rtable *) dst;
1711         struct inet_peer *peer = rt->peer;
1712
1713         if (rt->fi) {
1714                 fib_info_put(rt->fi);
1715                 rt->fi = NULL;
1716         }
1717         if (peer) {
1718                 rt->peer = NULL;
1719                 inet_putpeer(peer);
1720         }
1721 }
1722
1723
1724 static void ipv4_link_failure(struct sk_buff *skb)
1725 {
1726         struct rtable *rt;
1727
1728         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1729
1730         rt = skb_rtable(skb);
1731         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1732                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1733 }
1734
1735 static int ip_rt_bug(struct sk_buff *skb)
1736 {
1737         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1738                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1739                 skb->dev ? skb->dev->name : "?");
1740         kfree_skb(skb);
1741         WARN_ON(1);
1742         return 0;
1743 }
1744
1745 /*
1746    We do not cache source address of outgoing interface,
1747    because it is used only by IP RR, TS and SRR options,
1748    so that it out of fast path.
1749
1750    BTW remember: "addr" is allowed to be not aligned
1751    in IP options!
1752  */
1753
1754 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1755 {
1756         __be32 src;
1757
1758         if (rt_is_output_route(rt))
1759                 src = ip_hdr(skb)->saddr;
1760         else {
1761                 struct fib_result res;
1762                 struct flowi4 fl4;
1763                 struct iphdr *iph;
1764
1765                 iph = ip_hdr(skb);
1766
1767                 memset(&fl4, 0, sizeof(fl4));
1768                 fl4.daddr = iph->daddr;
1769                 fl4.saddr = iph->saddr;
1770                 fl4.flowi4_tos = RT_TOS(iph->tos);
1771                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1772                 fl4.flowi4_iif = skb->dev->ifindex;
1773                 fl4.flowi4_mark = skb->mark;
1774
1775                 rcu_read_lock();
1776                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1777                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1778                 else
1779                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1780                                         RT_SCOPE_UNIVERSE);
1781                 rcu_read_unlock();
1782         }
1783         memcpy(addr, &src, 4);
1784 }
1785
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Fill in the halves of the route's tclassid that are still zero from
 * @tag.  The low and high 16 bits are handled independently; a half that
 * already holds a value is left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;

	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1795
1796 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1797 {
1798         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1799
1800         if (advmss == 0) {
1801                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1802                                ip_rt_min_advmss);
1803                 if (advmss > 65535 - 40)
1804                         advmss = 65535 - 40;
1805         }
1806         return advmss;
1807 }
1808
1809 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1810 {
1811         unsigned int mtu = dst->dev->mtu;
1812
1813         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1814                 const struct rtable *rt = (const struct rtable *) dst;
1815
1816                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1817                         mtu = 576;
1818         }
1819
1820         if (mtu > IP_MAX_MTU)
1821                 mtu = IP_MAX_MTU;
1822
1823         return mtu;
1824 }
1825
1826 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1827                             struct fib_info *fi)
1828 {
1829         struct inet_peer *peer;
1830         int create = 0;
1831
1832         /* If a peer entry exists for this destination, we must hook
1833          * it up in order to get at cached metrics.
1834          */
1835         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1836                 create = 1;
1837
1838         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1839         if (peer) {
1840                 rt->rt_peer_genid = rt_peer_genid();
1841                 if (inet_metrics_new(peer))
1842                         memcpy(peer->metrics, fi->fib_metrics,
1843                                sizeof(u32) * RTAX_MAX);
1844                 dst_init_metrics(&rt->dst, peer->metrics, false);
1845
1846                 check_peer_pmtu(&rt->dst, peer);
1847                 if (peer->redirect_learned.a4 &&
1848                     peer->redirect_learned.a4 != rt->rt_gateway) {
1849                         rt->rt_gateway = peer->redirect_learned.a4;
1850                         rt->rt_flags |= RTCF_REDIRECTED;
1851                 }
1852         } else {
1853                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1854                         rt->fi = fi;
1855                         atomic_inc(&fi->fib_clntref);
1856                 }
1857                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1858         }
1859 }
1860
1861 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1862                            const struct fib_result *res,
1863                            struct fib_info *fi, u16 type, u32 itag)
1864 {
1865         struct dst_entry *dst = &rt->dst;
1866
1867         if (fi) {
1868                 if (FIB_RES_GW(*res) &&
1869                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1870                         rt->rt_gateway = FIB_RES_GW(*res);
1871                 rt_init_metrics(rt, fl4, fi);
1872 #ifdef CONFIG_IP_ROUTE_CLASSID
1873                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1874 #endif
1875         }
1876
1877         if (dst_mtu(dst) > IP_MAX_MTU)
1878                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1879         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1880                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1881
1882 #ifdef CONFIG_IP_ROUTE_CLASSID
1883 #ifdef CONFIG_IP_MULTIPLE_TABLES
1884         set_class_tag(rt, fib_rules_tclass(res));
1885 #endif
1886         set_class_tag(rt, itag);
1887 #endif
1888 }
1889
1890 static struct rtable *rt_dst_alloc(struct net_device *dev,
1891                                    bool nopolicy, bool noxfrm)
1892 {
1893         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1894                          DST_HOST |
1895                          (nopolicy ? DST_NOPOLICY : 0) |
1896                          (noxfrm ? DST_NOXFRM : 0));
1897 }
1898
1899 /* called in rcu_read_lock() section */
1900 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1901                                 u8 tos, struct net_device *dev, int our)
1902 {
1903         unsigned int hash;
1904         struct rtable *rth;
1905         __be32 spec_dst;
1906         struct in_device *in_dev = __in_dev_get_rcu(dev);
1907         u32 itag = 0;
1908         int err;
1909
1910         /* Primary sanity checks. */
1911
1912         if (in_dev == NULL)
1913                 return -EINVAL;
1914
1915         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1916             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1917                 goto e_inval;
1918
1919         if (ipv4_is_zeronet(saddr)) {
1920                 if (!ipv4_is_local_multicast(daddr))
1921                         goto e_inval;
1922                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1923         } else {
1924                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1925                                           &itag);
1926                 if (err < 0)
1927                         goto e_err;
1928         }
1929         rth = rt_dst_alloc(init_net.loopback_dev,
1930                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1931         if (!rth)
1932                 goto e_nobufs;
1933
1934 #ifdef CONFIG_IP_ROUTE_CLASSID
1935         rth->dst.tclassid = itag;
1936 #endif
1937         rth->dst.output = ip_rt_bug;
1938
1939         rth->rt_key_dst = daddr;
1940         rth->rt_key_src = saddr;
1941         rth->rt_genid   = rt_genid(dev_net(dev));
1942         rth->rt_flags   = RTCF_MULTICAST;
1943         rth->rt_type    = RTN_MULTICAST;
1944         rth->rt_key_tos = tos;
1945         rth->rt_dst     = daddr;
1946         rth->rt_src     = saddr;
1947         rth->rt_route_iif = dev->ifindex;
1948         rth->rt_iif     = dev->ifindex;
1949         rth->rt_oif     = 0;
1950         rth->rt_mark    = skb->mark;
1951         rth->rt_gateway = daddr;
1952         rth->rt_spec_dst= spec_dst;
1953         rth->rt_peer_genid = 0;
1954         rth->peer = NULL;
1955         rth->fi = NULL;
1956         if (our) {
1957                 rth->dst.input= ip_local_deliver;
1958                 rth->rt_flags |= RTCF_LOCAL;
1959         }
1960
1961 #ifdef CONFIG_IP_MROUTE
1962         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1963                 rth->dst.input = ip_mr_input;
1964 #endif
1965         RT_CACHE_STAT_INC(in_slow_mc);
1966
1967         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1968         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1969         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1970
1971 e_nobufs:
1972         return -ENOBUFS;
1973 e_inval:
1974         return -EINVAL;
1975 e_err:
1976         return err;
1977 }
1978
1979
1980 static void ip_handle_martian_source(struct net_device *dev,
1981                                      struct in_device *in_dev,
1982                                      struct sk_buff *skb,
1983                                      __be32 daddr,
1984                                      __be32 saddr)
1985 {
1986         RT_CACHE_STAT_INC(in_martian_src);
1987 #ifdef CONFIG_IP_ROUTE_VERBOSE
1988         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1989                 /*
1990                  *      RFC1812 recommendation, if source is martian,
1991                  *      the only hint is MAC header.
1992                  */
1993                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1994                         &daddr, &saddr, dev->name);
1995                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1996                         int i;
1997                         const unsigned char *p = skb_mac_header(skb);
1998                         printk(KERN_WARNING "ll header: ");
1999                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2000                                 printk("%02x", *p);
2001                                 if (i < (dev->hard_header_len - 1))
2002                                         printk(":");
2003                         }
2004                         printk("\n");
2005                 }
2006         }
2007 #endif
2008 }
2009
2010 /* called in rcu_read_lock() section */
2011 static int __mkroute_input(struct sk_buff *skb,
2012                            const struct fib_result *res,
2013                            struct in_device *in_dev,
2014                            __be32 daddr, __be32 saddr, u32 tos,
2015                            struct rtable **result)
2016 {
2017         struct rtable *rth;
2018         int err;
2019         struct in_device *out_dev;
2020         unsigned int flags = 0;
2021         __be32 spec_dst;
2022         u32 itag;
2023
2024         /* get a working reference to the output device */
2025         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2026         if (out_dev == NULL) {
2027                 if (net_ratelimit())
2028                         printk(KERN_CRIT "Bug in ip_route_input" \
2029                                "_slow(). Please, report\n");
2030                 return -EINVAL;
2031         }
2032
2033
2034         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2035                                   in_dev->dev, &spec_dst, &itag);
2036         if (err < 0) {
2037                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2038                                          saddr);
2039
2040                 goto cleanup;
2041         }
2042
2043         if (err)
2044                 flags |= RTCF_DIRECTSRC;
2045
2046         if (out_dev == in_dev && err &&
2047             (IN_DEV_SHARED_MEDIA(out_dev) ||
2048              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2049                 flags |= RTCF_DOREDIRECT;
2050
2051         if (skb->protocol != htons(ETH_P_IP)) {
2052                 /* Not IP (i.e. ARP). Do not create route, if it is
2053                  * invalid for proxy arp. DNAT routes are always valid.
2054                  *
2055                  * Proxy arp feature have been extended to allow, ARP
2056                  * replies back to the same interface, to support
2057                  * Private VLAN switch technologies. See arp.c.
2058                  */
2059                 if (out_dev == in_dev &&
2060                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2061                         err = -EINVAL;
2062                         goto cleanup;
2063                 }
2064         }
2065
2066         rth = rt_dst_alloc(out_dev->dev,
2067                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2068                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2069         if (!rth) {
2070                 err = -ENOBUFS;
2071                 goto cleanup;
2072         }
2073
2074         rth->rt_key_dst = daddr;
2075         rth->rt_key_src = saddr;
2076         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2077         rth->rt_flags = flags;
2078         rth->rt_type = res->type;
2079         rth->rt_key_tos = tos;
2080         rth->rt_dst     = daddr;
2081         rth->rt_src     = saddr;
2082         rth->rt_route_iif = in_dev->dev->ifindex;
2083         rth->rt_iif     = in_dev->dev->ifindex;
2084         rth->rt_oif     = 0;
2085         rth->rt_mark    = skb->mark;
2086         rth->rt_gateway = daddr;
2087         rth->rt_spec_dst= spec_dst;
2088         rth->rt_peer_genid = 0;
2089         rth->peer = NULL;
2090         rth->fi = NULL;
2091
2092         rth->dst.input = ip_forward;
2093         rth->dst.output = ip_output;
2094
2095         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2096
2097         *result = rth;
2098         err = 0;
2099  cleanup:
2100         return err;
2101 }
2102
2103 static int ip_mkroute_input(struct sk_buff *skb,
2104                             struct fib_result *res,
2105                             const struct flowi4 *fl4,
2106                             struct in_device *in_dev,
2107                             __be32 daddr, __be32 saddr, u32 tos)
2108 {
2109         struct rtable* rth = NULL;
2110         int err;
2111         unsigned hash;
2112
2113 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2114         if (res->fi && res->fi->fib_nhs > 1)
2115                 fib_select_multipath(res);
2116 #endif
2117
2118         /* create a routing cache entry */
2119         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2120         if (err)
2121                 return err;
2122
2123         /* put it into the cache */
2124         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2125                        rt_genid(dev_net(rth->dst.dev)));
2126         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2127         if (IS_ERR(rth))
2128                 return PTR_ERR(rth);
2129         return 0;
2130 }
2131
2132 /*
2133  *      NOTE. We drop all the packets that has local source
2134  *      addresses, because every properly looped back packet
2135  *      must have correct destination already attached by output routine.
2136  *
2137  *      Such approach solves two big problems:
2138  *      1. Not simplex devices are handled properly.
2139  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2140  *      called with rcu_read_lock()
2141  */
2142
2143 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2144                                u8 tos, struct net_device *dev)
2145 {
2146         struct fib_result res;
2147         struct in_device *in_dev = __in_dev_get_rcu(dev);
2148         struct flowi4   fl4;
2149         unsigned        flags = 0;
2150         u32             itag = 0;
2151         struct rtable * rth;
2152         unsigned        hash;
2153         __be32          spec_dst;
2154         int             err = -EINVAL;
2155         struct net    * net = dev_net(dev);
2156
2157         /* IP on this device is disabled. */
2158
2159         if (!in_dev)
2160                 goto out;
2161
2162         /* Check for the most weird martians, which can be not detected
2163            by fib_lookup.
2164          */
2165
2166         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2167             ipv4_is_loopback(saddr))
2168                 goto martian_source;
2169
2170         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2171                 goto brd_input;
2172
2173         /* Accept zero addresses only to limited broadcast;
2174          * I even do not know to fix it or not. Waiting for complains :-)
2175          */
2176         if (ipv4_is_zeronet(saddr))
2177                 goto martian_source;
2178
2179         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2180                 goto martian_destination;
2181
2182         /*
2183          *      Now we are ready to route packet.
2184          */
2185         fl4.flowi4_oif = 0;
2186         fl4.flowi4_iif = dev->ifindex;
2187         fl4.flowi4_mark = skb->mark;
2188         fl4.flowi4_tos = tos;
2189         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2190         fl4.daddr = daddr;
2191         fl4.saddr = saddr;
2192         err = fib_lookup(net, &fl4, &res);
2193         if (err != 0) {
2194                 if (!IN_DEV_FORWARD(in_dev))
2195                         goto e_hostunreach;
2196                 goto no_route;
2197         }
2198
2199         RT_CACHE_STAT_INC(in_slow_tot);
2200
2201         if (res.type == RTN_BROADCAST)
2202                 goto brd_input;
2203
2204         if (res.type == RTN_LOCAL) {
2205                 err = fib_validate_source(skb, saddr, daddr, tos,
2206                                           net->loopback_dev->ifindex,
2207                                           dev, &spec_dst, &itag);
2208                 if (err < 0)
2209                         goto martian_source_keep_err;
2210                 if (err)
2211                         flags |= RTCF_DIRECTSRC;
2212                 spec_dst = daddr;
2213                 goto local_input;
2214         }
2215
2216         if (!IN_DEV_FORWARD(in_dev))
2217                 goto e_hostunreach;
2218         if (res.type != RTN_UNICAST)
2219                 goto martian_destination;
2220
2221         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2222 out:    return err;
2223
2224 brd_input:
2225         if (skb->protocol != htons(ETH_P_IP))
2226                 goto e_inval;
2227
2228         if (ipv4_is_zeronet(saddr))
2229                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2230         else {
2231                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2232                                           &itag);
2233                 if (err < 0)
2234                         goto martian_source_keep_err;
2235                 if (err)
2236                         flags |= RTCF_DIRECTSRC;
2237         }
2238         flags |= RTCF_BROADCAST;
2239         res.type = RTN_BROADCAST;
2240         RT_CACHE_STAT_INC(in_brd);
2241
2242 local_input:
2243         rth = rt_dst_alloc(net->loopback_dev,
2244                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2245         if (!rth)
2246                 goto e_nobufs;
2247
2248         rth->dst.input= ip_local_deliver;
2249         rth->dst.output= ip_rt_bug;
2250 #ifdef CONFIG_IP_ROUTE_CLASSID
2251         rth->dst.tclassid = itag;
2252 #endif
2253
2254         rth->rt_key_dst = daddr;
2255         rth->rt_key_src = saddr;
2256         rth->rt_genid = rt_genid(net);
2257         rth->rt_flags   = flags|RTCF_LOCAL;
2258         rth->rt_type    = res.type;
2259         rth->rt_key_tos = tos;
2260         rth->rt_dst     = daddr;
2261         rth->rt_src     = saddr;
2262 #ifdef CONFIG_IP_ROUTE_CLASSID
2263         rth->dst.tclassid = itag;
2264 #endif
2265         rth->rt_route_iif = dev->ifindex;
2266         rth->rt_iif     = dev->ifindex;
2267         rth->rt_oif     = 0;
2268         rth->rt_mark    = skb->mark;
2269         rth->rt_gateway = daddr;
2270         rth->rt_spec_dst= spec_dst;
2271         rth->rt_peer_genid = 0;
2272         rth->peer = NULL;
2273         rth->fi = NULL;
2274         if (res.type == RTN_UNREACHABLE) {
2275                 rth->dst.input= ip_error;
2276                 rth->dst.error= -err;
2277                 rth->rt_flags   &= ~RTCF_LOCAL;
2278         }
2279         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2280         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2281         err = 0;
2282         if (IS_ERR(rth))
2283                 err = PTR_ERR(rth);
2284         goto out;
2285
2286 no_route:
2287         RT_CACHE_STAT_INC(in_no_route);
2288         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2289         res.type = RTN_UNREACHABLE;
2290         if (err == -ESRCH)
2291                 err = -ENETUNREACH;
2292         goto local_input;
2293
2294         /*
2295          *      Do not cache martian addresses: they should be logged (RFC1812)
2296          */
2297 martian_destination:
2298         RT_CACHE_STAT_INC(in_martian_dst);
2299 #ifdef CONFIG_IP_ROUTE_VERBOSE
2300         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2301                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2302                         &daddr, &saddr, dev->name);
2303 #endif
2304
2305 e_hostunreach:
2306         err = -EHOSTUNREACH;
2307         goto out;
2308
2309 e_inval:
2310         err = -EINVAL;
2311         goto out;
2312
2313 e_nobufs:
2314         err = -ENOBUFS;
2315         goto out;
2316
2317 martian_source:
2318         err = -EINVAL;
2319 martian_source_keep_err:
2320         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2321         goto out;
2322 }
2323
2324 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2325                            u8 tos, struct net_device *dev, bool noref)
2326 {
2327         struct rtable * rth;
2328         unsigned        hash;
2329         int iif = dev->ifindex;
2330         struct net *net;
2331         int res;
2332
2333         net = dev_net(dev);
2334
2335         rcu_read_lock();
2336
2337         if (!rt_caching(net))
2338                 goto skip_cache;
2339
2340         tos &= IPTOS_RT_MASK;
2341         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2342
2343         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2344              rth = rcu_dereference(rth->dst.rt_next)) {
2345                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2346                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2347                      (rth->rt_route_iif ^ iif) |
2348                      (rth->rt_key_tos ^ tos)) == 0 &&
2349                     rth->rt_mark == skb->mark &&
2350                     net_eq(dev_net(rth->dst.dev), net) &&
2351                     !rt_is_expired(rth)) {
2352                         if (noref) {
2353                                 dst_use_noref(&rth->dst, jiffies);
2354                                 skb_dst_set_noref(skb, &rth->dst);
2355                         } else {
2356                                 dst_use(&rth->dst, jiffies);
2357                                 skb_dst_set(skb, &rth->dst);
2358                         }
2359                         RT_CACHE_STAT_INC(in_hit);
2360                         rcu_read_unlock();
2361                         return 0;
2362                 }
2363                 RT_CACHE_STAT_INC(in_hlist_search);
2364         }
2365
2366 skip_cache:
2367         /* Multicast recognition logic is moved from route cache to here.
2368            The problem was that too many Ethernet cards have broken/missing
2369            hardware multicast filters :-( As result the host on multicasting
2370            network acquires a lot of useless route cache entries, sort of
2371            SDR messages from all the world. Now we try to get rid of them.
2372            Really, provided software IP multicast filter is organized
2373            reasonably (at least, hashed), it does not result in a slowdown
2374            comparing with route cache reject entries.
2375            Note, that multicast routers are not affected, because
2376            route cache entry is created eventually.
2377          */
2378         if (ipv4_is_multicast(daddr)) {
2379                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2380
2381                 if (in_dev) {
2382                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2383                                                   ip_hdr(skb)->protocol);
2384                         if (our
2385 #ifdef CONFIG_IP_MROUTE
2386                                 ||
2387                             (!ipv4_is_local_multicast(daddr) &&
2388                              IN_DEV_MFORWARD(in_dev))
2389 #endif
2390                            ) {
2391                                 int res = ip_route_input_mc(skb, daddr, saddr,
2392                                                             tos, dev, our);
2393                                 rcu_read_unlock();
2394                                 return res;
2395                         }
2396                 }
2397                 rcu_read_unlock();
2398                 return -EINVAL;
2399         }
2400         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2401         rcu_read_unlock();
2402         return res;
2403 }
2404 EXPORT_SYMBOL(ip_route_input_common);
2405
2406 /* called with rcu_read_lock() */
2407 static struct rtable *__mkroute_output(const struct fib_result *res,
2408                                        const struct flowi4 *fl4,
2409                                        __be32 orig_daddr, __be32 orig_saddr,
2410                                        int orig_oif, struct net_device *dev_out,
2411                                        unsigned int flags)
2412 {
2413         struct fib_info *fi = res->fi;
2414         u32 tos = RT_FL_TOS(fl4);
2415         struct in_device *in_dev;
2416         u16 type = res->type;
2417         struct rtable *rth;
2418
2419         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2420                 return ERR_PTR(-EINVAL);
2421
2422         if (ipv4_is_lbcast(fl4->daddr))
2423                 type = RTN_BROADCAST;
2424         else if (ipv4_is_multicast(fl4->daddr))
2425                 type = RTN_MULTICAST;
2426         else if (ipv4_is_zeronet(fl4->daddr))
2427                 return ERR_PTR(-EINVAL);
2428
2429         if (dev_out->flags & IFF_LOOPBACK)
2430                 flags |= RTCF_LOCAL;
2431
2432         in_dev = __in_dev_get_rcu(dev_out);
2433         if (!in_dev)
2434                 return ERR_PTR(-EINVAL);
2435
2436         if (type == RTN_BROADCAST) {
2437                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2438                 fi = NULL;
2439         } else if (type == RTN_MULTICAST) {
2440                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2441                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2442                                      fl4->flowi4_proto))
2443                         flags &= ~RTCF_LOCAL;
2444                 /* If multicast route do not exist use
2445                  * default one, but do not gateway in this case.
2446                  * Yes, it is hack.
2447                  */
2448                 if (fi && res->prefixlen < 4)
2449                         fi = NULL;
2450         }
2451
2452         rth = rt_dst_alloc(dev_out,
2453                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2454                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2455         if (!rth)
2456                 return ERR_PTR(-ENOBUFS);
2457
2458         rth->dst.output = ip_output;
2459
2460         rth->rt_key_dst = orig_daddr;
2461         rth->rt_key_src = orig_saddr;
2462         rth->rt_genid = rt_genid(dev_net(dev_out));
2463         rth->rt_flags   = flags;
2464         rth->rt_type    = type;
2465         rth->rt_key_tos = tos;
2466         rth->rt_dst     = fl4->daddr;
2467         rth->rt_src     = fl4->saddr;
2468         rth->rt_route_iif = 0;
2469         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2470         rth->rt_oif     = orig_oif;
2471         rth->rt_mark    = fl4->flowi4_mark;
2472         rth->rt_gateway = fl4->daddr;
2473         rth->rt_spec_dst= fl4->saddr;
2474         rth->rt_peer_genid = 0;
2475         rth->peer = NULL;
2476         rth->fi = NULL;
2477
2478         RT_CACHE_STAT_INC(out_slow_tot);
2479
2480         if (flags & RTCF_LOCAL) {
2481                 rth->dst.input = ip_local_deliver;
2482                 rth->rt_spec_dst = fl4->daddr;
2483         }
2484         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2485                 rth->rt_spec_dst = fl4->saddr;
2486                 if (flags & RTCF_LOCAL &&
2487                     !(dev_out->flags & IFF_LOOPBACK)) {
2488                         rth->dst.output = ip_mc_output;
2489                         RT_CACHE_STAT_INC(out_slow_mc);
2490                 }
2491 #ifdef CONFIG_IP_MROUTE
2492                 if (type == RTN_MULTICAST) {
2493                         if (IN_DEV_MFORWARD(in_dev) &&
2494                             !ipv4_is_local_multicast(fl4->daddr)) {
2495                                 rth->dst.input = ip_mr_input;
2496                                 rth->dst.output = ip_mc_output;
2497                         }
2498                 }
2499 #endif
2500         }
2501
2502         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2503
2504         return rth;
2505 }
2506
2507 /*
2508  * Major route resolver routine.
2509  * called with rcu_read_lock();
2510  */
2511
2512 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2513 {
2514         struct net_device *dev_out = NULL;
2515         u32 tos = RT_FL_TOS(fl4);
2516         unsigned int flags = 0;
2517         struct fib_result res;
2518         struct rtable *rth;
2519         __be32 orig_daddr;
2520         __be32 orig_saddr;
2521         int orig_oif;
2522
2523         res.fi          = NULL;
2524 #ifdef CONFIG_IP_MULTIPLE_TABLES
2525         res.r           = NULL;
2526 #endif
2527
2528         orig_daddr = fl4->daddr;
2529         orig_saddr = fl4->saddr;
2530         orig_oif = fl4->flowi4_oif;
2531
2532         fl4->flowi4_iif = net->loopback_dev->ifindex;
2533         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2534         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2535                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2536
2537         rcu_read_lock();
2538         if (fl4->saddr) {
2539                 rth = ERR_PTR(-EINVAL);
2540                 if (ipv4_is_multicast(fl4->saddr) ||
2541                     ipv4_is_lbcast(fl4->saddr) ||
2542                     ipv4_is_zeronet(fl4->saddr))
2543                         goto out;
2544
2545                 /* I removed check for oif == dev_out->oif here.
2546                    It was wrong for two reasons:
2547                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2548                       is assigned to multiple interfaces.
2549                    2. Moreover, we are allowed to send packets with saddr
2550                       of another iface. --ANK
2551                  */
2552
2553                 if (fl4->flowi4_oif == 0 &&
2554                     (ipv4_is_multicast(fl4->daddr) ||
2555                      ipv4_is_lbcast(fl4->daddr))) {
2556                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2557                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2558                         if (dev_out == NULL)
2559                                 goto out;
2560
2561                         /* Special hack: user can direct multicasts
2562                            and limited broadcast via necessary interface
2563                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2564                            This hack is not just for fun, it allows
2565                            vic,vat and friends to work.
2566                            They bind socket to loopback, set ttl to zero
2567                            and expect that it will work.
2568                            From the viewpoint of routing cache they are broken,
2569                            because we are not allowed to build multicast path
2570                            with loopback source addr (look, routing cache
2571                            cannot know, that ttl is zero, so that packet
2572                            will not leave this host and route is valid).
2573                            Luckily, this hack is good workaround.
2574                          */
2575
2576                         fl4->flowi4_oif = dev_out->ifindex;
2577                         goto make_route;
2578                 }
2579
2580                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2581                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2582                         if (!__ip_dev_find(net, fl4->saddr, false))
2583                                 goto out;
2584                 }
2585         }
2586
2587
2588         if (fl4->flowi4_oif) {
2589                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2590                 rth = ERR_PTR(-ENODEV);
2591                 if (dev_out == NULL)
2592                         goto out;
2593
2594                 /* RACE: Check return value of inet_select_addr instead. */
2595                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2596                         rth = ERR_PTR(-ENETUNREACH);
2597                         goto out;
2598                 }
2599                 if (ipv4_is_local_multicast(fl4->daddr) ||
2600                     ipv4_is_lbcast(fl4->daddr)) {
2601                         if (!fl4->saddr)
2602                                 fl4->saddr = inet_select_addr(dev_out, 0,
2603                                                               RT_SCOPE_LINK);
2604                         goto make_route;
2605                 }
2606                 if (fl4->saddr) {
2607                         if (ipv4_is_multicast(fl4->daddr))
2608                                 fl4->saddr = inet_select_addr(dev_out, 0,
2609                                                               fl4->flowi4_scope);
2610                         else if (!fl4->daddr)
2611                                 fl4->saddr = inet_select_addr(dev_out, 0,
2612                                                               RT_SCOPE_HOST);
2613                 }
2614         }
2615
2616         if (!fl4->daddr) {
2617                 fl4->daddr = fl4->saddr;
2618                 if (!fl4->daddr)
2619                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2620                 dev_out = net->loopback_dev;
2621                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2622                 res.type = RTN_LOCAL;
2623                 flags |= RTCF_LOCAL;
2624                 goto make_route;
2625         }
2626
2627         if (fib_lookup(net, fl4, &res)) {
2628                 res.fi = NULL;
2629                 if (fl4->flowi4_oif) {
2630                         /* Apparently, routing tables are wrong. Assume,
2631                            that the destination is on link.
2632
2633                            WHY? DW.
2634                            Because we are allowed to send to iface
2635                            even if it has NO routes and NO assigned
2636                            addresses. When oif is specified, routing
2637                            tables are looked up with only one purpose:
2638                            to catch if destination is gatewayed, rather than
2639                            direct. Moreover, if MSG_DONTROUTE is set,
2640                            we send packet, ignoring both routing tables
2641                            and ifaddr state. --ANK
2642
2643
2644                            We could make it even if oif is unknown,
2645                            likely IPv6, but we do not.
2646                          */
2647
2648                         if (fl4->saddr == 0)
2649                                 fl4->saddr = inet_select_addr(dev_out, 0,
2650                                                               RT_SCOPE_LINK);
2651                         res.type = RTN_UNICAST;
2652                         goto make_route;
2653                 }
2654                 rth = ERR_PTR(-ENETUNREACH);
2655                 goto out;
2656         }
2657
2658         if (res.type == RTN_LOCAL) {
2659                 if (!fl4->saddr) {
2660                         if (res.fi->fib_prefsrc)
2661                                 fl4->saddr = res.fi->fib_prefsrc;
2662                         else
2663                                 fl4->saddr = fl4->daddr;
2664                 }
2665                 dev_out = net->loopback_dev;
2666                 fl4->flowi4_oif = dev_out->ifindex;
2667                 res.fi = NULL;
2668                 flags |= RTCF_LOCAL;
2669                 goto make_route;
2670         }
2671
2672 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2673         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2674                 fib_select_multipath(&res);
2675         else
2676 #endif
2677         if (!res.prefixlen &&
2678             res.table->tb_num_default > 1 &&
2679             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2680                 fib_select_default(&res);
2681
2682         if (!fl4->saddr)
2683                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2684
2685         dev_out = FIB_RES_DEV(res);
2686         fl4->flowi4_oif = dev_out->ifindex;
2687
2688
2689 make_route:
2690         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2691                                dev_out, flags);
2692         if (!IS_ERR(rth)) {
2693                 unsigned int hash;
2694
2695                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2696                                rt_genid(dev_net(dev_out)));
2697                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2698         }
2699
2700 out:
2701         rcu_read_unlock();
2702         return rth;
2703 }
2704
2705 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2706 {
2707         struct rtable *rth;
2708         unsigned int hash;
2709
2710         if (!rt_caching(net))
2711                 goto slow_output;
2712
2713         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2714
2715         rcu_read_lock_bh();
2716         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2717                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2718                 if (rth->rt_key_dst == flp4->daddr &&
2719                     rth->rt_key_src == flp4->saddr &&
2720                     rt_is_output_route(rth) &&
2721                     rth->rt_oif == flp4->flowi4_oif &&
2722                     rth->rt_mark == flp4->flowi4_mark &&
2723                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2724                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2725                     net_eq(dev_net(rth->dst.dev), net) &&
2726                     !rt_is_expired(rth)) {
2727                         dst_use(&rth->dst, jiffies);
2728                         RT_CACHE_STAT_INC(out_hit);
2729                         rcu_read_unlock_bh();
2730                         if (!flp4->saddr)
2731                                 flp4->saddr = rth->rt_src;
2732                         if (!flp4->daddr)
2733                                 flp4->daddr = rth->rt_dst;
2734                         return rth;
2735                 }
2736                 RT_CACHE_STAT_INC(out_hlist_search);
2737         }
2738         rcu_read_unlock_bh();
2739
2740 slow_output:
2741         return ip_route_output_slow(net, flp4);
2742 }
2743 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2744
2745 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2746 {
2747         return NULL;
2748 }
2749
/* .default_mtu hook of ipv4_dst_blackhole_ops: always reports 0. */
static unsigned int
ipv4_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}
2754
2755 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2756 {
2757 }
2758
2759 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2760                                           unsigned long old)
2761 {
2762         return NULL;
2763 }
2764
2765 static struct dst_ops ipv4_dst_blackhole_ops = {
2766         .family                 =       AF_INET,
2767         .protocol               =       cpu_to_be16(ETH_P_IP),
2768         .destroy                =       ipv4_dst_destroy,
2769         .check                  =       ipv4_blackhole_dst_check,
2770         .default_mtu            =       ipv4_blackhole_default_mtu,
2771         .default_advmss         =       ipv4_default_advmss,
2772         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2773         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2774         .neigh_lookup           =       ipv4_neigh_lookup,
2775 };
2776
2777 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2778 {
2779         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2780         struct rtable *ort = (struct rtable *) dst_orig;
2781
2782         if (rt) {
2783                 struct dst_entry *new = &rt->dst;
2784
2785                 new->__use = 1;
2786                 new->input = dst_discard;
2787                 new->output = dst_discard;
2788                 dst_copy_metrics(new, &ort->dst);
2789
2790                 new->dev = ort->dst.dev;
2791                 if (new->dev)
2792                         dev_hold(new->dev);
2793
2794                 rt->rt_key_dst = ort->rt_key_dst;
2795                 rt->rt_key_src = ort->rt_key_src;
2796                 rt->rt_key_tos = ort->rt_key_tos;
2797                 rt->rt_route_iif = ort->rt_route_iif;
2798                 rt->rt_iif = ort->rt_iif;
2799                 rt->rt_oif = ort->rt_oif;
2800                 rt->rt_mark = ort->rt_mark;
2801
2802                 rt->rt_genid = rt_genid(net);
2803                 rt->rt_flags = ort->rt_flags;
2804                 rt->rt_type = ort->rt_type;
2805                 rt->rt_dst = ort->rt_dst;
2806                 rt->rt_src = ort->rt_src;
2807                 rt->rt_gateway = ort->rt_gateway;
2808                 rt->rt_spec_dst = ort->rt_spec_dst;
2809                 rt->peer = ort->peer;
2810                 if (rt->peer)
2811                         atomic_inc(&rt->peer->refcnt);
2812                 rt->fi = ort->fi;
2813                 if (rt->fi)
2814                         atomic_inc(&rt->fi->fib_clntref);
2815
2816                 dst_free(new);
2817         }
2818
2819         dst_release(dst_orig);
2820
2821         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2822 }
2823
2824 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2825                                     struct sock *sk)
2826 {
2827         struct rtable *rt = __ip_route_output_key(net, flp4);
2828
2829         if (IS_ERR(rt))
2830                 return rt;
2831
2832         if (flp4->flowi4_proto)
2833                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2834                                                    flowi4_to_flowi(flp4),
2835                                                    sk, 0);
2836
2837         return rt;
2838 }
2839 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2840
/*
 * Append a netlink route message describing the rtable attached to @skb
 * (obtained through skb_rtable()).
 *
 * @pid, @seq, @event and @flags are handed to nlmsg_put() unchanged;
 * @nowait is forwarded to ipmr_get_route() in the CONFIG_IP_MROUTE case.
 *
 * Returns the result of nlmsg_end() on success, 0 when ipmr_get_route()
 * returned 0 with @nowait clear, and -EMSGSIZE when nlmsg_put() fails or
 * the nla_put_failure path is taken (which cancels the partial message).
 *
 * NOTE(review): the NLA_PUT_* macros are presumably what jumps to
 * nla_put_failure when an attribute does not fit - confirm against
 * their definition.
 */
2841 static int rt_fill_info(struct net *net,
2842                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2843                         int nowait, unsigned int flags)
2844 {
2845         struct rtable *rt = skb_rtable(skb);
2846         struct rtmsg *r;
2847         struct nlmsghdr *nlh;
2848         long expires = 0;
2849         const struct inet_peer *peer = rt->peer;
2850         u32 id = 0, ts = 0, tsage = 0, error;
2851
2852         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2853         if (nlh == NULL)
2854                 return -EMSGSIZE;
2855
             /* Fixed rtmsg header; the table is always reported as MAIN. */
2856         r = nlmsg_data(nlh);
2857         r->rtm_family    = AF_INET;
2858         r->rtm_dst_len  = 32;
2859         r->rtm_src_len  = 0;
2860         r->rtm_tos      = rt->rt_key_tos;
2861         r->rtm_table    = RT_TABLE_MAIN;
2862         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2863         r->rtm_type     = rt->rt_type;
2864         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2865         r->rtm_protocol = RTPROT_UNSPEC;
             /* Low 16 bits of rt_flags are masked out; RTM_F_CLONED is always set. */
2866         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2867         if (rt->rt_flags & RTCF_NOTIFY)
2868                 r->rtm_flags |= RTM_F_NOTIFY;
2869
2870         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2871
             /* A source attribute is only emitted when the key had one. */
2872         if (rt->rt_key_src) {
2873                 r->rtm_src_len = 32;
2874                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2875         }
2876         if (rt->dst.dev)
2877                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2878 #ifdef CONFIG_IP_ROUTE_CLASSID
2879         if (rt->dst.tclassid)
2880                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2881 #endif
             /*
              * RTA_PREFSRC: rt_spec_dst for input routes; for output routes
              * rt_src, and only when it differs from the key source.
              */
2882         if (rt_is_input_route(rt))
2883                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2884         else if (rt->rt_src != rt->rt_key_src)
2885                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2886
2887         if (rt->rt_dst != rt->rt_gateway)
2888                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2889
2890         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2891                 goto nla_put_failure;
2892
2893         if (rt->rt_mark)
2894                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2895
             /* Values for the cacheinfo attribute emitted at the end. */
2896         error = rt->dst.error;
2897         if (peer) {
2898                 inet_peer_refcheck(rt->peer);
2899                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2900                 if (peer->tcp_ts_stamp) {
2901                         ts = peer->tcp_ts;
2902                         tsage = get_seconds() - peer->tcp_ts_stamp;
2903                 }
                     /* Non-zero expiry is turned into an offset from jiffies. */
2904                 expires = ACCESS_ONCE(peer->pmtu_expires);
2905                 if (expires)
2906                         expires -= jiffies;
2907         }
2908
2909         if (rt_is_input_route(rt)) {
2910 #ifdef CONFIG_IP_MROUTE
2911                 __be32 dst = rt->rt_dst;
2912
                     /*
                      * Non-link-local multicast destination with MC_FORWARDING
                      * enabled: ipmr_get_route() is called instead of emitting
                      * RTA_IIF.
                      */
2913                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2914                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2915                         int err = ipmr_get_route(net, skb,
2916                                                  rt->rt_src, rt->rt_dst,
2917                                                  r, nowait);
2918                         if (err <= 0) {
2919                                 if (!nowait) {
2920                                         if (err == 0)
2921                                                 return 0;
2922                                         goto nla_put_failure;
2923                                 } else {
2924                                         if (err == -EMSGSIZE)
2925                                                 goto nla_put_failure;
                                             /* Reported through cacheinfo below. */
2926                                         error = err;
2927                                 }
2928                         }
2929                 } else
2930 #endif
2931                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2932         }
2933
2934         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2935                                expires, error) < 0)
2936                 goto nla_put_failure;
2937
2938         return nlmsg_end(skb, nlh);
2939
2940 nla_put_failure:
2941         nlmsg_cancel(skb, nlh);
2942         return -EMSGSIZE;
2943 }
2944
/*
 * Netlink handler that resolves a single route on behalf of the sender
 * of @in_skb and unicasts the answer back to it.
 *
 * The request attributes (RTA_SRC, RTA_DST, RTA_IIF, RTA_MARK, RTA_OIF)
 * are parsed from @nlh.  With RTA_IIF present the lookup goes through
 * ip_route_input() on a freshly allocated dummy skb; otherwise an output
 * lookup is done with ip_route_output_key().  The result is formatted by
 * rt_fill_info() as an RTM_NEWROUTE message.
 *
 * Returns the result of rtnl_unicast() on success, otherwise the error
 * from the failing step; the reply skb is freed on every error path
 * after its allocation.
 */
2945 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2946 {
2947         struct net *net = sock_net(in_skb->sk);
2948         struct rtmsg *rtm;
2949         struct nlattr *tb[RTA_MAX+1];
2950         struct rtable *rt = NULL;
2951         __be32 dst = 0;
2952         __be32 src = 0;
2953         u32 iif;
2954         int err;
2955         int mark;
2956         struct sk_buff *skb;
2957
2958         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2959         if (err < 0)
2960                 goto errout;
2961
2962         rtm = nlmsg_data(nlh);
2963
2964         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2965         if (skb == NULL) {
2966                 err = -ENOBUFS;
2967                 goto errout;
2968         }
2969
2970         /* Reserve room for dummy headers, this skb can pass
2971            through good chunk of routing engine.
2972          */
2973         skb_reset_mac_header(skb);
2974         skb_reset_network_header(skb);
2975
2976         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2977         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2978         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2979
             /* Absent attributes default to 0. */
2980         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2981         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2982         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2983         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2984
2985         if (iif) {
                     /* Input lookup, as if the dummy skb had arrived on @iif. */
2986                 struct net_device *dev;
2987
2988                 dev = __dev_get_by_index(net, iif);
2989                 if (dev == NULL) {
2990                         err = -ENODEV;
2991                         goto errout_free;
2992                 }
2993
2994                 skb->protocol   = htons(ETH_P_IP);
2995                 skb->dev        = dev;
2996                 skb->mark       = mark;
2997                 local_bh_disable();
2998                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2999                 local_bh_enable();
3000
                     /* A route carrying dst.error is reported as a failed lookup. */
3001                 rt = skb_rtable(skb);
3002                 if (err == 0 && rt->dst.error)
3003                         err = -rt->dst.error;
3004         } else {
                     /* Output lookup keyed on the request attributes. */
3005                 struct flowi4 fl4 = {
3006                         .daddr = dst,
3007                         .saddr = src,
3008                         .flowi4_tos = rtm->rtm_tos,
3009                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3010                         .flowi4_mark = mark,
3011                 };
3012                 rt = ip_route_output_key(net, &fl4);
3013
3014                 err = 0;
3015                 if (IS_ERR(rt))
3016                         err = PTR_ERR(rt);
3017         }
3018
3019         if (err)
3020                 goto errout_free;
3021
3022         skb_dst_set(skb, &rt->dst);
3023         if (rtm->rtm_flags & RTM_F_NOTIFY)
3024                 rt->rt_flags |= RTCF_NOTIFY;
3025
3026         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3027                            RTM_NEWROUTE, 0, 0);
             /* A return of 0 is also treated as "nothing to send". */
3028         if (err <= 0)
3029                 goto errout_free;
3030
3031         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3032 errout:
3033         return err;
3034
3035 errout_free:
3036         kfree_skb(skb);
3037         goto errout;
3038 }
3039
/*
 * Netlink dump callback: write the route cache entries that belong to the
 * requesting socket's namespace into @skb, one RTM_NEWROUTE message each.
 *
 * The dump is resumable: cb->args[0] holds the hash bucket and cb->args[1]
 * the index within that bucket's chain at which to continue.  Both are
 * written back before returning.  Returns skb->len.
 */
3040 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3041 {
3042         struct rtable *rt;
3043         int h, s_h;
3044         int idx, s_idx;
3045         struct net *net;
3046
3047         net = sock_net(skb->sk);
3048
3049         s_h = cb->args[0];
3050         if (s_h < 0)
3051                 s_h = 0;
3052         s_idx = idx = cb->args[1];
             /* s_idx only applies to the first bucket visited; it is reset to 0 afterwards. */
3053         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3054                 if (!rt_hash_table[h].chain)
3055                         continue;
3056                 rcu_read_lock_bh();
3057                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3058                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
                             /* Skip other namespaces and entries before the resume point. */
3059                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3060                                 continue;
3061                         if (rt_is_expired(rt))
3062                                 continue;
                             /* rt_fill_info() reads the route from the skb's dst. */
3063                         skb_dst_set_noref(skb, &rt->dst);
3064                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3065                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3066                                          1, NLM_F_MULTI) <= 0) {
                                     /* Stop here; h and idx record where to resume. */
3067                                 skb_dst_drop(skb);
3068                                 rcu_read_unlock_bh();
3069                                 goto done;
3070                         }
3071                         skb_dst_drop(skb);
3072                 }
3073                 rcu_read_unlock_bh();
3074         }
3075
3076 done:
3077         cb->args[0] = h;
3078         cb->args[1] = idx;
3079         return skb->len;
3080 }
3081
3082 void ip_rt_multicast_event(struct in_device *in_dev)
3083 {
3084         rt_cache_flush(dev_net(in_dev->dev), 0);
3085 }
3086
3087 #ifdef CONFIG_SYSCTL
3088 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3089                                         void __user *buffer,
3090                                         size_t *lenp, loff_t *ppos)
3091 {
3092         if (write) {
3093                 int flush_delay;
3094                 ctl_table ctl;
3095                 struct net *net;
3096
3097                 memcpy(&ctl, __ctl, sizeof(ctl));
3098                 ctl.data = &flush_delay;
3099                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3100
3101                 net = (struct net *)__ctl->extra1;
3102                 rt_cache_flush(net, flush_delay);
3103                 return 0;
3104         }
3105
3106         return -EINVAL;
3107 }
3108
3109 static ctl_table ipv4_route_table[] = {
3110         {
3111                 .procname       = "gc_thresh",
3112                 .data           = &ipv4_dst_ops.gc_thresh,
3113                 .maxlen         = sizeof(int),
3114                 .mode           = 0644,
3115                 .proc_handler   = proc_dointvec,
3116         },
3117         {
3118                 .procname       = "max_size",
3119                 .data           = &ip_rt_max_size,
3120                 .maxlen         = sizeof(int),
3121                 .mode           = 0644,
3122                 .proc_handler   = proc_dointvec,
3123         },
3124         {
3125                 /*  Deprecated. Use gc_min_interval_ms */
3126
3127                 .procname       = "gc_min_interval",
3128                 .data           = &ip_rt_gc_min_interval,
3129                 .maxlen         = sizeof(int),
3130                 .mode           = 0644,
3131                 .proc_handler   = proc_dointvec_jiffies,
3132         },
3133         {
3134                 .procname       = "gc_min_interval_ms",
3135                 .data           = &ip_rt_gc_min_interval,
3136                 .maxlen         = sizeof(int),
3137                 .mode           = 0644,
3138                 .proc_handler   = proc_dointvec_ms_jiffies,
3139         },
3140         {
3141                 .procname       = "gc_timeout",
3142                 .data           = &ip_rt_gc_timeout,
3143                 .maxlen         = sizeof(int),
3144                 .mode           = 0644,
3145                 .proc_handler   = proc_dointvec_jiffies,
3146         },
3147         {
3148                 .procname       = "redirect_load",
3149                 .data           = &ip_rt_redirect_load,
3150                 .maxlen         = sizeof(int),
3151                 .mode           = 0644,
3152                 .proc_handler   = proc_dointvec,
3153         },
3154         {
3155                 .procname       = "redirect_number",
3156                 .data           = &ip_rt_redirect_number,
3157                 .maxlen         = sizeof(int),
3158                 .mode           = 0644,
3159                 .proc_handler   = proc_dointvec,
3160         },
3161         {
3162                 .procname       = "redirect_silence",
3163                 .data           = &ip_rt_redirect_silence,
3164                 .maxlen         = sizeof(int),
3165                 .mode           = 0644,
3166                 .proc_handler   = proc_dointvec,
3167         },
3168         {
3169                 .procname       = "error_cost",
3170                 .data           = &ip_rt_error_cost,
3171                 .maxlen         = sizeof(int),
3172                 .mode           = 0644,
3173                 .proc_handler   = proc_dointvec,
3174         },
3175         {
3176                 .procname       = "error_burst",
3177                 .data           = &ip_rt_error_burst,
3178                 .maxlen         = sizeof(int),
3179                 .mode           = 0644,
3180                 .proc_handler   = proc_dointvec,
3181         },
3182         {
3183                 .procname       = "gc_elasticity",
3184                 .data           = &ip_rt_gc_elasticity,
3185                 .maxlen         = sizeof(int),
3186                 .mode           = 0644,
3187                 .proc_handler   = proc_dointvec,
3188         },
3189         {
3190                 .procname       = "mtu_expires",
3191                 .data           = &ip_rt_mtu_expires,
3192                 .maxlen         = sizeof(int),
3193                 .mode           = 0644,
3194                 .proc_handler   = proc_dointvec_jiffies,
3195         },
3196         {
3197                 .procname       = "min_pmtu",
3198                 .data           = &ip_rt_min_pmtu,
3199                 .maxlen         = sizeof(int),
3200                 .mode           = 0644,
3201                 .proc_handler   = proc_dointvec,
3202         },
3203         {
3204                 .procname       = "min_adv_mss",
3205                 .data           = &ip_rt_min_advmss,
3206                 .maxlen         = sizeof(int),
3207                 .mode           = 0644,
3208                 .proc_handler   = proc_dointvec,
3209         },
3210         { }
3211 };
3212
3213 static struct ctl_table empty[1];
3214
3215 static struct ctl_table ipv4_skeleton[] =
3216 {
3217         { .procname = "route", 
3218           .mode = 0555, .child = ipv4_route_table},
3219         { .procname = "neigh", 
3220           .mode = 0555, .child = empty},
3221         { }
3222 };
3223
3224 static __net_initdata struct ctl_path ipv4_path[] = {
3225         { .procname = "net", },
3226         { .procname = "ipv4", },
3227         { },
3228 };
3229
3230 static struct ctl_table ipv4_route_flush_table[] = {
3231         {
3232                 .procname       = "flush",
3233                 .maxlen         = sizeof(int),
3234                 .mode           = 0200,
3235                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3236         },
3237         { },
3238 };
3239
3240 static __net_initdata struct ctl_path ipv4_route_path[] = {
3241         { .procname = "net", },
3242         { .procname = "ipv4", },
3243         { .procname = "route", },
3244         { },
3245 };
3246
3247 static __net_init int sysctl_route_net_init(struct net *net)
3248 {
3249         struct ctl_table *tbl;
3250
3251         tbl = ipv4_route_flush_table;
3252         if (!net_eq(net, &init_net)) {
3253                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3254                 if (tbl == NULL)
3255                         goto err_dup;
3256         }
3257         tbl[0].extra1 = net;
3258
3259         net->ipv4.route_hdr =
3260                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3261         if (net->ipv4.route_hdr == NULL)
3262                 goto err_reg;
3263         return 0;
3264
3265 err_reg:
3266         if (tbl != ipv4_route_flush_table)
3267                 kfree(tbl);
3268 err_dup:
3269         return -ENOMEM;
3270 }
3271
3272 static __net_exit void sysctl_route_net_exit(struct net *net)
3273 {
3274         struct ctl_table *tbl;
3275
3276         tbl = net->ipv4.route_hdr->ctl_table_arg;
3277         unregister_net_sysctl_table(net->ipv4.route_hdr);
3278         BUG_ON(tbl == ipv4_route_flush_table);
3279         kfree(tbl);
3280 }
3281
3282 static __net_initdata struct pernet_operations sysctl_route_ops = {
3283         .init = sysctl_route_net_init,
3284         .exit = sysctl_route_net_exit,
3285 };
3286 #endif
3287
3288 static __net_init int rt_genid_init(struct net *net)
3289 {
3290         get_random_bytes(&net->ipv4.rt_genid,
3291                          sizeof(net->ipv4.rt_genid));
3292         get_random_bytes(&net->ipv4.dev_addr_genid,
3293                          sizeof(net->ipv4.dev_addr_genid));
3294         return 0;
3295 }
3296
3297 static __net_initdata struct pernet_operations rt_genid_ops = {
3298         .init = rt_genid_init,
3299 };
3300
3301
3302 #ifdef CONFIG_IP_ROUTE_CLASSID
3303 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3304 #endif /* CONFIG_IP_ROUTE_CLASSID */
3305
3306 static __initdata unsigned long rhash_entries;
3307 static int __init set_rhash_entries(char *str)
3308 {
3309         if (!str)
3310                 return 0;
3311         rhash_entries = simple_strtoul(str, &str, 0);
3312         return 1;
3313 }
3314 __setup("rhash_entries=", set_rhash_entries);
3315
3316 int __init ip_rt_init(void)
3317 {
3318         int rc = 0;
3319
3320 #ifdef CONFIG_IP_ROUTE_CLASSID
3321         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3322         if (!ip_rt_acct)
3323                 panic("IP: failed to allocate ip_rt_acct\n");
3324 #endif
3325
3326         ipv4_dst_ops.kmem_cachep =
3327                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3328                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3329
3330         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3331
3332         if (dst_entries_init(&ipv4_dst_ops) < 0)
3333                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3334
3335         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3336                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3337
3338         rt_hash_table = (struct rt_hash_bucket *)
3339                 alloc_large_system_hash("IP route cache",
3340                                         sizeof(struct rt_hash_bucket),
3341                                         rhash_entries,
3342                                         (totalram_pages >= 128 * 1024) ?
3343                                         15 : 17,
3344                                         0,
3345                                         &rt_hash_log,
3346                                         &rt_hash_mask,
3347                                         rhash_entries ? 0 : 512 * 1024);
3348         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3349         rt_hash_lock_init();
3350
3351         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3352         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3353
3354         devinet_init();
3355         ip_fib_init();
3356
3357         if (ip_rt_proc_init())
3358                 printk(KERN_ERR "Unable to create route proc files\n");
3359 #ifdef CONFIG_XFRM
3360         xfrm_init();
3361         xfrm4_init(ip_rt_max_size);
3362 #endif
3363         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3364
3365 #ifdef CONFIG_SYSCTL
3366         register_pernet_subsys(&sysctl_route_ops);
3367 #endif
3368         register_pernet_subsys(&rt_genid_ops);
3369         return rc;
3370 }
3371
3372 #ifdef CONFIG_SYSCTL
3373 /*
3374  * We really need to sanitize the damn ipv4 init order, then all
3375  * this nonsense will go away.
3376  */
3377 void __init ip_static_sysctl_init(void)
3378 {
3379         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3380 }
3381 #endif