1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      though our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <net/atmclip.h>
112 #include <net/secure_seq.h>
113
114 #define RT_FL_TOS(oldflp4) \
115     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
116
117 #define IP_MAX_MTU      0xFFF0
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
123 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly  = 9;
125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly       = HZ;
128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly    = 8;
130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly       = 256;
133 static int rt_chain_length_max __read_mostly    = 20;
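/*
 * Note: rt_chain_length_max is the per-bucket chain length above which
 * rt_intern_hash() treats a bucket as pathological and triggers an
 * emergency hash rebuild (see rt_emergency_hash_rebuild() below); it is
 * not a hard cap on insertions.
 */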
134 static int redirect_genid;
135
136 /*
137  *      Interface to generic destination cache.
138  */
139
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
142 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
143 static void              ipv4_dst_destroy(struct dst_entry *dst);
144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145 static void              ipv4_link_failure(struct sk_buff *skb);
146 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
147 static int rt_garbage_collect(struct dst_ops *ops);
148
149 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
150                             int how)
151 {
152 }
153
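/*
 * Copy-on-write metrics, in brief: when a writable metrics array is needed
 * for this route, bind an inet_peer, copy the old (read-only) metrics into
 * peer->metrics if that array is still unused, and publish the new pointer
 * with cmpxchg().  If another CPU won the race we return whatever it
 * installed (or NULL if that is still read-only); on success the route's
 * fib_info reference is no longer needed for metrics and is dropped.
 */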
154 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
155 {
156         struct rtable *rt = (struct rtable *) dst;
157         struct inet_peer *peer;
158         u32 *p = NULL;
159
160         if (!rt->peer)
161                 rt_bind_peer(rt, rt->rt_dst, 1);
162
163         peer = rt->peer;
164         if (peer) {
165                 u32 *old_p = __DST_METRICS_PTR(old);
166                 unsigned long prev, new;
167
168                 p = peer->metrics;
169                 if (inet_metrics_new(peer))
170                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
171
172                 new = (unsigned long) p;
173                 prev = cmpxchg(&dst->_metrics, old, new);
174
175                 if (prev != old) {
176                         p = __DST_METRICS_PTR(prev);
177                         if (prev & DST_METRICS_READ_ONLY)
178                                 p = NULL;
179                 } else {
180                         if (rt->fi) {
181                                 fib_info_put(rt->fi);
182                                 rt->fi = NULL;
183                         }
184                 }
185         }
186         return p;
187 }
188
189 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
190
191 static struct dst_ops ipv4_dst_ops = {
192         .family =               AF_INET,
193         .protocol =             cpu_to_be16(ETH_P_IP),
194         .gc =                   rt_garbage_collect,
195         .check =                ipv4_dst_check,
196         .default_advmss =       ipv4_default_advmss,
197         .mtu =                  ipv4_mtu,
198         .cow_metrics =          ipv4_cow_metrics,
199         .destroy =              ipv4_dst_destroy,
200         .ifdown =               ipv4_dst_ifdown,
201         .negative_advice =      ipv4_negative_advice,
202         .link_failure =         ipv4_link_failure,
203         .update_pmtu =          ip_rt_update_pmtu,
204         .local_out =            __ip_local_out,
205         .neigh_lookup =         ipv4_neigh_lookup,
206 };
207
208 #define ECN_OR_COST(class)      TC_PRIO_##class
209
210 const __u8 ip_tos2prio[16] = {
211         TC_PRIO_BESTEFFORT,
212         ECN_OR_COST(BESTEFFORT),
213         TC_PRIO_BESTEFFORT,
214         ECN_OR_COST(BESTEFFORT),
215         TC_PRIO_BULK,
216         ECN_OR_COST(BULK),
217         TC_PRIO_BULK,
218         ECN_OR_COST(BULK),
219         TC_PRIO_INTERACTIVE,
220         ECN_OR_COST(INTERACTIVE),
221         TC_PRIO_INTERACTIVE,
222         ECN_OR_COST(INTERACTIVE),
223         TC_PRIO_INTERACTIVE_BULK,
224         ECN_OR_COST(INTERACTIVE_BULK),
225         TC_PRIO_INTERACTIVE_BULK,
226         ECN_OR_COST(INTERACTIVE_BULK)
227 };
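/*
 * The table above maps the 4-bit IPv4 TOS field to a packet-scheduler
 * priority band; it is indexed with IPTOS_TOS(tos) >> 1 by
 * rt_tos2priority() in include/net/route.h.  The ECN_OR_COST() entries
 * sit at the odd indices, i.e. TOS values with the historical
 * "minimize monetary cost" bit set.
 */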
228
229
230 /*
231  * Route cache.
232  */
233
234 /* The locking scheme is rather straightforward:
235  *
236  * 1) Read-Copy Update protects the buckets of the central route hash.
237  * 2) Only writers remove entries, and they hold the lock
238  *    as they look at rtable reference counts.
239  * 3) Only readers acquire references to rtable entries,
240  *    they do so with atomic increments and with the
241  *    lock held.
242  */
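/* Concretely, the pattern used throughout this file: readers walk a chain
 * under rcu_read_lock_bh() with rcu_dereference_bh(), while writers take
 * the per-bucket spinlock from rt_hash_lock_addr() before unlinking
 * entries and hand them to rt_free(), which defers the actual free to an
 * RCU-bh grace period.
 */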
243
244 struct rt_hash_bucket {
245         struct rtable __rcu     *chain;
246 };
247
248 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
249         defined(CONFIG_PROVE_LOCKING)
250 /*
251  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
252  * The size of this table is a power of two and depends on the number of CPUs.
253  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
254  */
255 #ifdef CONFIG_LOCKDEP
256 # define RT_HASH_LOCK_SZ        256
257 #else
258 # if NR_CPUS >= 32
259 #  define RT_HASH_LOCK_SZ       4096
260 # elif NR_CPUS >= 16
261 #  define RT_HASH_LOCK_SZ       2048
262 # elif NR_CPUS >= 8
263 #  define RT_HASH_LOCK_SZ       1024
264 # elif NR_CPUS >= 4
265 #  define RT_HASH_LOCK_SZ       512
266 # else
267 #  define RT_HASH_LOCK_SZ       256
268 # endif
269 #endif
270
271 static spinlock_t       *rt_hash_locks;
272 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
273
274 static __init void rt_hash_lock_init(void)
275 {
276         int i;
277
278         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
279                         GFP_KERNEL);
280         if (!rt_hash_locks)
281                 panic("IP: failed to allocate rt_hash_locks\n");
282
283         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
284                 spin_lock_init(&rt_hash_locks[i]);
285 }
286 #else
287 # define rt_hash_lock_addr(slot) NULL
288
289 static inline void rt_hash_lock_init(void)
290 {
291 }
292 #endif
293
294 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
295 static unsigned                 rt_hash_mask __read_mostly;
296 static unsigned int             rt_hash_log  __read_mostly;
297
298 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
299 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
300
301 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
302                                    int genid)
303 {
304         return jhash_3words((__force u32)daddr, (__force u32)saddr,
305                             idx, genid)
306                 & rt_hash_mask;
307 }
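/*
 * Because the per-namespace generation id is mixed into every hash (and
 * stored in each entry as rt_genid), a single bump of rt_genid in
 * rt_cache_invalidate() is enough to invalidate the whole cache: lookups
 * ignore entries whose generation no longer matches (rt_is_expired()),
 * and the stale entries are reaped lazily by flushing or GC.
 */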
308
309 static inline int rt_genid(struct net *net)
310 {
311         return atomic_read(&net->ipv4.rt_genid);
312 }
313
314 #ifdef CONFIG_PROC_FS
315 struct rt_cache_iter_state {
316         struct seq_net_private p;
317         int bucket;
318         int genid;
319 };
320
321 static struct rtable *rt_cache_get_first(struct seq_file *seq)
322 {
323         struct rt_cache_iter_state *st = seq->private;
324         struct rtable *r = NULL;
325
326         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
327                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
328                         continue;
329                 rcu_read_lock_bh();
330                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
331                 while (r) {
332                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
333                             r->rt_genid == st->genid)
334                                 return r;
335                         r = rcu_dereference_bh(r->dst.rt_next);
336                 }
337                 rcu_read_unlock_bh();
338         }
339         return r;
340 }
341
342 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
343                                           struct rtable *r)
344 {
345         struct rt_cache_iter_state *st = seq->private;
346
347         r = rcu_dereference_bh(r->dst.rt_next);
348         while (!r) {
349                 rcu_read_unlock_bh();
350                 do {
351                         if (--st->bucket < 0)
352                                 return NULL;
353                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
354                 rcu_read_lock_bh();
355                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
356         }
357         return r;
358 }
359
360 static struct rtable *rt_cache_get_next(struct seq_file *seq,
361                                         struct rtable *r)
362 {
363         struct rt_cache_iter_state *st = seq->private;
364         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
365                 if (dev_net(r->dst.dev) != seq_file_net(seq))
366                         continue;
367                 if (r->rt_genid == st->genid)
368                         break;
369         }
370         return r;
371 }
372
373 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
374 {
375         struct rtable *r = rt_cache_get_first(seq);
376
377         if (r)
378                 while (pos && (r = rt_cache_get_next(seq, r)))
379                         --pos;
380         return pos ? NULL : r;
381 }
382
383 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
384 {
385         struct rt_cache_iter_state *st = seq->private;
386         if (*pos)
387                 return rt_cache_get_idx(seq, *pos - 1);
388         st->genid = rt_genid(seq_file_net(seq));
389         return SEQ_START_TOKEN;
390 }
391
392 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
393 {
394         struct rtable *r;
395
396         if (v == SEQ_START_TOKEN)
397                 r = rt_cache_get_first(seq);
398         else
399                 r = rt_cache_get_next(seq, v);
400         ++*pos;
401         return r;
402 }
403
404 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
405 {
406         if (v && v != SEQ_START_TOKEN)
407                 rcu_read_unlock_bh();
408 }
409
410 static int rt_cache_seq_show(struct seq_file *seq, void *v)
411 {
412         if (v == SEQ_START_TOKEN)
413                 seq_printf(seq, "%-127s\n",
414                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
415                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
416                            "HHUptod\tSpecDst");
417         else {
418                 struct rtable *r = v;
419                 struct neighbour *n;
420                 int len, HHUptod;
421
422                 rcu_read_lock();
423                 n = dst_get_neighbour(&r->dst);
424                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
425                 rcu_read_unlock();
426
427                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
428                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
429                         r->dst.dev ? r->dst.dev->name : "*",
430                         (__force u32)r->rt_dst,
431                         (__force u32)r->rt_gateway,
432                         r->rt_flags, atomic_read(&r->dst.__refcnt),
433                         r->dst.__use, 0, (__force u32)r->rt_src,
434                         dst_metric_advmss(&r->dst) + 40,
435                         dst_metric(&r->dst, RTAX_WINDOW),
436                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
437                               dst_metric(&r->dst, RTAX_RTTVAR)),
438                         r->rt_key_tos,
439                         -1,
440                         HHUptod,
441                         r->rt_spec_dst, &len);
442
443                 seq_printf(seq, "%*s\n", 127 - len, "");
444         }
445         return 0;
446 }
447
448 static const struct seq_operations rt_cache_seq_ops = {
449         .start  = rt_cache_seq_start,
450         .next   = rt_cache_seq_next,
451         .stop   = rt_cache_seq_stop,
452         .show   = rt_cache_seq_show,
453 };
454
455 static int rt_cache_seq_open(struct inode *inode, struct file *file)
456 {
457         return seq_open_net(inode, file, &rt_cache_seq_ops,
458                         sizeof(struct rt_cache_iter_state));
459 }
460
461 static const struct file_operations rt_cache_seq_fops = {
462         .owner   = THIS_MODULE,
463         .open    = rt_cache_seq_open,
464         .read    = seq_read,
465         .llseek  = seq_lseek,
466         .release = seq_release_net,
467 };
468
469
470 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
471 {
472         int cpu;
473
474         if (*pos == 0)
475                 return SEQ_START_TOKEN;
476
477         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
478                 if (!cpu_possible(cpu))
479                         continue;
480                 *pos = cpu+1;
481                 return &per_cpu(rt_cache_stat, cpu);
482         }
483         return NULL;
484 }
485
486 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
487 {
488         int cpu;
489
490         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
491                 if (!cpu_possible(cpu))
492                         continue;
493                 *pos = cpu+1;
494                 return &per_cpu(rt_cache_stat, cpu);
495         }
496         return NULL;
497
498 }
499
500 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
501 {
502
503 }
504
505 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
506 {
507         struct rt_cache_stat *st = v;
508
509         if (v == SEQ_START_TOKEN) {
510                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
511                 return 0;
512         }
513
514         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
515                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
516                    dst_entries_get_slow(&ipv4_dst_ops),
517                    st->in_hit,
518                    st->in_slow_tot,
519                    st->in_slow_mc,
520                    st->in_no_route,
521                    st->in_brd,
522                    st->in_martian_dst,
523                    st->in_martian_src,
524
525                    st->out_hit,
526                    st->out_slow_tot,
527                    st->out_slow_mc,
528
529                    st->gc_total,
530                    st->gc_ignored,
531                    st->gc_goal_miss,
532                    st->gc_dst_overflow,
533                    st->in_hlist_search,
534                    st->out_hlist_search
535                 );
536         return 0;
537 }
538
539 static const struct seq_operations rt_cpu_seq_ops = {
540         .start  = rt_cpu_seq_start,
541         .next   = rt_cpu_seq_next,
542         .stop   = rt_cpu_seq_stop,
543         .show   = rt_cpu_seq_show,
544 };
545
546
547 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
548 {
549         return seq_open(file, &rt_cpu_seq_ops);
550 }
551
552 static const struct file_operations rt_cpu_seq_fops = {
553         .owner   = THIS_MODULE,
554         .open    = rt_cpu_seq_open,
555         .read    = seq_read,
556         .llseek  = seq_lseek,
557         .release = seq_release,
558 };
559
560 #ifdef CONFIG_IP_ROUTE_CLASSID
561 static int rt_acct_proc_show(struct seq_file *m, void *v)
562 {
563         struct ip_rt_acct *dst, *src;
564         unsigned int i, j;
565
566         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
567         if (!dst)
568                 return -ENOMEM;
569
570         for_each_possible_cpu(i) {
571                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
572                 for (j = 0; j < 256; j++) {
573                         dst[j].o_bytes   += src[j].o_bytes;
574                         dst[j].o_packets += src[j].o_packets;
575                         dst[j].i_bytes   += src[j].i_bytes;
576                         dst[j].i_packets += src[j].i_packets;
577                 }
578         }
579
580         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
581         kfree(dst);
582         return 0;
583 }
584
585 static int rt_acct_proc_open(struct inode *inode, struct file *file)
586 {
587         return single_open(file, rt_acct_proc_show, NULL);
588 }
589
590 static const struct file_operations rt_acct_proc_fops = {
591         .owner          = THIS_MODULE,
592         .open           = rt_acct_proc_open,
593         .read           = seq_read,
594         .llseek         = seq_lseek,
595         .release        = single_release,
596 };
597 #endif
598
599 static int __net_init ip_rt_do_proc_init(struct net *net)
600 {
601         struct proc_dir_entry *pde;
602
603         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
604                         &rt_cache_seq_fops);
605         if (!pde)
606                 goto err1;
607
608         pde = proc_create("rt_cache", S_IRUGO,
609                           net->proc_net_stat, &rt_cpu_seq_fops);
610         if (!pde)
611                 goto err2;
612
613 #ifdef CONFIG_IP_ROUTE_CLASSID
614         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
615         if (!pde)
616                 goto err3;
617 #endif
618         return 0;
619
620 #ifdef CONFIG_IP_ROUTE_CLASSID
621 err3:
622         remove_proc_entry("rt_cache", net->proc_net_stat);
623 #endif
624 err2:
625         remove_proc_entry("rt_cache", net->proc_net);
626 err1:
627         return -ENOMEM;
628 }
629
630 static void __net_exit ip_rt_do_proc_exit(struct net *net)
631 {
632         remove_proc_entry("rt_cache", net->proc_net_stat);
633         remove_proc_entry("rt_cache", net->proc_net);
634 #ifdef CONFIG_IP_ROUTE_CLASSID
635         remove_proc_entry("rt_acct", net->proc_net);
636 #endif
637 }
638
639 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
640         .init = ip_rt_do_proc_init,
641         .exit = ip_rt_do_proc_exit,
642 };
643
644 static int __init ip_rt_proc_init(void)
645 {
646         return register_pernet_subsys(&ip_rt_proc_ops);
647 }
648
649 #else
650 static inline int ip_rt_proc_init(void)
651 {
652         return 0;
653 }
654 #endif /* CONFIG_PROC_FS */
655
656 static inline void rt_free(struct rtable *rt)
657 {
658         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
659 }
660
661 static inline void rt_drop(struct rtable *rt)
662 {
663         ip_rt_put(rt);
664         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
665 }
666
667 static inline int rt_fast_clean(struct rtable *rth)
668 {
669         /* Kill broadcast/multicast entries very aggressively, if they
670            collide in the hash table with more useful entries */
671         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
672                 rt_is_input_route(rth) && rth->dst.rt_next;
673 }
674
675 static inline int rt_valuable(struct rtable *rth)
676 {
677         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
678                 (rth->peer && rth->peer->pmtu_expires);
679 }
680
681 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
682 {
683         unsigned long age;
684         int ret = 0;
685
686         if (atomic_read(&rth->dst.__refcnt))
687                 goto out;
688
689         age = jiffies - rth->dst.lastuse;
690         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
691             (age <= tmo2 && rt_valuable(rth)))
692                 goto out;
693         ret = 1;
694 out:    return ret;
695 }
696
697 /* Bits of score are:
698  * 31: very valuable
699  * 30: not quite useless
700  * 29..0: usage counter
701  */
702 static inline u32 rt_score(struct rtable *rt)
703 {
704         u32 score = jiffies - rt->dst.lastuse;
705
706         score = ~score & ~(3<<30);
707
708         if (rt_valuable(rt))
709                 score |= (1<<31);
710
711         if (rt_is_output_route(rt) ||
712             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
713                 score |= (1<<30);
714
715         return score;
716 }
717
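/*
 * Caching is considered enabled until the emergency rebuild counter
 * (bumped in rt_intern_hash() when chains grow too long) exceeds the
 * sysctl_rt_cache_rebuild_count limit; after that new routes are handed
 * out with DST_NOCACHE set and are never inserted into the hash.
 */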
718 static inline bool rt_caching(const struct net *net)
719 {
720         return net->ipv4.current_rt_cache_rebuild_count <=
721                 net->ipv4.sysctl_rt_cache_rebuild_count;
722 }
723
724 static inline bool compare_hash_inputs(const struct rtable *rt1,
725                                        const struct rtable *rt2)
726 {
727         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
728                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
729                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
730 }
731
732 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
733 {
734         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
735                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
736                 (rt1->rt_mark ^ rt2->rt_mark) |
737                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
738                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
739                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
740 }
741
742 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
743 {
744         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
745 }
746
747 static inline int rt_is_expired(struct rtable *rth)
748 {
749         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
750 }
751
752 /*
753  * Perform a full scan of the hash table and free all entries.
754  * Can be called by a softirq or a process.
755  * In the latter case, we want to reschedule if necessary.
756  */
757 static void rt_do_flush(struct net *net, int process_context)
758 {
759         unsigned int i;
760         struct rtable *rth, *next;
761
762         for (i = 0; i <= rt_hash_mask; i++) {
763                 struct rtable __rcu **pprev;
764                 struct rtable *list;
765
766                 if (process_context && need_resched())
767                         cond_resched();
768                 rth = rcu_access_pointer(rt_hash_table[i].chain);
769                 if (!rth)
770                         continue;
771
772                 spin_lock_bh(rt_hash_lock_addr(i));
773
774                 list = NULL;
775                 pprev = &rt_hash_table[i].chain;
776                 rth = rcu_dereference_protected(*pprev,
777                         lockdep_is_held(rt_hash_lock_addr(i)));
778
779                 while (rth) {
780                         next = rcu_dereference_protected(rth->dst.rt_next,
781                                 lockdep_is_held(rt_hash_lock_addr(i)));
782
783                         if (!net ||
784                             net_eq(dev_net(rth->dst.dev), net)) {
785                                 rcu_assign_pointer(*pprev, next);
786                                 rcu_assign_pointer(rth->dst.rt_next, list);
787                                 list = rth;
788                         } else {
789                                 pprev = &rth->dst.rt_next;
790                         }
791                         rth = next;
792                 }
793
794                 spin_unlock_bh(rt_hash_lock_addr(i));
795
796                 for (; list; list = next) {
797                         next = rcu_dereference_protected(list->dst.rt_next, 1);
798                         rt_free(list);
799                 }
800         }
801 }
802
803 /*
804  * While freeing expired entries, we compute average chain length
805  * and standard deviation, using fixed-point arithmetic.
806  * This is to get an estimate of rt_chain_length_max:
807  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
808  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
809  */
810
811 #define FRACT_BITS 3
812 #define ONE (1UL << FRACT_BITS)
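/*
 * A small worked example of the fixed-point format: with FRACT_BITS = 3,
 * ONE == 8, so an average chain length of 2.5 would be represented as 20.
 * slow_chain_length() below accumulates ONE per entry with distinct
 * hash_inputs and shifts the sum back down by FRACT_BITS; its caller in
 * rt_intern_hash() compares the result against rt_chain_length_max.
 */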
813
814 /*
815  * Given a hash chain and an item in this hash chain,
816  * find out whether a previous entry has the same hash_inputs
817  * (but differs on tos, mark or oif).
818  * Returns 0 if an alias is found.
819  * Returns ONE if rth has no alias before itself.
820  */
821 static int has_noalias(const struct rtable *head, const struct rtable *rth)
822 {
823         const struct rtable *aux = head;
824
825         while (aux != rth) {
826                 if (compare_hash_inputs(aux, rth))
827                         return 0;
828                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
829         }
830         return ONE;
831 }
832
833 /*
834  * Perturbation of rt_genid by a small quantity [1..256]
835  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
836  * many times (2^24) without handing out a recently used rt_genid.
837  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
838  */
839 static void rt_cache_invalidate(struct net *net)
840 {
841         unsigned char shuffle;
842
843         get_random_bytes(&shuffle, sizeof(shuffle));
844         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
845         redirect_genid++;
846 }
847
848 /*
849  * delay < 0  : invalidate cache (fast : entries will be deleted later)
850  * delay >= 0 : invalidate & flush cache (can be long)
851  */
852 void rt_cache_flush(struct net *net, int delay)
853 {
854         rt_cache_invalidate(net);
855         if (delay >= 0)
856                 rt_do_flush(net, !in_softirq());
857 }
858
859 /* Flush previously invalidated entries from the cache */
860 void rt_cache_flush_batch(struct net *net)
861 {
862         rt_do_flush(net, !in_softirq());
863 }
864
865 static void rt_emergency_hash_rebuild(struct net *net)
866 {
867         if (net_ratelimit())
868                 printk(KERN_WARNING "Route hash chain too long!\n");
869         rt_cache_invalidate(net);
870 }
871
872 /*
873    Short description of GC goals.
874
875    We want to build an algorithm which keeps the routing cache
876    at some equilibrium point, where the number of aged-off entries
877    stays approximately equal to the number of newly generated ones.
878
879    The current expiration strength is the variable "expire".
880    We try to adjust it dynamically, so that when the network is
881    idle, expire is large enough to keep plenty of warm entries,
882    and when load increases it shrinks to limit the cache size.
883  */
884
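/*
 * In the code below the target works out as goal = entries -
 * (ip_rt_gc_elasticity << rt_hash_log): roughly, keep no more than
 * ip_rt_gc_elasticity (default 8) entries per hash bucket on average
 * before starting to expire entries more aggressively.
 */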
885 static int rt_garbage_collect(struct dst_ops *ops)
886 {
887         static unsigned long expire = RT_GC_TIMEOUT;
888         static unsigned long last_gc;
889         static int rover;
890         static int equilibrium;
891         struct rtable *rth;
892         struct rtable __rcu **rthp;
893         unsigned long now = jiffies;
894         int goal;
895         int entries = dst_entries_get_fast(&ipv4_dst_ops);
896
897         /*
898          * Garbage collection is pretty expensive,
899          * do not run it too frequently.
900          */
901
902         RT_CACHE_STAT_INC(gc_total);
903
904         if (now - last_gc < ip_rt_gc_min_interval &&
905             entries < ip_rt_max_size) {
906                 RT_CACHE_STAT_INC(gc_ignored);
907                 goto out;
908         }
909
910         entries = dst_entries_get_slow(&ipv4_dst_ops);
911         /* Calculate the number of entries we want to expire now. */
912         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
913         if (goal <= 0) {
914                 if (equilibrium < ipv4_dst_ops.gc_thresh)
915                         equilibrium = ipv4_dst_ops.gc_thresh;
916                 goal = entries - equilibrium;
917                 if (goal > 0) {
918                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
919                         goal = entries - equilibrium;
920                 }
921         } else {
922                 /* We are in a dangerous area. Try to reduce the cache
923                  * really aggressively.
924                  */
925                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
926                 equilibrium = entries - goal;
927         }
928
929         if (now - last_gc >= ip_rt_gc_min_interval)
930                 last_gc = now;
931
932         if (goal <= 0) {
933                 equilibrium += goal;
934                 goto work_done;
935         }
936
937         do {
938                 int i, k;
939
940                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
941                         unsigned long tmo = expire;
942
943                         k = (k + 1) & rt_hash_mask;
944                         rthp = &rt_hash_table[k].chain;
945                         spin_lock_bh(rt_hash_lock_addr(k));
946                         while ((rth = rcu_dereference_protected(*rthp,
947                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
948                                 if (!rt_is_expired(rth) &&
949                                         !rt_may_expire(rth, tmo, expire)) {
950                                         tmo >>= 1;
951                                         rthp = &rth->dst.rt_next;
952                                         continue;
953                                 }
954                                 *rthp = rth->dst.rt_next;
955                                 rt_free(rth);
956                                 goal--;
957                         }
958                         spin_unlock_bh(rt_hash_lock_addr(k));
959                         if (goal <= 0)
960                                 break;
961                 }
962                 rover = k;
963
964                 if (goal <= 0)
965                         goto work_done;
966
967                 /* The goal is not achieved. We stop the process if:
968
969                    - expire has been reduced to zero (otherwise it is halved),
970                    - the table is not full,
971                    - we are called from interrupt context,
972                    - the jiffies check is just a fallback/debug loop breaker;
973                      we will not spin here for a long time in any case.
974                  */
975
976                 RT_CACHE_STAT_INC(gc_goal_miss);
977
978                 if (expire == 0)
979                         break;
980
981                 expire >>= 1;
982
983                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
984                         goto out;
985         } while (!in_softirq() && time_before_eq(jiffies, now));
986
987         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
988                 goto out;
989         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
990                 goto out;
991         if (net_ratelimit())
992                 printk(KERN_WARNING "dst cache overflow\n");
993         RT_CACHE_STAT_INC(gc_dst_overflow);
994         return 1;
995
996 work_done:
997         expire += ip_rt_gc_min_interval;
998         if (expire > ip_rt_gc_timeout ||
999             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1000             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1001                 expire = ip_rt_gc_timeout;
1002 out:    return 0;
1003 }
1004
1005 /*
1006  * Returns the number of entries in a hash chain that have distinct hash_inputs
1007  */
1008 static int slow_chain_length(const struct rtable *head)
1009 {
1010         int length = 0;
1011         const struct rtable *rth = head;
1012
1013         while (rth) {
1014                 length += has_noalias(head, rth);
1015                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1016         }
1017         return length >> FRACT_BITS;
1018 }
1019
1020 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1021 {
1022         struct neigh_table *tbl = &arp_tbl;
1023         static const __be32 inaddr_any = 0;
1024         struct net_device *dev = dst->dev;
1025         const __be32 *pkey = daddr;
1026         struct neighbour *n;
1027
1028 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1029         if (dev->type == ARPHRD_ATM)
1030                 tbl = clip_tbl_hook;
1031 #endif
1032         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1033                 pkey = &inaddr_any;
1034
1035         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1036         if (n)
1037                 return n;
1038         return neigh_create(tbl, pkey, dev);
1039 }
1040
1041 static int rt_bind_neighbour(struct rtable *rt)
1042 {
1043         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1044         if (IS_ERR(n))
1045                 return PTR_ERR(n);
1046         dst_set_neighbour(&rt->dst, n);
1047
1048         return 0;
1049 }
1050
1051 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1052                                      struct sk_buff *skb, int ifindex)
1053 {
1054         struct rtable   *rth, *cand;
1055         struct rtable __rcu **rthp, **candp;
1056         unsigned long   now;
1057         u32             min_score;
1058         int             chain_length;
1059         int attempts = !in_softirq();
1060
1061 restart:
1062         chain_length = 0;
1063         min_score = ~(u32)0;
1064         cand = NULL;
1065         candp = NULL;
1066         now = jiffies;
1067
1068         if (!rt_caching(dev_net(rt->dst.dev))) {
1069                 /*
1070                  * If we're not caching, just tell the caller we
1071                  * were successful and don't touch the route.  The
1072                  * caller holds the sole reference to the cache entry, and
1073                  * it will be released when the caller is done with it.
1074                  * If we drop it here, the callers have no way to resolve routes
1075                  * when we're not caching.  Instead, just return rt, so
1076                  * the caller gets a single use out of the route.
1077                  * Note that we do rt_free on this new route entry, so that
1078                  * once its refcount hits zero, we are still able to reap it
1079                  * (Thanks Alexey)
1080                  * Note: To avoid expensive rcu stuff for this uncached dst,
1081                  * we set DST_NOCACHE so that dst_release() can free dst without
1082                  * waiting for a grace period.
1083                  */
1084
1085                 rt->dst.flags |= DST_NOCACHE;
1086                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1087                         int err = rt_bind_neighbour(rt);
1088                         if (err) {
1089                                 if (net_ratelimit())
1090                                         printk(KERN_WARNING
1091                                             "Neighbour table failure & not caching routes.\n");
1092                                 ip_rt_put(rt);
1093                                 return ERR_PTR(err);
1094                         }
1095                 }
1096
1097                 goto skip_hashing;
1098         }
1099
1100         rthp = &rt_hash_table[hash].chain;
1101
1102         spin_lock_bh(rt_hash_lock_addr(hash));
1103         while ((rth = rcu_dereference_protected(*rthp,
1104                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1105                 if (rt_is_expired(rth)) {
1106                         *rthp = rth->dst.rt_next;
1107                         rt_free(rth);
1108                         continue;
1109                 }
1110                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1111                         /* Put it first */
1112                         *rthp = rth->dst.rt_next;
1113                         /*
1114                          * Since lookup is lockfree, the deletion
1115                          * must be visible to another weakly ordered CPU before
1116                          * the insertion at the start of the hash chain.
1117                          */
1118                         rcu_assign_pointer(rth->dst.rt_next,
1119                                            rt_hash_table[hash].chain);
1120                         /*
1121                          * Since lookup is lockfree, the update writes
1122                          * must be ordered for consistency on SMP.
1123                          */
1124                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1125
1126                         dst_use(&rth->dst, now);
1127                         spin_unlock_bh(rt_hash_lock_addr(hash));
1128
1129                         rt_drop(rt);
1130                         if (skb)
1131                                 skb_dst_set(skb, &rth->dst);
1132                         return rth;
1133                 }
1134
1135                 if (!atomic_read(&rth->dst.__refcnt)) {
1136                         u32 score = rt_score(rth);
1137
1138                         if (score <= min_score) {
1139                                 cand = rth;
1140                                 candp = rthp;
1141                                 min_score = score;
1142                         }
1143                 }
1144
1145                 chain_length++;
1146
1147                 rthp = &rth->dst.rt_next;
1148         }
1149
1150         if (cand) {
1151                 /* ip_rt_gc_elasticity used to be the average chain length;
1152                  * when exceeded, gc becomes really aggressive.
1153                  *
1154                  * The second limit is less certain. At the moment it allows
1155                  * only 2 entries per bucket. We will see.
1156                  */
1157                 if (chain_length > ip_rt_gc_elasticity) {
1158                         *candp = cand->dst.rt_next;
1159                         rt_free(cand);
1160                 }
1161         } else {
1162                 if (chain_length > rt_chain_length_max &&
1163                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1164                         struct net *net = dev_net(rt->dst.dev);
1165                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1166                         if (!rt_caching(net)) {
1167                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1168                                         rt->dst.dev->name, num);
1169                         }
1170                         rt_emergency_hash_rebuild(net);
1171                         spin_unlock_bh(rt_hash_lock_addr(hash));
1172
1173                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1174                                         ifindex, rt_genid(net));
1175                         goto restart;
1176                 }
1177         }
1178
1179         /* Try to bind the route to ARP only if it is an output
1180            route or a unicast forwarding path.
1181          */
1182         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1183                 int err = rt_bind_neighbour(rt);
1184                 if (err) {
1185                         spin_unlock_bh(rt_hash_lock_addr(hash));
1186
1187                         if (err != -ENOBUFS) {
1188                                 rt_drop(rt);
1189                                 return ERR_PTR(err);
1190                         }
1191
1192                         /* Neighbour tables are full and nothing
1193                            can be released. Try to shrink the route cache;
1194                            it most likely holds some neighbour records.
1195                          */
1196                         if (attempts-- > 0) {
1197                                 int saved_elasticity = ip_rt_gc_elasticity;
1198                                 int saved_int = ip_rt_gc_min_interval;
1199                                 ip_rt_gc_elasticity     = 1;
1200                                 ip_rt_gc_min_interval   = 0;
1201                                 rt_garbage_collect(&ipv4_dst_ops);
1202                                 ip_rt_gc_min_interval   = saved_int;
1203                                 ip_rt_gc_elasticity     = saved_elasticity;
1204                                 goto restart;
1205                         }
1206
1207                         if (net_ratelimit())
1208                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1209                         rt_drop(rt);
1210                         return ERR_PTR(-ENOBUFS);
1211                 }
1212         }
1213
1214         rt->dst.rt_next = rt_hash_table[hash].chain;
1215
1216         /*
1217          * Since lookup is lockfree, we must make sure
1218          * previous writes to rt are committed to memory
1219          * before making rt visible to other CPUS.
1220          */
1221         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1222
1223         spin_unlock_bh(rt_hash_lock_addr(hash));
1224
1225 skip_hashing:
1226         if (skb)
1227                 skb_dst_set(skb, &rt->dst);
1228         return rt;
1229 }
1230
1231 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1232
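/*
 * __rt_peer_genid is bumped whenever peer-held data that cached routes
 * depend on changes (for example when a new redirect gateway is learned
 * in ip_rt_redirect() below).  Routes snapshot the value into
 * rt_peer_genid when a peer is bound, so a later mismatch against
 * rt_peer_genid() signals that their peer-derived state (gateway, PMTU)
 * may need revalidating.
 */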
1233 static u32 rt_peer_genid(void)
1234 {
1235         return atomic_read(&__rt_peer_genid);
1236 }
1237
1238 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1239 {
1240         struct inet_peer *peer;
1241
1242         peer = inet_getpeer_v4(daddr, create);
1243
1244         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1245                 inet_putpeer(peer);
1246         else
1247                 rt->rt_peer_genid = rt_peer_genid();
1248 }
1249
1250 /*
1251  * Peer allocation may fail only in serious out-of-memory conditions.  However
1252  * we can still generate some output.
1253  * Random ID selection looks a bit dangerous because we have no chance of
1254  * selecting an ID that stays unique for a reasonable period of time.
1255  * But a broken packet identifier may be better than no packet at all.
1256  */
1257 static void ip_select_fb_ident(struct iphdr *iph)
1258 {
1259         static DEFINE_SPINLOCK(ip_fb_id_lock);
1260         static u32 ip_fallback_id;
1261         u32 salt;
1262
1263         spin_lock_bh(&ip_fb_id_lock);
1264         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1265         iph->id = htons(salt & 0xFFFF);
1266         ip_fallback_id = salt;
1267         spin_unlock_bh(&ip_fb_id_lock);
1268 }
1269
1270 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1271 {
1272         struct rtable *rt = (struct rtable *) dst;
1273
1274         if (rt) {
1275                 if (rt->peer == NULL)
1276                         rt_bind_peer(rt, rt->rt_dst, 1);
1277
1278                 /* If a peer is attached to the destination, it is never detached,
1279                    so we need not grab a lock to dereference it.
1280                  */
1281                 if (rt->peer) {
1282                         iph->id = htons(inet_getid(rt->peer, more));
1283                         return;
1284                 }
1285         } else
1286                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1287                        __builtin_return_address(0));
1288
1289         ip_select_fb_ident(iph);
1290 }
1291 EXPORT_SYMBOL(__ip_select_ident);
1292
1293 static void rt_del(unsigned hash, struct rtable *rt)
1294 {
1295         struct rtable __rcu **rthp;
1296         struct rtable *aux;
1297
1298         rthp = &rt_hash_table[hash].chain;
1299         spin_lock_bh(rt_hash_lock_addr(hash));
1300         ip_rt_put(rt);
1301         while ((aux = rcu_dereference_protected(*rthp,
1302                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1303                 if (aux == rt || rt_is_expired(aux)) {
1304                         *rthp = aux->dst.rt_next;
1305                         rt_free(aux);
1306                         continue;
1307                 }
1308                 rthp = &aux->dst.rt_next;
1309         }
1310         spin_unlock_bh(rt_hash_lock_addr(hash));
1311 }
1312
1313 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1314 {
1315         struct rtable *rt = (struct rtable *) dst;
1316         __be32 orig_gw = rt->rt_gateway;
1317         struct neighbour *n, *old_n;
1318
1319         dst_confirm(&rt->dst);
1320
1321         rt->rt_gateway = peer->redirect_learned.a4;
1322
1323         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1324         if (IS_ERR(n))
1325                 return PTR_ERR(n);
1326         old_n = xchg(&rt->dst._neighbour, n);
1327         if (old_n)
1328                 neigh_release(old_n);
1329         if (!n || !(n->nud_state & NUD_VALID)) {
1330                 if (n)
1331                         neigh_event_send(n, NULL);
1332                 rt->rt_gateway = orig_gw;
1333                 return -EAGAIN;
1334         } else {
1335                 rt->rt_flags |= RTCF_REDIRECTED;
1336                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1337         }
1338         return 0;
1339 }
1340
1341 /* called in rcu_read_lock() section */
1342 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1343                     __be32 saddr, struct net_device *dev)
1344 {
1345         int s, i;
1346         struct in_device *in_dev = __in_dev_get_rcu(dev);
1347         __be32 skeys[2] = { saddr, 0 };
1348         int    ikeys[2] = { dev->ifindex, 0 };
1349         struct inet_peer *peer;
1350         struct net *net;
1351
1352         if (!in_dev)
1353                 return;
1354
1355         net = dev_net(dev);
1356         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1357             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1358             ipv4_is_zeronet(new_gw))
1359                 goto reject_redirect;
1360
1361         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1362                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1363                         goto reject_redirect;
1364                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1365                         goto reject_redirect;
1366         } else {
1367                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1368                         goto reject_redirect;
1369         }
1370
1371         for (s = 0; s < 2; s++) {
1372                 for (i = 0; i < 2; i++) {
1373                         unsigned int hash;
1374                         struct rtable __rcu **rthp;
1375                         struct rtable *rt;
1376
1377                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1378
1379                         rthp = &rt_hash_table[hash].chain;
1380
1381                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1382                                 rthp = &rt->dst.rt_next;
1383
1384                                 if (rt->rt_key_dst != daddr ||
1385                                     rt->rt_key_src != skeys[s] ||
1386                                     rt->rt_oif != ikeys[i] ||
1387                                     rt_is_input_route(rt) ||
1388                                     rt_is_expired(rt) ||
1389                                     !net_eq(dev_net(rt->dst.dev), net) ||
1390                                     rt->dst.error ||
1391                                     rt->dst.dev != dev ||
1392                                     rt->rt_gateway != old_gw)
1393                                         continue;
1394
1395                                 if (!rt->peer)
1396                                         rt_bind_peer(rt, rt->rt_dst, 1);
1397
1398                                 peer = rt->peer;
1399                                 if (peer) {
1400                                         if (peer->redirect_learned.a4 != new_gw ||
1401                                             peer->redirect_genid != redirect_genid) {
1402                                                 peer->redirect_learned.a4 = new_gw;
1403                                                 peer->redirect_genid = redirect_genid;
1404                                                 atomic_inc(&__rt_peer_genid);
1405                                         }
1406                                         check_peer_redir(&rt->dst, peer);
1407                                 }
1408                         }
1409                 }
1410         }
1411         return;
1412
1413 reject_redirect:
1414 #ifdef CONFIG_IP_ROUTE_VERBOSE
1415         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1416                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1417                         "  Advised path = %pI4 -> %pI4\n",
1418                        &old_gw, dev->name, &new_gw,
1419                        &saddr, &daddr);
1420 #endif
1421         ;
1422 }
1423
1424 static bool peer_pmtu_expired(struct inet_peer *peer)
1425 {
1426         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1427
1428         return orig &&
1429                time_after_eq(jiffies, orig) &&
1430                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1431 }
1432
1433 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1434 {
1435         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1436
1437         return orig &&
1438                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1439 }
1440
1441 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1442 {
1443         struct rtable *rt = (struct rtable *)dst;
1444         struct dst_entry *ret = dst;
1445
1446         if (rt) {
1447                 if (dst->obsolete > 0) {
1448                         ip_rt_put(rt);
1449                         ret = NULL;
1450                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1451                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1452                                                 rt->rt_oif,
1453                                                 rt_genid(dev_net(dst->dev)));
1454                         rt_del(hash, rt);
1455                         ret = NULL;
1456                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1457                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1458                 }
1459         }
1460         return ret;
1461 }
1462
1463 /*
1464  * Algorithm:
1465  *      1. The first ip_rt_redirect_number redirects are sent
1466  *         with exponential backoff, then we stop sending them altogether,
1467  *         assuming that the host ignores our redirects.
1468  *      2. If we did not see packets requiring redirects
1469  *         during ip_rt_redirect_silence, we assume that the host
1470  *         forgot the redirected route and start sending redirects again.
1471  *
1472  * This algorithm is much cheaper and more intelligent than dumb load limiting
1473  * in icmp.c.
1474  *
1475  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1476  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1477  */
1478
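/* A rough sketch of the backoff schedule implemented below: with
 * rate_tokens redirects already sent, the next one goes out only once
 *
 *      jiffies > rate_last + (ip_rt_redirect_load << rate_tokens)
 *
 * so the gaps double each time until ip_rt_redirect_number redirects have
 * been ignored, after which we stay silent until the peer has sent no
 * packets needing a redirect for ip_rt_redirect_silence.
 */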
1479 void ip_rt_send_redirect(struct sk_buff *skb)
1480 {
1481         struct rtable *rt = skb_rtable(skb);
1482         struct in_device *in_dev;
1483         struct inet_peer *peer;
1484         int log_martians;
1485
1486         rcu_read_lock();
1487         in_dev = __in_dev_get_rcu(rt->dst.dev);
1488         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1489                 rcu_read_unlock();
1490                 return;
1491         }
1492         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1493         rcu_read_unlock();
1494
1495         if (!rt->peer)
1496                 rt_bind_peer(rt, rt->rt_dst, 1);
1497         peer = rt->peer;
1498         if (!peer) {
1499                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1500                 return;
1501         }
1502
1503         /* No redirected packets during ip_rt_redirect_silence;
1504          * reset the algorithm.
1505          */
1506         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1507                 peer->rate_tokens = 0;
1508
1509         /* Too many ignored redirects; do not send anything.
1510          * Set rate_last to the time of the last seen redirected packet.
1511          */
1512         if (peer->rate_tokens >= ip_rt_redirect_number) {
1513                 peer->rate_last = jiffies;
1514                 return;
1515         }
1516
1517         /* Check for load limit; set rate_last to the latest sent
1518          * redirect.
1519          */
1520         if (peer->rate_tokens == 0 ||
1521             time_after(jiffies,
1522                        (peer->rate_last +
1523                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1524                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1525                 peer->rate_last = jiffies;
1526                 ++peer->rate_tokens;
1527 #ifdef CONFIG_IP_ROUTE_VERBOSE
1528                 if (log_martians &&
1529                     peer->rate_tokens == ip_rt_redirect_number &&
1530                     net_ratelimit())
1531                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1532                                &ip_hdr(skb)->saddr, rt->rt_iif,
1533                                 &rt->rt_dst, &rt->rt_gateway);
1534 #endif
1535         }
1536 }
1537
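/* Rate-limit the ICMP errors generated for undeliverable packets using a
 * small token bucket kept in the inet_peer: tokens accrue one per jiffy
 * up to ip_rt_error_burst, and each ICMP sent costs ip_rt_error_cost.
 */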
1538 static int ip_error(struct sk_buff *skb)
1539 {
1540         struct rtable *rt = skb_rtable(skb);
1541         struct inet_peer *peer;
1542         unsigned long now;
1543         bool send;
1544         int code;
1545
1546         switch (rt->dst.error) {
1547         case EINVAL:
1548         default:
1549                 goto out;
1550         case EHOSTUNREACH:
1551                 code = ICMP_HOST_UNREACH;
1552                 break;
1553         case ENETUNREACH:
1554                 code = ICMP_NET_UNREACH;
1555                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1556                                 IPSTATS_MIB_INNOROUTES);
1557                 break;
1558         case EACCES:
1559                 code = ICMP_PKT_FILTERED;
1560                 break;
1561         }
1562
1563         if (!rt->peer)
1564                 rt_bind_peer(rt, rt->rt_dst, 1);
1565         peer = rt->peer;
1566
1567         send = true;
1568         if (peer) {
1569                 now = jiffies;
1570                 peer->rate_tokens += now - peer->rate_last;
1571                 if (peer->rate_tokens > ip_rt_error_burst)
1572                         peer->rate_tokens = ip_rt_error_burst;
1573                 peer->rate_last = now;
1574                 if (peer->rate_tokens >= ip_rt_error_cost)
1575                         peer->rate_tokens -= ip_rt_error_cost;
1576                 else
1577                         send = false;
1578         }
1579         if (send)
1580                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1581
1582 out:    kfree_skb(skb);
1583         return 0;
1584 }
1585
1586 /*
1587  *      The plateau table is based on RFC 1191; the last two values are
1588  *      not from the RFC but are needed for AMPRnet AX.25 paths.
1589  */
1590
1591 static const unsigned short mtu_plateau[] =
1592 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1593
1594 static inline unsigned short guess_mtu(unsigned short old_mtu)
1595 {
1596         int i;
1597
1598         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1599                 if (old_mtu > mtu_plateau[i])
1600                         return mtu_plateau[i];
1601         return 68;
1602 }
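/* Worked examples against the plateau table above:
 *      guess_mtu(1500) -> 1492   (largest plateau strictly below 1500)
 *      guess_mtu(296)  -> 216
 *      guess_mtu(100)  -> 68     (nothing smaller fits; fall back to 68)
 */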
1603
1604 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1605                                  unsigned short new_mtu,
1606                                  struct net_device *dev)
1607 {
1608         unsigned short old_mtu = ntohs(iph->tot_len);
1609         unsigned short est_mtu = 0;
1610         struct inet_peer *peer;
1611
1612         peer = inet_getpeer_v4(iph->daddr, 1);
1613         if (peer) {
1614                 unsigned short mtu = new_mtu;
1615
1616                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1617                         /* BSD 4.2 derived systems incorrectly adjust
1618                          * tot_len by the IP header length, and report
1619                          * a zero MTU in the ICMP message.
1620                          */
1621                         if (mtu == 0 &&
1622                             old_mtu >= 68 + (iph->ihl << 2))
1623                                 old_mtu -= iph->ihl << 2;
1624                         mtu = guess_mtu(old_mtu);
1625                 }
1626
1627                 if (mtu < ip_rt_min_pmtu)
1628                         mtu = ip_rt_min_pmtu;
1629                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1630                         unsigned long pmtu_expires;
1631
1632                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1633                         if (!pmtu_expires)
1634                                 pmtu_expires = 1UL;
1635
1636                         est_mtu = mtu;
1637                         peer->pmtu_learned = mtu;
1638                         peer->pmtu_expires = pmtu_expires;
1639                         atomic_inc(&__rt_peer_genid);
1640                 }
1641
1642                 inet_putpeer(peer);
1643         }
1644         return est_mtu ? : new_mtu;
1645 }
1646
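/* Apply a still-valid learned PMTU from the peer to this dst (saving the
 * original MTU first), or restore the original MTU once the learned value
 * has expired; cmpxchg() ensures only one CPU performs the restore.
 */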
1647 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1648 {
1649         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1650
1651         if (!expires)
1652                 return;
1653         if (time_before(jiffies, expires)) {
1654                 u32 orig_dst_mtu = dst_mtu(dst);
1655                 if (peer->pmtu_learned < orig_dst_mtu) {
1656                         if (!peer->pmtu_orig)
1657                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1658                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1659                 }
1660         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1661                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1662 }
1663
1664 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1665 {
1666         struct rtable *rt = (struct rtable *) dst;
1667         struct inet_peer *peer;
1668
1669         dst_confirm(dst);
1670
1671         if (!rt->peer)
1672                 rt_bind_peer(rt, rt->rt_dst, 1);
1673         peer = rt->peer;
1674         if (peer) {
1675                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1676
1677                 if (mtu < ip_rt_min_pmtu)
1678                         mtu = ip_rt_min_pmtu;
1679                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1680
1681                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1682                         if (!pmtu_expires)
1683                                 pmtu_expires = 1UL;
1684
1685                         peer->pmtu_learned = mtu;
1686                         peer->pmtu_expires = pmtu_expires;
1687
1688                         atomic_inc(&__rt_peer_genid);
1689                         rt->rt_peer_genid = rt_peer_genid();
1690                 }
1691                 check_peer_pmtu(dst, peer);
1692         }
1693 }
1694
1695
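/* Re-sync a cached route with its inet_peer whenever the global peer
 * generation counter has moved: re-validate the learned PMTU and any
 * learned redirect; a NULL return tells the caller to treat the cached
 * entry as stale.
 */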
1696 static struct rtable *ipv4_validate_peer(struct rtable *rt)
1697 {
1698         if (rt->rt_peer_genid != rt_peer_genid()) {
1699                 struct inet_peer *peer;
1700
1701                 if (!rt->peer)
1702                         rt_bind_peer(rt, rt->rt_dst, 0);
1703
1704                 peer = rt->peer;
1705                 if (peer) {
1706                         check_peer_pmtu(&rt->dst, peer);
1707
1708                         if (peer->redirect_genid != redirect_genid)
1709                                 peer->redirect_learned.a4 = 0;
1710                         if (peer->redirect_learned.a4 &&
1711                             peer->redirect_learned.a4 != rt->rt_gateway) {
1712                                 if (check_peer_redir(&rt->dst, peer))
1713                                         return NULL;
1714                         }
1715                 }
1716
1717                 rt->rt_peer_genid = rt_peer_genid();
1718         }
1719         return rt;
1720 }
1721
1722 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1723 {
1724         struct rtable *rt = (struct rtable *) dst;
1725
1726         if (rt_is_expired(rt))
1727                 return NULL;
1728         dst = (struct dst_entry *) ipv4_validate_peer(rt);
1729         return dst;
1730 }
1731
1732 static void ipv4_dst_destroy(struct dst_entry *dst)
1733 {
1734         struct rtable *rt = (struct rtable *) dst;
1735         struct inet_peer *peer = rt->peer;
1736
1737         if (rt->fi) {
1738                 fib_info_put(rt->fi);
1739                 rt->fi = NULL;
1740         }
1741         if (peer) {
1742                 rt->peer = NULL;
1743                 inet_putpeer(peer);
1744         }
1745 }
1746
1747
1748 static void ipv4_link_failure(struct sk_buff *skb)
1749 {
1750         struct rtable *rt;
1751
1752         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1753
1754         rt = skb_rtable(skb);
1755         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1756                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1757 }
1758
1759 static int ip_rt_bug(struct sk_buff *skb)
1760 {
1761         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1762                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1763                 skb->dev ? skb->dev->name : "?");
1764         kfree_skb(skb);
1765         WARN_ON(1);
1766         return 0;
1767 }
1768
1769 /*
1770    We do not cache the source address of the outgoing interface,
1771    because it is used only by the IP RR, TS and SRR options,
1772    so it is out of the fast path.
1773
1774    BTW remember: "addr" is allowed to be unaligned
1775    in IP options!
1776  */
1777
1778 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1779 {
1780         __be32 src;
1781
1782         if (rt_is_output_route(rt))
1783                 src = ip_hdr(skb)->saddr;
1784         else {
1785                 struct fib_result res;
1786                 struct flowi4 fl4;
1787                 struct iphdr *iph;
1788
1789                 iph = ip_hdr(skb);
1790
1791                 memset(&fl4, 0, sizeof(fl4));
1792                 fl4.daddr = iph->daddr;
1793                 fl4.saddr = iph->saddr;
1794                 fl4.flowi4_tos = RT_TOS(iph->tos);
1795                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1796                 fl4.flowi4_iif = skb->dev->ifindex;
1797                 fl4.flowi4_mark = skb->mark;
1798
1799                 rcu_read_lock();
1800                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1801                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1802                 else
1803                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1804                                         RT_SCOPE_UNIVERSE);
1805                 rcu_read_unlock();
1806         }
1807         memcpy(addr, &src, 4);
1808 }
1809
1810 #ifdef CONFIG_IP_ROUTE_CLASSID
1811 static void set_class_tag(struct rtable *rt, u32 tag)
1812 {
1813         if (!(rt->dst.tclassid & 0xFFFF))
1814                 rt->dst.tclassid |= tag & 0xFFFF;
1815         if (!(rt->dst.tclassid & 0xFFFF0000))
1816                 rt->dst.tclassid |= tag & 0xFFFF0000;
1817 }
1818 #endif
1819
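/* Advertised MSS for this route: the cached RTAX_ADVMSS metric if set,
 * otherwise the device MTU minus 40 bytes of IPv4 + TCP headers, clamped
 * between ip_rt_min_advmss and 65535 - 40.
 */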
1820 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1821 {
1822         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1823
1824         if (advmss == 0) {
1825                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1826                                ip_rt_min_advmss);
1827                 if (advmss > 65535 - 40)
1828                         advmss = 65535 - 40;
1829         }
1830         return advmss;
1831 }
1832
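/* Path MTU for this route: the cached RTAX_MTU metric for output routes,
 * otherwise the device MTU, dropped to 576 when the metric is locked and
 * the route goes through a gateway, and never more than IP_MAX_MTU.
 */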
1833 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1834 {
1835         const struct rtable *rt = (const struct rtable *) dst;
1836         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1837
1838         if (mtu && rt_is_output_route(rt))
1839                 return mtu;
1840
1841         mtu = dst->dev->mtu;
1842
1843         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1844
1845                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1846                         mtu = 576;
1847         }
1848
1849         if (mtu > IP_MAX_MTU)
1850                 mtu = IP_MAX_MTU;
1851
1852         return mtu;
1853 }
1854
1855 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1856                             struct fib_info *fi)
1857 {
1858         struct inet_peer *peer;
1859         int create = 0;
1860
1861         /* If a peer entry exists for this destination, we must hook
1862          * it up in order to get at cached metrics.
1863          */
1864         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1865                 create = 1;
1866
1867         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1868         if (peer) {
1869                 rt->rt_peer_genid = rt_peer_genid();
1870                 if (inet_metrics_new(peer))
1871                         memcpy(peer->metrics, fi->fib_metrics,
1872                                sizeof(u32) * RTAX_MAX);
1873                 dst_init_metrics(&rt->dst, peer->metrics, false);
1874
1875                 check_peer_pmtu(&rt->dst, peer);
1876                 if (peer->redirect_genid != redirect_genid)
1877                         peer->redirect_learned.a4 = 0;
1878                 if (peer->redirect_learned.a4 &&
1879                     peer->redirect_learned.a4 != rt->rt_gateway) {
1880                         rt->rt_gateway = peer->redirect_learned.a4;
1881                         rt->rt_flags |= RTCF_REDIRECTED;
1882                 }
1883         } else {
1884                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1885                         rt->fi = fi;
1886                         atomic_inc(&fi->fib_clntref);
1887                 }
1888                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1889         }
1890 }
1891
1892 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1893                            const struct fib_result *res,
1894                            struct fib_info *fi, u16 type, u32 itag)
1895 {
1896         struct dst_entry *dst = &rt->dst;
1897
1898         if (fi) {
1899                 if (FIB_RES_GW(*res) &&
1900                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1901                         rt->rt_gateway = FIB_RES_GW(*res);
1902                 rt_init_metrics(rt, fl4, fi);
1903 #ifdef CONFIG_IP_ROUTE_CLASSID
1904                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1905 #endif
1906         }
1907
1908         if (dst_mtu(dst) > IP_MAX_MTU)
1909                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1910         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1911                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1912
1913 #ifdef CONFIG_IP_ROUTE_CLASSID
1914 #ifdef CONFIG_IP_MULTIPLE_TABLES
1915         set_class_tag(rt, fib_rules_tclass(res));
1916 #endif
1917         set_class_tag(rt, itag);
1918 #endif
1919 }
1920
1921 static struct rtable *rt_dst_alloc(struct net_device *dev,
1922                                    bool nopolicy, bool noxfrm)
1923 {
1924         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1925                          DST_HOST |
1926                          (nopolicy ? DST_NOPOLICY : 0) |
1927                          (noxfrm ? DST_NOXFRM : 0));
1928 }
1929
1930 /* called in rcu_read_lock() section */
1931 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1932                                 u8 tos, struct net_device *dev, int our)
1933 {
1934         unsigned int hash;
1935         struct rtable *rth;
1936         __be32 spec_dst;
1937         struct in_device *in_dev = __in_dev_get_rcu(dev);
1938         u32 itag = 0;
1939         int err;
1940
1941         /* Primary sanity checks. */
1942
1943         if (in_dev == NULL)
1944                 return -EINVAL;
1945
1946         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1947             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1948                 goto e_inval;
1949
1950         if (ipv4_is_zeronet(saddr)) {
1951                 if (!ipv4_is_local_multicast(daddr))
1952                         goto e_inval;
1953                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1954         } else {
1955                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1956                                           &itag);
1957                 if (err < 0)
1958                         goto e_err;
1959         }
1960         rth = rt_dst_alloc(init_net.loopback_dev,
1961                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1962         if (!rth)
1963                 goto e_nobufs;
1964
1965 #ifdef CONFIG_IP_ROUTE_CLASSID
1966         rth->dst.tclassid = itag;
1967 #endif
1968         rth->dst.output = ip_rt_bug;
1969
1970         rth->rt_key_dst = daddr;
1971         rth->rt_key_src = saddr;
1972         rth->rt_genid   = rt_genid(dev_net(dev));
1973         rth->rt_flags   = RTCF_MULTICAST;
1974         rth->rt_type    = RTN_MULTICAST;
1975         rth->rt_key_tos = tos;
1976         rth->rt_dst     = daddr;
1977         rth->rt_src     = saddr;
1978         rth->rt_route_iif = dev->ifindex;
1979         rth->rt_iif     = dev->ifindex;
1980         rth->rt_oif     = 0;
1981         rth->rt_mark    = skb->mark;
1982         rth->rt_gateway = daddr;
1983         rth->rt_spec_dst= spec_dst;
1984         rth->rt_peer_genid = 0;
1985         rth->peer = NULL;
1986         rth->fi = NULL;
1987         if (our) {
1988                 rth->dst.input= ip_local_deliver;
1989                 rth->rt_flags |= RTCF_LOCAL;
1990         }
1991
1992 #ifdef CONFIG_IP_MROUTE
1993         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1994                 rth->dst.input = ip_mr_input;
1995 #endif
1996         RT_CACHE_STAT_INC(in_slow_mc);
1997
1998         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1999         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2000         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2001
2002 e_nobufs:
2003         return -ENOBUFS;
2004 e_inval:
2005         return -EINVAL;
2006 e_err:
2007         return err;
2008 }
2009
2010
2011 static void ip_handle_martian_source(struct net_device *dev,
2012                                      struct in_device *in_dev,
2013                                      struct sk_buff *skb,
2014                                      __be32 daddr,
2015                                      __be32 saddr)
2016 {
2017         RT_CACHE_STAT_INC(in_martian_src);
2018 #ifdef CONFIG_IP_ROUTE_VERBOSE
2019         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2020                 /*
2021                  *      RFC 1812 recommendation: if the source is martian,
2022                  *      the only hint we can log is the MAC header.
2023                  */
2024                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2025                         &daddr, &saddr, dev->name);
2026                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2027                         int i;
2028                         const unsigned char *p = skb_mac_header(skb);
2029                         printk(KERN_WARNING "ll header: ");
2030                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2031                                 printk("%02x", *p);
2032                                 if (i < (dev->hard_header_len - 1))
2033                                         printk(":");
2034                         }
2035                         printk("\n");
2036                 }
2037         }
2038 #endif
2039 }
2040
2041 /* called in rcu_read_lock() section */
2042 static int __mkroute_input(struct sk_buff *skb,
2043                            const struct fib_result *res,
2044                            struct in_device *in_dev,
2045                            __be32 daddr, __be32 saddr, u32 tos,
2046                            struct rtable **result)
2047 {
2048         struct rtable *rth;
2049         int err;
2050         struct in_device *out_dev;
2051         unsigned int flags = 0;
2052         __be32 spec_dst;
2053         u32 itag;
2054
2055         /* get a working reference to the output device */
2056         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2057         if (out_dev == NULL) {
2058                 if (net_ratelimit())
2059                         printk(KERN_CRIT "Bug in ip_route_input" \
2060                                "_slow(). Please, report\n");
2061                 return -EINVAL;
2062         }
2063
2064
2065         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2066                                   in_dev->dev, &spec_dst, &itag);
2067         if (err < 0) {
2068                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2069                                          saddr);
2070
2071                 goto cleanup;
2072         }
2073
2074         if (err)
2075                 flags |= RTCF_DIRECTSRC;
2076
2077         if (out_dev == in_dev && err &&
2078             (IN_DEV_SHARED_MEDIA(out_dev) ||
2079              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2080                 flags |= RTCF_DOREDIRECT;
2081
2082         if (skb->protocol != htons(ETH_P_IP)) {
2083                 /* Not IP (i.e. ARP). Do not create a route if it is
2084                  * invalid for proxy ARP. DNAT routes are always valid.
2085                  *
2086                  * The proxy ARP feature has been extended to allow ARP
2087                  * replies back on the same interface, to support
2088                  * Private VLAN switch technologies. See arp.c.
2089                  */
2090                 if (out_dev == in_dev &&
2091                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2092                         err = -EINVAL;
2093                         goto cleanup;
2094                 }
2095         }
2096
2097         rth = rt_dst_alloc(out_dev->dev,
2098                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2099                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2100         if (!rth) {
2101                 err = -ENOBUFS;
2102                 goto cleanup;
2103         }
2104
2105         rth->rt_key_dst = daddr;
2106         rth->rt_key_src = saddr;
2107         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2108         rth->rt_flags = flags;
2109         rth->rt_type = res->type;
2110         rth->rt_key_tos = tos;
2111         rth->rt_dst     = daddr;
2112         rth->rt_src     = saddr;
2113         rth->rt_route_iif = in_dev->dev->ifindex;
2114         rth->rt_iif     = in_dev->dev->ifindex;
2115         rth->rt_oif     = 0;
2116         rth->rt_mark    = skb->mark;
2117         rth->rt_gateway = daddr;
2118         rth->rt_spec_dst= spec_dst;
2119         rth->rt_peer_genid = 0;
2120         rth->peer = NULL;
2121         rth->fi = NULL;
2122
2123         rth->dst.input = ip_forward;
2124         rth->dst.output = ip_output;
2125
2126         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2127
2128         *result = rth;
2129         err = 0;
2130  cleanup:
2131         return err;
2132 }
2133
2134 static int ip_mkroute_input(struct sk_buff *skb,
2135                             struct fib_result *res,
2136                             const struct flowi4 *fl4,
2137                             struct in_device *in_dev,
2138                             __be32 daddr, __be32 saddr, u32 tos)
2139 {
2140         struct rtable* rth = NULL;
2141         int err;
2142         unsigned hash;
2143
2144 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2145         if (res->fi && res->fi->fib_nhs > 1)
2146                 fib_select_multipath(res);
2147 #endif
2148
2149         /* create a routing cache entry */
2150         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2151         if (err)
2152                 return err;
2153
2154         /* put it into the cache */
2155         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2156                        rt_genid(dev_net(rth->dst.dev)));
2157         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2158         if (IS_ERR(rth))
2159                 return PTR_ERR(rth);
2160         return 0;
2161 }
2162
2163 /*
2164  *      NOTE. We drop all packets that have a local source
2165  *      address, because every properly looped-back packet must
2166  *      already have the correct destination attached by the output routine.
2167  *
2168  *      This approach solves two big problems:
2169  *      1. Non-simplex devices are handled properly.
2170  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2171  *      Called with rcu_read_lock().
2172  */
2173
2174 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2175                                u8 tos, struct net_device *dev)
2176 {
2177         struct fib_result res;
2178         struct in_device *in_dev = __in_dev_get_rcu(dev);
2179         struct flowi4   fl4;
2180         unsigned        flags = 0;
2181         u32             itag = 0;
2182         struct rtable * rth;
2183         unsigned        hash;
2184         __be32          spec_dst;
2185         int             err = -EINVAL;
2186         struct net    * net = dev_net(dev);
2187
2188         /* IP on this device is disabled. */
2189
2190         if (!in_dev)
2191                 goto out;
2192
2193         /* Check for the weirdest martians, which cannot be detected
2194            by fib_lookup.
2195          */
2196
2197         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2198             ipv4_is_loopback(saddr))
2199                 goto martian_source;
2200
2201         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2202                 goto brd_input;
2203
2204         /* Accept zero addresses only for limited broadcast;
2205          * I do not even know whether to fix this or not. Waiting for complaints :-)
2206          */
2207         if (ipv4_is_zeronet(saddr))
2208                 goto martian_source;
2209
2210         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2211                 goto martian_destination;
2212
2213         /*
2214          *      Now we are ready to route the packet.
2215          */
2216         fl4.flowi4_oif = 0;
2217         fl4.flowi4_iif = dev->ifindex;
2218         fl4.flowi4_mark = skb->mark;
2219         fl4.flowi4_tos = tos;
2220         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2221         fl4.daddr = daddr;
2222         fl4.saddr = saddr;
2223         err = fib_lookup(net, &fl4, &res);
2224         if (err != 0) {
2225                 if (!IN_DEV_FORWARD(in_dev))
2226                         goto e_hostunreach;
2227                 goto no_route;
2228         }
2229
2230         RT_CACHE_STAT_INC(in_slow_tot);
2231
2232         if (res.type == RTN_BROADCAST)
2233                 goto brd_input;
2234
2235         if (res.type == RTN_LOCAL) {
2236                 err = fib_validate_source(skb, saddr, daddr, tos,
2237                                           net->loopback_dev->ifindex,
2238                                           dev, &spec_dst, &itag);
2239                 if (err < 0)
2240                         goto martian_source_keep_err;
2241                 if (err)
2242                         flags |= RTCF_DIRECTSRC;
2243                 spec_dst = daddr;
2244                 goto local_input;
2245         }
2246
2247         if (!IN_DEV_FORWARD(in_dev))
2248                 goto e_hostunreach;
2249         if (res.type != RTN_UNICAST)
2250                 goto martian_destination;
2251
2252         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2253 out:    return err;
2254
2255 brd_input:
2256         if (skb->protocol != htons(ETH_P_IP))
2257                 goto e_inval;
2258
2259         if (ipv4_is_zeronet(saddr))
2260                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2261         else {
2262                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2263                                           &itag);
2264                 if (err < 0)
2265                         goto martian_source_keep_err;
2266                 if (err)
2267                         flags |= RTCF_DIRECTSRC;
2268         }
2269         flags |= RTCF_BROADCAST;
2270         res.type = RTN_BROADCAST;
2271         RT_CACHE_STAT_INC(in_brd);
2272
2273 local_input:
2274         rth = rt_dst_alloc(net->loopback_dev,
2275                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2276         if (!rth)
2277                 goto e_nobufs;
2278
2279         rth->dst.input= ip_local_deliver;
2280         rth->dst.output= ip_rt_bug;
2281 #ifdef CONFIG_IP_ROUTE_CLASSID
2282         rth->dst.tclassid = itag;
2283 #endif
2284
2285         rth->rt_key_dst = daddr;
2286         rth->rt_key_src = saddr;
2287         rth->rt_genid = rt_genid(net);
2288         rth->rt_flags   = flags|RTCF_LOCAL;
2289         rth->rt_type    = res.type;
2290         rth->rt_key_tos = tos;
2291         rth->rt_dst     = daddr;
2292         rth->rt_src     = saddr;
2293 #ifdef CONFIG_IP_ROUTE_CLASSID
2294         rth->dst.tclassid = itag;
2295 #endif
2296         rth->rt_route_iif = dev->ifindex;
2297         rth->rt_iif     = dev->ifindex;
2298         rth->rt_oif     = 0;
2299         rth->rt_mark    = skb->mark;
2300         rth->rt_gateway = daddr;
2301         rth->rt_spec_dst= spec_dst;
2302         rth->rt_peer_genid = 0;
2303         rth->peer = NULL;
2304         rth->fi = NULL;
2305         if (res.type == RTN_UNREACHABLE) {
2306                 rth->dst.input= ip_error;
2307                 rth->dst.error= -err;
2308                 rth->rt_flags   &= ~RTCF_LOCAL;
2309         }
2310         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2311         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2312         err = 0;
2313         if (IS_ERR(rth))
2314                 err = PTR_ERR(rth);
2315         goto out;
2316
2317 no_route:
2318         RT_CACHE_STAT_INC(in_no_route);
2319         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2320         res.type = RTN_UNREACHABLE;
2321         if (err == -ESRCH)
2322                 err = -ENETUNREACH;
2323         goto local_input;
2324
2325         /*
2326          *      Do not cache martian addresses: they should be logged (RFC1812)
2327          */
2328 martian_destination:
2329         RT_CACHE_STAT_INC(in_martian_dst);
2330 #ifdef CONFIG_IP_ROUTE_VERBOSE
2331         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2332                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2333                         &daddr, &saddr, dev->name);
2334 #endif
2335
2336 e_hostunreach:
2337         err = -EHOSTUNREACH;
2338         goto out;
2339
2340 e_inval:
2341         err = -EINVAL;
2342         goto out;
2343
2344 e_nobufs:
2345         err = -ENOBUFS;
2346         goto out;
2347
2348 martian_source:
2349         err = -EINVAL;
2350 martian_source_keep_err:
2351         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2352         goto out;
2353 }
2354
2355 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2356                            u8 tos, struct net_device *dev, bool noref)
2357 {
2358         struct rtable * rth;
2359         unsigned        hash;
2360         int iif = dev->ifindex;
2361         struct net *net;
2362         int res;
2363
2364         net = dev_net(dev);
2365
2366         rcu_read_lock();
2367
2368         if (!rt_caching(net))
2369                 goto skip_cache;
2370
2371         tos &= IPTOS_RT_MASK;
2372         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2373
2374         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2375              rth = rcu_dereference(rth->dst.rt_next)) {
2376                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2377                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2378                      (rth->rt_route_iif ^ iif) |
2379                      (rth->rt_key_tos ^ tos)) == 0 &&
2380                     rth->rt_mark == skb->mark &&
2381                     net_eq(dev_net(rth->dst.dev), net) &&
2382                     !rt_is_expired(rth)) {
2383                         rth = ipv4_validate_peer(rth);
2384                         if (!rth)
2385                                 continue;
2386                         if (noref) {
2387                                 dst_use_noref(&rth->dst, jiffies);
2388                                 skb_dst_set_noref(skb, &rth->dst);
2389                         } else {
2390                                 dst_use(&rth->dst, jiffies);
2391                                 skb_dst_set(skb, &rth->dst);
2392                         }
2393                         RT_CACHE_STAT_INC(in_hit);
2394                         rcu_read_unlock();
2395                         return 0;
2396                 }
2397                 RT_CACHE_STAT_INC(in_hlist_search);
2398         }
2399
2400 skip_cache:
2401         /* Multicast recognition logic has moved from the route cache to here.
2402            The problem was that too many Ethernet cards have broken/missing
2403            hardware multicast filters :-( As a result, a host on a multicast
2404            network acquires a lot of useless route cache entries, e.g. from
2405            SDR messages from all over the world. Now we try to get rid of them.
2406            Really, provided the software IP multicast filter is organized
2407            reasonably (at least hashed), this does not cause a slowdown
2408            compared with route cache reject entries.
2409            Note that multicast routers are not affected, because a
2410            route cache entry is created eventually.
2411          */
2412         if (ipv4_is_multicast(daddr)) {
2413                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2414
2415                 if (in_dev) {
2416                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2417                                                   ip_hdr(skb)->protocol);
2418                         if (our
2419 #ifdef CONFIG_IP_MROUTE
2420                                 ||
2421                             (!ipv4_is_local_multicast(daddr) &&
2422                              IN_DEV_MFORWARD(in_dev))
2423 #endif
2424                            ) {
2425                                 int res = ip_route_input_mc(skb, daddr, saddr,
2426                                                             tos, dev, our);
2427                                 rcu_read_unlock();
2428                                 return res;
2429                         }
2430                 }
2431                 rcu_read_unlock();
2432                 return -EINVAL;
2433         }
2434         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2435         rcu_read_unlock();
2436         return res;
2437 }
2438 EXPORT_SYMBOL(ip_route_input_common);
2439
2440 /* called with rcu_read_lock() */
2441 static struct rtable *__mkroute_output(const struct fib_result *res,
2442                                        const struct flowi4 *fl4,
2443                                        __be32 orig_daddr, __be32 orig_saddr,
2444                                        int orig_oif, struct net_device *dev_out,
2445                                        unsigned int flags)
2446 {
2447         struct fib_info *fi = res->fi;
2448         u32 tos = RT_FL_TOS(fl4);
2449         struct in_device *in_dev;
2450         u16 type = res->type;
2451         struct rtable *rth;
2452
2453         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2454                 return ERR_PTR(-EINVAL);
2455
2456         if (ipv4_is_lbcast(fl4->daddr))
2457                 type = RTN_BROADCAST;
2458         else if (ipv4_is_multicast(fl4->daddr))
2459                 type = RTN_MULTICAST;
2460         else if (ipv4_is_zeronet(fl4->daddr))
2461                 return ERR_PTR(-EINVAL);
2462
2463         if (dev_out->flags & IFF_LOOPBACK)
2464                 flags |= RTCF_LOCAL;
2465
2466         in_dev = __in_dev_get_rcu(dev_out);
2467         if (!in_dev)
2468                 return ERR_PTR(-EINVAL);
2469
2470         if (type == RTN_BROADCAST) {
2471                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2472                 fi = NULL;
2473         } else if (type == RTN_MULTICAST) {
2474                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2475                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2476                                      fl4->flowi4_proto))
2477                         flags &= ~RTCF_LOCAL;
2478                 /* If a multicast route does not exist, use the
2479                  * default one, but do not use a gateway in this case.
2480                  * Yes, it is a hack.
2481                  */
2482                 if (fi && res->prefixlen < 4)
2483                         fi = NULL;
2484         }
2485
2486         rth = rt_dst_alloc(dev_out,
2487                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2488                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2489         if (!rth)
2490                 return ERR_PTR(-ENOBUFS);
2491
2492         rth->dst.output = ip_output;
2493
2494         rth->rt_key_dst = orig_daddr;
2495         rth->rt_key_src = orig_saddr;
2496         rth->rt_genid = rt_genid(dev_net(dev_out));
2497         rth->rt_flags   = flags;
2498         rth->rt_type    = type;
2499         rth->rt_key_tos = tos;
2500         rth->rt_dst     = fl4->daddr;
2501         rth->rt_src     = fl4->saddr;
2502         rth->rt_route_iif = 0;
2503         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2504         rth->rt_oif     = orig_oif;
2505         rth->rt_mark    = fl4->flowi4_mark;
2506         rth->rt_gateway = fl4->daddr;
2507         rth->rt_spec_dst= fl4->saddr;
2508         rth->rt_peer_genid = 0;
2509         rth->peer = NULL;
2510         rth->fi = NULL;
2511
2512         RT_CACHE_STAT_INC(out_slow_tot);
2513
2514         if (flags & RTCF_LOCAL) {
2515                 rth->dst.input = ip_local_deliver;
2516                 rth->rt_spec_dst = fl4->daddr;
2517         }
2518         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2519                 rth->rt_spec_dst = fl4->saddr;
2520                 if (flags & RTCF_LOCAL &&
2521                     !(dev_out->flags & IFF_LOOPBACK)) {
2522                         rth->dst.output = ip_mc_output;
2523                         RT_CACHE_STAT_INC(out_slow_mc);
2524                 }
2525 #ifdef CONFIG_IP_MROUTE
2526                 if (type == RTN_MULTICAST) {
2527                         if (IN_DEV_MFORWARD(in_dev) &&
2528                             !ipv4_is_local_multicast(fl4->daddr)) {
2529                                 rth->dst.input = ip_mr_input;
2530                                 rth->dst.output = ip_mc_output;
2531                         }
2532                 }
2533 #endif
2534         }
2535
2536         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2537
2538         return rth;
2539 }
2540
2541 /*
2542  * Major route resolver routine.
2543  * called with rcu_read_lock();
2544  */
2545
2546 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2547 {
2548         struct net_device *dev_out = NULL;
2549         u32 tos = RT_FL_TOS(fl4);
2550         unsigned int flags = 0;
2551         struct fib_result res;
2552         struct rtable *rth;
2553         __be32 orig_daddr;
2554         __be32 orig_saddr;
2555         int orig_oif;
2556
2557         res.fi          = NULL;
2558 #ifdef CONFIG_IP_MULTIPLE_TABLES
2559         res.r           = NULL;
2560 #endif
2561
2562         orig_daddr = fl4->daddr;
2563         orig_saddr = fl4->saddr;
2564         orig_oif = fl4->flowi4_oif;
2565
2566         fl4->flowi4_iif = net->loopback_dev->ifindex;
2567         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2568         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2569                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2570
2571         rcu_read_lock();
2572         if (fl4->saddr) {
2573                 rth = ERR_PTR(-EINVAL);
2574                 if (ipv4_is_multicast(fl4->saddr) ||
2575                     ipv4_is_lbcast(fl4->saddr) ||
2576                     ipv4_is_zeronet(fl4->saddr))
2577                         goto out;
2578
2579                 /* I removed the check for oif == dev_out->oif here.
2580                    It was wrong for two reasons:
2581                    1. ip_dev_find(net, saddr) can return the wrong iface if
2582                       saddr is assigned to multiple interfaces.
2583                    2. Moreover, we are allowed to send packets with a saddr
2584                       of another iface. --ANK
2585                  */
2586
2587                 if (fl4->flowi4_oif == 0 &&
2588                     (ipv4_is_multicast(fl4->daddr) ||
2589                      ipv4_is_lbcast(fl4->daddr))) {
2590                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2591                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2592                         if (dev_out == NULL)
2593                                 goto out;
2594
2595                         /* Special hack: the user can direct multicasts
2596                            and limited broadcast via the necessary interface
2597                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2598                            This hack is not just for fun, it allows
2599                            vic, vat and friends to work.
2600                            They bind a socket to loopback, set the ttl to zero
2601                            and expect that it will work.
2602                            From the viewpoint of the routing cache they are broken,
2603                            because we are not allowed to build a multicast path
2604                            with a loopback source address (the routing cache
2605                            cannot know that the ttl is zero, so the packet
2606                            will not leave this host and the route is valid).
2607                            Luckily, this hack is a good workaround.
2608                          */
2609
2610                         fl4->flowi4_oif = dev_out->ifindex;
2611                         goto make_route;
2612                 }
2613
2614                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2615                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2616                         if (!__ip_dev_find(net, fl4->saddr, false))
2617                                 goto out;
2618                 }
2619         }
2620
2621
2622         if (fl4->flowi4_oif) {
2623                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2624                 rth = ERR_PTR(-ENODEV);
2625                 if (dev_out == NULL)
2626                         goto out;
2627
2628                 /* RACE: Check return value of inet_select_addr instead. */
2629                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2630                         rth = ERR_PTR(-ENETUNREACH);
2631                         goto out;
2632                 }
2633                 if (ipv4_is_local_multicast(fl4->daddr) ||
2634                     ipv4_is_lbcast(fl4->daddr)) {
2635                         if (!fl4->saddr)
2636                                 fl4->saddr = inet_select_addr(dev_out, 0,
2637                                                               RT_SCOPE_LINK);
2638                         goto make_route;
2639                 }
2640                 if (fl4->saddr) {
2641                         if (ipv4_is_multicast(fl4->daddr))
2642                                 fl4->saddr = inet_select_addr(dev_out, 0,
2643                                                               fl4->flowi4_scope);
2644                         else if (!fl4->daddr)
2645                                 fl4->saddr = inet_select_addr(dev_out, 0,
2646                                                               RT_SCOPE_HOST);
2647                 }
2648         }
2649
2650         if (!fl4->daddr) {
2651                 fl4->daddr = fl4->saddr;
2652                 if (!fl4->daddr)
2653                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2654                 dev_out = net->loopback_dev;
2655                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2656                 res.type = RTN_LOCAL;
2657                 flags |= RTCF_LOCAL;
2658                 goto make_route;
2659         }
2660
2661         if (fib_lookup(net, fl4, &res)) {
2662                 res.fi = NULL;
2663                 if (fl4->flowi4_oif) {
2664                         /* Apparently, the routing tables are wrong. Assume
2665                            that the destination is on-link.
2666
2667                            WHY? DW.
2668                            Because we are allowed to send to an iface
2669                            even if it has NO routes and NO assigned
2670                            addresses. When oif is specified, the routing
2671                            tables are looked up with only one purpose:
2672                            to catch whether the destination is gatewayed rather
2673                            than direct. Moreover, if MSG_DONTROUTE is set,
2674                            we send the packet, ignoring both the routing tables
2675                            and the ifaddr state. --ANK
2676
2677
2678                            We could do this even if oif is unknown,
2679                            as IPv6 likely does, but we do not.
2680                          */
2681
2682                         if (fl4->saddr == 0)
2683                                 fl4->saddr = inet_select_addr(dev_out, 0,
2684                                                               RT_SCOPE_LINK);
2685                         res.type = RTN_UNICAST;
2686                         goto make_route;
2687                 }
2688                 rth = ERR_PTR(-ENETUNREACH);
2689                 goto out;
2690         }
2691
2692         if (res.type == RTN_LOCAL) {
2693                 if (!fl4->saddr) {
2694                         if (res.fi->fib_prefsrc)
2695                                 fl4->saddr = res.fi->fib_prefsrc;
2696                         else
2697                                 fl4->saddr = fl4->daddr;
2698                 }
2699                 dev_out = net->loopback_dev;
2700                 fl4->flowi4_oif = dev_out->ifindex;
2701                 res.fi = NULL;
2702                 flags |= RTCF_LOCAL;
2703                 goto make_route;
2704         }
2705
2706 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2707         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2708                 fib_select_multipath(&res);
2709         else
2710 #endif
2711         if (!res.prefixlen &&
2712             res.table->tb_num_default > 1 &&
2713             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2714                 fib_select_default(&res);
2715
2716         if (!fl4->saddr)
2717                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2718
2719         dev_out = FIB_RES_DEV(res);
2720         fl4->flowi4_oif = dev_out->ifindex;
2721
2722
2723 make_route:
2724         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2725                                dev_out, flags);
2726         if (!IS_ERR(rth)) {
2727                 unsigned int hash;
2728
2729                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2730                                rt_genid(dev_net(dev_out)));
2731                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2732         }
2733
2734 out:
2735         rcu_read_unlock();
2736         return rth;
2737 }
2738
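/* Output route lookup: search the route cache bucket for an entry matching
 * daddr/saddr/oif/mark/tos first, and fall back to ip_route_output_slow()
 * on a miss or when route caching is disabled.
 */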
2739 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2740 {
2741         struct rtable *rth;
2742         unsigned int hash;
2743
2744         if (!rt_caching(net))
2745                 goto slow_output;
2746
2747         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2748
2749         rcu_read_lock_bh();
2750         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2751                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2752                 if (rth->rt_key_dst == flp4->daddr &&
2753                     rth->rt_key_src == flp4->saddr &&
2754                     rt_is_output_route(rth) &&
2755                     rth->rt_oif == flp4->flowi4_oif &&
2756                     rth->rt_mark == flp4->flowi4_mark &&
2757                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2758                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2759                     net_eq(dev_net(rth->dst.dev), net) &&
2760                     !rt_is_expired(rth)) {
2761                         rth = ipv4_validate_peer(rth);
2762                         if (!rth)
2763                                 continue;
2764                         dst_use(&rth->dst, jiffies);
2765                         RT_CACHE_STAT_INC(out_hit);
2766                         rcu_read_unlock_bh();
2767                         if (!flp4->saddr)
2768                                 flp4->saddr = rth->rt_src;
2769                         if (!flp4->daddr)
2770                                 flp4->daddr = rth->rt_dst;
2771                         return rth;
2772                 }
2773                 RT_CACHE_STAT_INC(out_hlist_search);
2774         }
2775         rcu_read_unlock_bh();
2776
2777 slow_output:
2778         return ip_route_output_slow(net, flp4);
2779 }
2780 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2781
2782 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2783 {
2784         return NULL;
2785 }
2786
2787 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2788 {
2789         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2790
2791         return mtu ? : dst->dev->mtu;
2792 }
2793
2794 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2795 {
2796 }
2797
2798 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2799                                           unsigned long old)
2800 {
2801         return NULL;
2802 }
2803
2804 static struct dst_ops ipv4_dst_blackhole_ops = {
2805         .family                 =       AF_INET,
2806         .protocol               =       cpu_to_be16(ETH_P_IP),
2807         .destroy                =       ipv4_dst_destroy,
2808         .check                  =       ipv4_blackhole_dst_check,
2809         .mtu                    =       ipv4_blackhole_mtu,
2810         .default_advmss         =       ipv4_default_advmss,
2811         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2812         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2813         .neigh_lookup           =       ipv4_neigh_lookup,
2814 };
2815
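/* Clone @dst_orig into a "blackhole" route whose input and output handlers
 * simply discard packets, while copying the keys, metrics, peer and fib
 * info so the dst still looks like the original route to its users.
 */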
2816 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2817 {
2818         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2819         struct rtable *ort = (struct rtable *) dst_orig;
2820
2821         if (rt) {
2822                 struct dst_entry *new = &rt->dst;
2823
2824                 new->__use = 1;
2825                 new->input = dst_discard;
2826                 new->output = dst_discard;
2827                 dst_copy_metrics(new, &ort->dst);
2828
2829                 new->dev = ort->dst.dev;
2830                 if (new->dev)
2831                         dev_hold(new->dev);
2832
2833                 rt->rt_key_dst = ort->rt_key_dst;
2834                 rt->rt_key_src = ort->rt_key_src;
2835                 rt->rt_key_tos = ort->rt_key_tos;
2836                 rt->rt_route_iif = ort->rt_route_iif;
2837                 rt->rt_iif = ort->rt_iif;
2838                 rt->rt_oif = ort->rt_oif;
2839                 rt->rt_mark = ort->rt_mark;
2840
2841                 rt->rt_genid = rt_genid(net);
2842                 rt->rt_flags = ort->rt_flags;
2843                 rt->rt_type = ort->rt_type;
2844                 rt->rt_dst = ort->rt_dst;
2845                 rt->rt_src = ort->rt_src;
2846                 rt->rt_gateway = ort->rt_gateway;
2847                 rt->rt_spec_dst = ort->rt_spec_dst;
2848                 rt->peer = ort->peer;
2849                 if (rt->peer)
2850                         atomic_inc(&rt->peer->refcnt);
2851                 rt->fi = ort->fi;
2852                 if (rt->fi)
2853                         atomic_inc(&rt->fi->fib_clntref);
2854
2855                 dst_free(new);
2856         }
2857
2858         dst_release(dst_orig);
2859
2860         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2861 }
2862
2863 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2864                                     struct sock *sk)
2865 {
2866         struct rtable *rt = __ip_route_output_key(net, flp4);
2867
2868         if (IS_ERR(rt))
2869                 return rt;
2870
2871         if (flp4->flowi4_proto)
2872                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2873                                                    flowi4_to_flowi(flp4),
2874                                                    sk, 0);
2875
2876         return rt;
2877 }
2878 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2879
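/* Build an rtnetlink route message for @rt: route keys, flags and metrics,
 * plus cache info (IP id, TCP timestamp age, time until the learned PMTU
 * expires) taken from the bound inet_peer, if any.
 */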
2880 static int rt_fill_info(struct net *net,
2881                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2882                         int nowait, unsigned int flags)
2883 {
2884         struct rtable *rt = skb_rtable(skb);
2885         struct rtmsg *r;
2886         struct nlmsghdr *nlh;
2887         unsigned long expires = 0;
2888         const struct inet_peer *peer = rt->peer;
2889         u32 id = 0, ts = 0, tsage = 0, error;
2890
2891         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2892         if (nlh == NULL)
2893                 return -EMSGSIZE;
2894
2895         r = nlmsg_data(nlh);
2896         r->rtm_family    = AF_INET;
2897         r->rtm_dst_len  = 32;
2898         r->rtm_src_len  = 0;
2899         r->rtm_tos      = rt->rt_key_tos;
2900         r->rtm_table    = RT_TABLE_MAIN;
2901         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2902         r->rtm_type     = rt->rt_type;
2903         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2904         r->rtm_protocol = RTPROT_UNSPEC;
2905         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2906         if (rt->rt_flags & RTCF_NOTIFY)
2907                 r->rtm_flags |= RTM_F_NOTIFY;
2908
2909         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2910
2911         if (rt->rt_key_src) {
2912                 r->rtm_src_len = 32;
2913                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2914         }
2915         if (rt->dst.dev)
2916                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2917 #ifdef CONFIG_IP_ROUTE_CLASSID
2918         if (rt->dst.tclassid)
2919                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2920 #endif
2921         if (rt_is_input_route(rt))
2922                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2923         else if (rt->rt_src != rt->rt_key_src)
2924                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2925
2926         if (rt->rt_dst != rt->rt_gateway)
2927                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2928
2929         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2930                 goto nla_put_failure;
2931
2932         if (rt->rt_mark)
2933                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2934
2935         error = rt->dst.error;
2936         if (peer) {
2937                 inet_peer_refcheck(rt->peer);
2938                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2939                 if (peer->tcp_ts_stamp) {
2940                         ts = peer->tcp_ts;
2941                         tsage = get_seconds() - peer->tcp_ts_stamp;
2942                 }
2943                 expires = ACCESS_ONCE(peer->pmtu_expires);
2944                 if (expires) {
2945                         if (time_before(jiffies, expires))
2946                                 expires -= jiffies;
2947                         else
2948                                 expires = 0;
2949                 }
2950         }
2951
2952         if (rt_is_input_route(rt)) {
2953 #ifdef CONFIG_IP_MROUTE
2954                 __be32 dst = rt->rt_dst;
2955
2956                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2957                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2958                         int err = ipmr_get_route(net, skb,
2959                                                  rt->rt_src, rt->rt_dst,
2960                                                  r, nowait);
2961                         if (err <= 0) {
2962                                 if (!nowait) {
2963                                         if (err == 0)
2964                                                 return 0;
2965                                         goto nla_put_failure;
2966                                 } else {
2967                                         if (err == -EMSGSIZE)
2968                                                 goto nla_put_failure;
2969                                         error = err;
2970                                 }
2971                         }
2972                 } else
2973 #endif
2974                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2975         }
2976
2977         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2978                                expires, error) < 0)
2979                 goto nla_put_failure;
2980
2981         return nlmsg_end(skb, nlh);
2982
2983 nla_put_failure:
2984         nlmsg_cancel(skb, nlh);
2985         return -EMSGSIZE;
2986 }
2987
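/*
 * RTM_GETROUTE handler: build an skb with minimal dummy headers, resolve
 * the requested route either as an input route (when RTA_IIF is given) or
 * via ip_route_output_key(), and unicast the result back to the requester
 * with rt_fill_info().
 */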
2988 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2989 {
2990         struct net *net = sock_net(in_skb->sk);
2991         struct rtmsg *rtm;
2992         struct nlattr *tb[RTA_MAX+1];
2993         struct rtable *rt = NULL;
2994         __be32 dst = 0;
2995         __be32 src = 0;
2996         u32 iif;
2997         int err;
2998         int mark;
2999         struct sk_buff *skb;
3000
3001         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3002         if (err < 0)
3003                 goto errout;
3004
3005         rtm = nlmsg_data(nlh);
3006
3007         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3008         if (skb == NULL) {
3009                 err = -ENOBUFS;
3010                 goto errout;
3011         }
3012
3013         /* Reserve room for dummy headers; this skb can pass
3014            through a good chunk of the routing engine.
3015          */
3016         skb_reset_mac_header(skb);
3017         skb_reset_network_header(skb);
3018
3019         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3020         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3021         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3022
3023         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3024         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3025         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3026         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3027
3028         if (iif) {
3029                 struct net_device *dev;
3030
3031                 dev = __dev_get_by_index(net, iif);
3032                 if (dev == NULL) {
3033                         err = -ENODEV;
3034                         goto errout_free;
3035                 }
3036
3037                 skb->protocol   = htons(ETH_P_IP);
3038                 skb->dev        = dev;
3039                 skb->mark       = mark;
3040                 local_bh_disable();
3041                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3042                 local_bh_enable();
3043
3044                 rt = skb_rtable(skb);
3045                 if (err == 0 && rt->dst.error)
3046                         err = -rt->dst.error;
3047         } else {
3048                 struct flowi4 fl4 = {
3049                         .daddr = dst,
3050                         .saddr = src,
3051                         .flowi4_tos = rtm->rtm_tos,
3052                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3053                         .flowi4_mark = mark,
3054                 };
3055                 rt = ip_route_output_key(net, &fl4);
3056
3057                 err = 0;
3058                 if (IS_ERR(rt))
3059                         err = PTR_ERR(rt);
3060         }
3061
3062         if (err)
3063                 goto errout_free;
3064
3065         skb_dst_set(skb, &rt->dst);
3066         if (rtm->rtm_flags & RTM_F_NOTIFY)
3067                 rt->rt_flags |= RTCF_NOTIFY;
3068
3069         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3070                            RTM_NEWROUTE, 0, 0);
3071         if (err <= 0)
3072                 goto errout_free;
3073
3074         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3075 errout:
3076         return err;
3077
3078 errout_free:
3079         kfree_skb(skb);
3080         goto errout;
3081 }
3082
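/*
 * Dump the route cache: walk every hash chain under rcu_read_lock_bh()
 * and emit one RTM_NEWROUTE message per non-expired entry belonging to
 * this namespace, resuming from cb->args[] on each callback invocation.
 */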
3083 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3084 {
3085         struct rtable *rt;
3086         int h, s_h;
3087         int idx, s_idx;
3088         struct net *net;
3089
3090         net = sock_net(skb->sk);
3091
3092         s_h = cb->args[0];
3093         if (s_h < 0)
3094                 s_h = 0;
3095         s_idx = idx = cb->args[1];
3096         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3097                 if (!rt_hash_table[h].chain)
3098                         continue;
3099                 rcu_read_lock_bh();
3100                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3101                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3102                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3103                                 continue;
3104                         if (rt_is_expired(rt))
3105                                 continue;
3106                         skb_dst_set_noref(skb, &rt->dst);
3107                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3108                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3109                                          1, NLM_F_MULTI) <= 0) {
3110                                 skb_dst_drop(skb);
3111                                 rcu_read_unlock_bh();
3112                                 goto done;
3113                         }
3114                         skb_dst_drop(skb);
3115                 }
3116                 rcu_read_unlock_bh();
3117         }
3118
3119 done:
3120         cb->args[0] = h;
3121         cb->args[1] = idx;
3122         return skb->len;
3123 }
3124
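/* A device's multicast configuration changed; flush the cached routes. */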
3125 void ip_rt_multicast_event(struct in_device *in_dev)
3126 {
3127         rt_cache_flush(dev_net(in_dev->dev), 0);
3128 }
3129
3130 #ifdef CONFIG_SYSCTL
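/*
 * Handler for the write-only "flush" sysctl: read the requested delay
 * through a private copy of the table, then flush this namespace's cache.
 */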
3131 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3132                                         void __user *buffer,
3133                                         size_t *lenp, loff_t *ppos)
3134 {
3135         if (write) {
3136                 int flush_delay;
3137                 ctl_table ctl;
3138                 struct net *net;
3139
3140                 memcpy(&ctl, __ctl, sizeof(ctl));
3141                 ctl.data = &flush_delay;
3142                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3143
3144                 net = (struct net *)__ctl->extra1;
3145                 rt_cache_flush(net, flush_delay);
3146                 return 0;
3147         }
3148
3149         return -EINVAL;
3150 }
3151
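/* Global tunables exposed under /proc/sys/net/ipv4/route/. */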
3152 static ctl_table ipv4_route_table[] = {
3153         {
3154                 .procname       = "gc_thresh",
3155                 .data           = &ipv4_dst_ops.gc_thresh,
3156                 .maxlen         = sizeof(int),
3157                 .mode           = 0644,
3158                 .proc_handler   = proc_dointvec,
3159         },
3160         {
3161                 .procname       = "max_size",
3162                 .data           = &ip_rt_max_size,
3163                 .maxlen         = sizeof(int),
3164                 .mode           = 0644,
3165                 .proc_handler   = proc_dointvec,
3166         },
3167         {
3168                 /* Deprecated. Use gc_min_interval_ms */
3169
3170                 .procname       = "gc_min_interval",
3171                 .data           = &ip_rt_gc_min_interval,
3172                 .maxlen         = sizeof(int),
3173                 .mode           = 0644,
3174                 .proc_handler   = proc_dointvec_jiffies,
3175         },
3176         {
3177                 .procname       = "gc_min_interval_ms",
3178                 .data           = &ip_rt_gc_min_interval,
3179                 .maxlen         = sizeof(int),
3180                 .mode           = 0644,
3181                 .proc_handler   = proc_dointvec_ms_jiffies,
3182         },
3183         {
3184                 .procname       = "gc_timeout",
3185                 .data           = &ip_rt_gc_timeout,
3186                 .maxlen         = sizeof(int),
3187                 .mode           = 0644,
3188                 .proc_handler   = proc_dointvec_jiffies,
3189         },
3190         {
3191                 .procname       = "redirect_load",
3192                 .data           = &ip_rt_redirect_load,
3193                 .maxlen         = sizeof(int),
3194                 .mode           = 0644,
3195                 .proc_handler   = proc_dointvec,
3196         },
3197         {
3198                 .procname       = "redirect_number",
3199                 .data           = &ip_rt_redirect_number,
3200                 .maxlen         = sizeof(int),
3201                 .mode           = 0644,
3202                 .proc_handler   = proc_dointvec,
3203         },
3204         {
3205                 .procname       = "redirect_silence",
3206                 .data           = &ip_rt_redirect_silence,
3207                 .maxlen         = sizeof(int),
3208                 .mode           = 0644,
3209                 .proc_handler   = proc_dointvec,
3210         },
3211         {
3212                 .procname       = "error_cost",
3213                 .data           = &ip_rt_error_cost,
3214                 .maxlen         = sizeof(int),
3215                 .mode           = 0644,
3216                 .proc_handler   = proc_dointvec,
3217         },
3218         {
3219                 .procname       = "error_burst",
3220                 .data           = &ip_rt_error_burst,
3221                 .maxlen         = sizeof(int),
3222                 .mode           = 0644,
3223                 .proc_handler   = proc_dointvec,
3224         },
3225         {
3226                 .procname       = "gc_elasticity",
3227                 .data           = &ip_rt_gc_elasticity,
3228                 .maxlen         = sizeof(int),
3229                 .mode           = 0644,
3230                 .proc_handler   = proc_dointvec,
3231         },
3232         {
3233                 .procname       = "mtu_expires",
3234                 .data           = &ip_rt_mtu_expires,
3235                 .maxlen         = sizeof(int),
3236                 .mode           = 0644,
3237                 .proc_handler   = proc_dointvec_jiffies,
3238         },
3239         {
3240                 .procname       = "min_pmtu",
3241                 .data           = &ip_rt_min_pmtu,
3242                 .maxlen         = sizeof(int),
3243                 .mode           = 0644,
3244                 .proc_handler   = proc_dointvec,
3245         },
3246         {
3247                 .procname       = "min_adv_mss",
3248                 .data           = &ip_rt_min_advmss,
3249                 .maxlen         = sizeof(int),
3250                 .mode           = 0644,
3251                 .proc_handler   = proc_dointvec,
3252         },
3253         { }
3254 };
3255
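/* Placeholder child table so the "neigh" directory exists even before any
 * per-protocol neighbour tables register their own entries under it. */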
3256 static struct ctl_table empty[1];
3257
3258 static struct ctl_table ipv4_skeleton[] =
3259 {
3260         { .procname = "route", 
3261           .mode = 0555, .child = ipv4_route_table},
3262         { .procname = "neigh", 
3263           .mode = 0555, .child = empty},
3264         { }
3265 };
3266
3267 static __net_initdata struct ctl_path ipv4_path[] = {
3268         { .procname = "net", },
3269         { .procname = "ipv4", },
3270         { },
3271 };
3272
3273 static struct ctl_table ipv4_route_flush_table[] = {
3274         {
3275                 .procname       = "flush",
3276                 .maxlen         = sizeof(int),
3277                 .mode           = 0200,
3278                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3279         },
3280         { },
3281 };
3282
3283 static __net_initdata struct ctl_path ipv4_route_path[] = {
3284         { .procname = "net", },
3285         { .procname = "ipv4", },
3286         { .procname = "route", },
3287         { },
3288 };
3289
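/*
 * Register the per-namespace "flush" sysctl.  Namespaces other than
 * init_net get their own kmemdup()'d copy of the table so that extra1
 * can point at the right struct net.
 */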
3290 static __net_init int sysctl_route_net_init(struct net *net)
3291 {
3292         struct ctl_table *tbl;
3293
3294         tbl = ipv4_route_flush_table;
3295         if (!net_eq(net, &init_net)) {
3296                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3297                 if (tbl == NULL)
3298                         goto err_dup;
3299         }
3300         tbl[0].extra1 = net;
3301
3302         net->ipv4.route_hdr =
3303                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3304         if (net->ipv4.route_hdr == NULL)
3305                 goto err_reg;
3306         return 0;
3307
3308 err_reg:
3309         if (tbl != ipv4_route_flush_table)
3310                 kfree(tbl);
3311 err_dup:
3312         return -ENOMEM;
3313 }
3314
3315 static __net_exit void sysctl_route_net_exit(struct net *net)
3316 {
3317         struct ctl_table *tbl;
3318
3319         tbl = net->ipv4.route_hdr->ctl_table_arg;
3320         unregister_net_sysctl_table(net->ipv4.route_hdr);
3321         BUG_ON(tbl == ipv4_route_flush_table);
3322         kfree(tbl);
3323 }
3324
3325 static __net_initdata struct pernet_operations sysctl_route_ops = {
3326         .init = sysctl_route_net_init,
3327         .exit = sysctl_route_net_exit,
3328 };
3329 #endif
3330
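/* Randomly seed the per-namespace route and device-address generation
 * counters used to invalidate stale cache entries. */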
3331 static __net_init int rt_genid_init(struct net *net)
3332 {
3333         get_random_bytes(&net->ipv4.rt_genid,
3334                          sizeof(net->ipv4.rt_genid));
3335         get_random_bytes(&net->ipv4.dev_addr_genid,
3336                          sizeof(net->ipv4.dev_addr_genid));
3337         return 0;
3338 }
3339
3340 static __net_initdata struct pernet_operations rt_genid_ops = {
3341         .init = rt_genid_init,
3342 };
3343
3344
3345 #ifdef CONFIG_IP_ROUTE_CLASSID
3346 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3347 #endif /* CONFIG_IP_ROUTE_CLASSID */
3348
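/* "rhash_entries=" boot parameter: overrides the route cache hash table size. */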
3349 static __initdata unsigned long rhash_entries;
3350 static int __init set_rhash_entries(char *str)
3351 {
3352         if (!str)
3353                 return 0;
3354         rhash_entries = simple_strtoul(str, &str, 0);
3355         return 1;
3356 }
3357 __setup("rhash_entries=", set_rhash_entries);
3358
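/*
 * Boot-time initialization: create the dst slab cache and entry counters,
 * size and allocate the route cache hash table, hook up /proc and rtnetlink
 * handlers, and register the per-namespace sysctl and genid operations.
 */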
3359 int __init ip_rt_init(void)
3360 {
3361         int rc = 0;
3362
3363 #ifdef CONFIG_IP_ROUTE_CLASSID
3364         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3365         if (!ip_rt_acct)
3366                 panic("IP: failed to allocate ip_rt_acct\n");
3367 #endif
3368
3369         ipv4_dst_ops.kmem_cachep =
3370                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3371                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3372
3373         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3374
3375         if (dst_entries_init(&ipv4_dst_ops) < 0)
3376                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3377
3378         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3379                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3380
3381         rt_hash_table = (struct rt_hash_bucket *)
3382                 alloc_large_system_hash("IP route cache",
3383                                         sizeof(struct rt_hash_bucket),
3384                                         rhash_entries,
3385                                         (totalram_pages >= 128 * 1024) ?
3386                                         15 : 17,
3387                                         0,
3388                                         &rt_hash_log,
3389                                         &rt_hash_mask,
3390                                         rhash_entries ? 0 : 512 * 1024);
3391         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3392         rt_hash_lock_init();
3393
3394         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3395         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3396
3397         devinet_init();
3398         ip_fib_init();
3399
3400         if (ip_rt_proc_init())
3401                 printk(KERN_ERR "Unable to create route proc files\n");
3402 #ifdef CONFIG_XFRM
3403         xfrm_init();
3404         xfrm4_init(ip_rt_max_size);
3405 #endif
3406         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3407
3408 #ifdef CONFIG_SYSCTL
3409         register_pernet_subsys(&sysctl_route_ops);
3410 #endif
3411         register_pernet_subsys(&rt_genid_ops);
3412         return rc;
3413 }
3414
3415 #ifdef CONFIG_SYSCTL
3416 /*
3417  * We really need to sanitize the damn ipv4 init order, then all
3418  * this nonsense will go away.
3419  */
3420 void __init ip_static_sysctl_init(void)
3421 {
3422         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3423 }
3424 #endif