net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <net/atmclip.h>
112 #include <net/secure_seq.h>
113
114 #define RT_FL_TOS(oldflp4) \
115         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
116
117 #define IP_MAX_MTU      0xFFF0
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
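/*
 * Boot-time defaults for the route-cache tuning knobs below; most of them
 * can also be changed at run time through the net.ipv4.route.* sysctls
 * (see the ipv4_route_table defined later in this file).
 */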
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
125 static int ip_rt_redirect_number __read_mostly  = 9;
126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly       = HZ;
129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly    = 8;
131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly       = 256;
134 static int rt_chain_length_max __read_mostly    = 20;
135 static int redirect_genid;
136
137 static struct delayed_work expires_work;
138 static unsigned long expires_ljiffies;
139
140 /*
141  *      Interface to generic destination cache.
142  */
143
144 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
145 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
146 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
147 static void              ipv4_dst_destroy(struct dst_entry *dst);
148 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
149 static void              ipv4_link_failure(struct sk_buff *skb);
150 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
151 static int rt_garbage_collect(struct dst_ops *ops);
152
153 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
154                             int how)
155 {
156 }
157
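/*
 * Copy-on-write for dst metrics: writable metrics live in the inet_peer.
 * On the first write the (read-only) metrics are copied into the peer's
 * array and dst->_metrics is switched over with cmpxchg(); the loser of a
 * race simply uses whatever pointer the winner installed, and the fib_info
 * reference is dropped once its metrics are no longer in use.
 */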
158 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
159 {
160         struct rtable *rt = (struct rtable *) dst;
161         struct inet_peer *peer;
162         u32 *p = NULL;
163
164         if (!rt->peer)
165                 rt_bind_peer(rt, rt->rt_dst, 1);
166
167         peer = rt->peer;
168         if (peer) {
169                 u32 *old_p = __DST_METRICS_PTR(old);
170                 unsigned long prev, new;
171
172                 p = peer->metrics;
173                 if (inet_metrics_new(peer))
174                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
175
176                 new = (unsigned long) p;
177                 prev = cmpxchg(&dst->_metrics, old, new);
178
179                 if (prev != old) {
180                         p = __DST_METRICS_PTR(prev);
181                         if (prev & DST_METRICS_READ_ONLY)
182                                 p = NULL;
183                 } else {
184                         if (rt->fi) {
185                                 fib_info_put(rt->fi);
186                                 rt->fi = NULL;
187                         }
188                 }
189         }
190         return p;
191 }
192
193 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
194
195 static struct dst_ops ipv4_dst_ops = {
196         .family =               AF_INET,
197         .protocol =             cpu_to_be16(ETH_P_IP),
198         .gc =                   rt_garbage_collect,
199         .check =                ipv4_dst_check,
200         .default_advmss =       ipv4_default_advmss,
201         .mtu =                  ipv4_mtu,
202         .cow_metrics =          ipv4_cow_metrics,
203         .destroy =              ipv4_dst_destroy,
204         .ifdown =               ipv4_dst_ifdown,
205         .negative_advice =      ipv4_negative_advice,
206         .link_failure =         ipv4_link_failure,
207         .update_pmtu =          ip_rt_update_pmtu,
208         .local_out =            __ip_local_out,
209         .neigh_lookup =         ipv4_neigh_lookup,
210 };
211
212 #define ECN_OR_COST(class)      TC_PRIO_##class
213
214 const __u8 ip_tos2prio[16] = {
215         TC_PRIO_BESTEFFORT,
216         ECN_OR_COST(BESTEFFORT),
217         TC_PRIO_BESTEFFORT,
218         ECN_OR_COST(BESTEFFORT),
219         TC_PRIO_BULK,
220         ECN_OR_COST(BULK),
221         TC_PRIO_BULK,
222         ECN_OR_COST(BULK),
223         TC_PRIO_INTERACTIVE,
224         ECN_OR_COST(INTERACTIVE),
225         TC_PRIO_INTERACTIVE,
226         ECN_OR_COST(INTERACTIVE),
227         TC_PRIO_INTERACTIVE_BULK,
228         ECN_OR_COST(INTERACTIVE_BULK),
229         TC_PRIO_INTERACTIVE_BULK,
230         ECN_OR_COST(INTERACTIVE_BULK)
231 };
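/*
 * The table is indexed by the four IP TOS bits (IPTOS_TOS(tos) >> 1), as
 * done by rt_tos2priority() in <net/route.h>.  For example, a TOS of 0x10
 * (IPTOS_LOWDELAY) gives index 8 and therefore TC_PRIO_INTERACTIVE.
 */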
232
233
234 /*
235  * Route cache.
236  */
237
238 /* The locking scheme is rather straightforward:
239  *
240  * 1) Read-Copy Update protects the buckets of the central route hash.
241  * 2) Only writers remove entries, and they hold the lock
242  *    as they look at rtable reference counts.
243  * 3) Only readers acquire references to rtable entries,
244  *    they do so with atomic increments and with the
245  *    lock held.
246  */
247
248 struct rt_hash_bucket {
249         struct rtable __rcu     *chain;
250 };
251
252 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
253         defined(CONFIG_PROVE_LOCKING)
254 /*
255  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
256  * The size of this table is a power of two and depends on the number of CPUs.
257  * (With lockdep the spinlock_t is quite big, so keep the table small there.)
258  */
259 #ifdef CONFIG_LOCKDEP
260 # define RT_HASH_LOCK_SZ        256
261 #else
262 # if NR_CPUS >= 32
263 #  define RT_HASH_LOCK_SZ       4096
264 # elif NR_CPUS >= 16
265 #  define RT_HASH_LOCK_SZ       2048
266 # elif NR_CPUS >= 8
267 #  define RT_HASH_LOCK_SZ       1024
268 # elif NR_CPUS >= 4
269 #  define RT_HASH_LOCK_SZ       512
270 # else
271 #  define RT_HASH_LOCK_SZ       256
272 # endif
273 #endif
274
275 static spinlock_t       *rt_hash_locks;
276 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
277
278 static __init void rt_hash_lock_init(void)
279 {
280         int i;
281
282         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
283                         GFP_KERNEL);
284         if (!rt_hash_locks)
285                 panic("IP: failed to allocate rt_hash_locks\n");
286
287         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
288                 spin_lock_init(&rt_hash_locks[i]);
289 }
290 #else
291 # define rt_hash_lock_addr(slot) NULL
292
293 static inline void rt_hash_lock_init(void)
294 {
295 }
296 #endif
297
298 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
299 static unsigned                 rt_hash_mask __read_mostly;
300 static unsigned int             rt_hash_log  __read_mostly;
301
302 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
303 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
304
305 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
306                                    int genid)
307 {
308         return jhash_3words((__force u32)daddr, (__force u32)saddr,
309                             idx, genid)
310                 & rt_hash_mask;
311 }
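/*
 * The per-namespace generation id is mixed into the hash (as the jhash
 * initval), so bumping rt_genid on a cache flush both scatters new entries
 * across different buckets and lets rt_is_expired() spot stale ones.
 */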
312
313 static inline int rt_genid(struct net *net)
314 {
315         return atomic_read(&net->ipv4.rt_genid);
316 }
317
318 #ifdef CONFIG_PROC_FS
319 struct rt_cache_iter_state {
320         struct seq_net_private p;
321         int bucket;
322         int genid;
323 };
324
325 static struct rtable *rt_cache_get_first(struct seq_file *seq)
326 {
327         struct rt_cache_iter_state *st = seq->private;
328         struct rtable *r = NULL;
329
330         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
331                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
332                         continue;
333                 rcu_read_lock_bh();
334                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
335                 while (r) {
336                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
337                             r->rt_genid == st->genid)
338                                 return r;
339                         r = rcu_dereference_bh(r->dst.rt_next);
340                 }
341                 rcu_read_unlock_bh();
342         }
343         return r;
344 }
345
346 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
347                                           struct rtable *r)
348 {
349         struct rt_cache_iter_state *st = seq->private;
350
351         r = rcu_dereference_bh(r->dst.rt_next);
352         while (!r) {
353                 rcu_read_unlock_bh();
354                 do {
355                         if (--st->bucket < 0)
356                                 return NULL;
357                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
358                 rcu_read_lock_bh();
359                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
360         }
361         return r;
362 }
363
364 static struct rtable *rt_cache_get_next(struct seq_file *seq,
365                                         struct rtable *r)
366 {
367         struct rt_cache_iter_state *st = seq->private;
368         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
369                 if (dev_net(r->dst.dev) != seq_file_net(seq))
370                         continue;
371                 if (r->rt_genid == st->genid)
372                         break;
373         }
374         return r;
375 }
376
377 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
378 {
379         struct rtable *r = rt_cache_get_first(seq);
380
381         if (r)
382                 while (pos && (r = rt_cache_get_next(seq, r)))
383                         --pos;
384         return pos ? NULL : r;
385 }
386
387 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
388 {
389         struct rt_cache_iter_state *st = seq->private;
390         if (*pos)
391                 return rt_cache_get_idx(seq, *pos - 1);
392         st->genid = rt_genid(seq_file_net(seq));
393         return SEQ_START_TOKEN;
394 }
395
396 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
397 {
398         struct rtable *r;
399
400         if (v == SEQ_START_TOKEN)
401                 r = rt_cache_get_first(seq);
402         else
403                 r = rt_cache_get_next(seq, v);
404         ++*pos;
405         return r;
406 }
407
408 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
409 {
410         if (v && v != SEQ_START_TOKEN)
411                 rcu_read_unlock_bh();
412 }
413
414 static int rt_cache_seq_show(struct seq_file *seq, void *v)
415 {
416         if (v == SEQ_START_TOKEN)
417                 seq_printf(seq, "%-127s\n",
418                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
419                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
420                            "HHUptod\tSpecDst");
421         else {
422                 struct rtable *r = v;
423                 struct neighbour *n;
424                 int len, HHUptod;
425
426                 rcu_read_lock();
427                 n = dst_get_neighbour(&r->dst);
428                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
429                 rcu_read_unlock();
430
431                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
432                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
433                         r->dst.dev ? r->dst.dev->name : "*",
434                         (__force u32)r->rt_dst,
435                         (__force u32)r->rt_gateway,
436                         r->rt_flags, atomic_read(&r->dst.__refcnt),
437                         r->dst.__use, 0, (__force u32)r->rt_src,
438                         dst_metric_advmss(&r->dst) + 40,
439                         dst_metric(&r->dst, RTAX_WINDOW),
440                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
441                               dst_metric(&r->dst, RTAX_RTTVAR)),
442                         r->rt_key_tos,
443                         -1,
444                         HHUptod,
445                         r->rt_spec_dst, &len);
446
447                 seq_printf(seq, "%*s\n", 127 - len, "");
448         }
449         return 0;
450 }
451
452 static const struct seq_operations rt_cache_seq_ops = {
453         .start  = rt_cache_seq_start,
454         .next   = rt_cache_seq_next,
455         .stop   = rt_cache_seq_stop,
456         .show   = rt_cache_seq_show,
457 };
458
459 static int rt_cache_seq_open(struct inode *inode, struct file *file)
460 {
461         return seq_open_net(inode, file, &rt_cache_seq_ops,
462                         sizeof(struct rt_cache_iter_state));
463 }
464
465 static const struct file_operations rt_cache_seq_fops = {
466         .owner   = THIS_MODULE,
467         .open    = rt_cache_seq_open,
468         .read    = seq_read,
469         .llseek  = seq_lseek,
470         .release = seq_release_net,
471 };
472
473
474 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
475 {
476         int cpu;
477
478         if (*pos == 0)
479                 return SEQ_START_TOKEN;
480
481         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
482                 if (!cpu_possible(cpu))
483                         continue;
484                 *pos = cpu+1;
485                 return &per_cpu(rt_cache_stat, cpu);
486         }
487         return NULL;
488 }
489
490 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
491 {
492         int cpu;
493
494         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
495                 if (!cpu_possible(cpu))
496                         continue;
497                 *pos = cpu+1;
498                 return &per_cpu(rt_cache_stat, cpu);
499         }
500         return NULL;
501
502 }
503
504 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
505 {
506
507 }
508
509 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
510 {
511         struct rt_cache_stat *st = v;
512
513         if (v == SEQ_START_TOKEN) {
514                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
515                 return 0;
516         }
517
518         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
519                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
520                    dst_entries_get_slow(&ipv4_dst_ops),
521                    st->in_hit,
522                    st->in_slow_tot,
523                    st->in_slow_mc,
524                    st->in_no_route,
525                    st->in_brd,
526                    st->in_martian_dst,
527                    st->in_martian_src,
528
529                    st->out_hit,
530                    st->out_slow_tot,
531                    st->out_slow_mc,
532
533                    st->gc_total,
534                    st->gc_ignored,
535                    st->gc_goal_miss,
536                    st->gc_dst_overflow,
537                    st->in_hlist_search,
538                    st->out_hlist_search
539                 );
540         return 0;
541 }
542
543 static const struct seq_operations rt_cpu_seq_ops = {
544         .start  = rt_cpu_seq_start,
545         .next   = rt_cpu_seq_next,
546         .stop   = rt_cpu_seq_stop,
547         .show   = rt_cpu_seq_show,
548 };
549
550
551 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
552 {
553         return seq_open(file, &rt_cpu_seq_ops);
554 }
555
556 static const struct file_operations rt_cpu_seq_fops = {
557         .owner   = THIS_MODULE,
558         .open    = rt_cpu_seq_open,
559         .read    = seq_read,
560         .llseek  = seq_lseek,
561         .release = seq_release,
562 };
563
564 #ifdef CONFIG_IP_ROUTE_CLASSID
565 static int rt_acct_proc_show(struct seq_file *m, void *v)
566 {
567         struct ip_rt_acct *dst, *src;
568         unsigned int i, j;
569
570         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
571         if (!dst)
572                 return -ENOMEM;
573
574         for_each_possible_cpu(i) {
575                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
576                 for (j = 0; j < 256; j++) {
577                         dst[j].o_bytes   += src[j].o_bytes;
578                         dst[j].o_packets += src[j].o_packets;
579                         dst[j].i_bytes   += src[j].i_bytes;
580                         dst[j].i_packets += src[j].i_packets;
581                 }
582         }
583
584         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
585         kfree(dst);
586         return 0;
587 }
588
589 static int rt_acct_proc_open(struct inode *inode, struct file *file)
590 {
591         return single_open(file, rt_acct_proc_show, NULL);
592 }
593
594 static const struct file_operations rt_acct_proc_fops = {
595         .owner          = THIS_MODULE,
596         .open           = rt_acct_proc_open,
597         .read           = seq_read,
598         .llseek         = seq_lseek,
599         .release        = single_release,
600 };
601 #endif
602
603 static int __net_init ip_rt_do_proc_init(struct net *net)
604 {
605         struct proc_dir_entry *pde;
606
607         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
608                         &rt_cache_seq_fops);
609         if (!pde)
610                 goto err1;
611
612         pde = proc_create("rt_cache", S_IRUGO,
613                           net->proc_net_stat, &rt_cpu_seq_fops);
614         if (!pde)
615                 goto err2;
616
617 #ifdef CONFIG_IP_ROUTE_CLASSID
618         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
619         if (!pde)
620                 goto err3;
621 #endif
622         return 0;
623
624 #ifdef CONFIG_IP_ROUTE_CLASSID
625 err3:
626         remove_proc_entry("rt_cache", net->proc_net_stat);
627 #endif
628 err2:
629         remove_proc_entry("rt_cache", net->proc_net);
630 err1:
631         return -ENOMEM;
632 }
633
634 static void __net_exit ip_rt_do_proc_exit(struct net *net)
635 {
636         remove_proc_entry("rt_cache", net->proc_net_stat);
637         remove_proc_entry("rt_cache", net->proc_net);
638 #ifdef CONFIG_IP_ROUTE_CLASSID
639         remove_proc_entry("rt_acct", net->proc_net);
640 #endif
641 }
642
643 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
644         .init = ip_rt_do_proc_init,
645         .exit = ip_rt_do_proc_exit,
646 };
647
648 static int __init ip_rt_proc_init(void)
649 {
650         return register_pernet_subsys(&ip_rt_proc_ops);
651 }
652
653 #else
654 static inline int ip_rt_proc_init(void)
655 {
656         return 0;
657 }
658 #endif /* CONFIG_PROC_FS */
659
660 static inline void rt_free(struct rtable *rt)
661 {
662         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
663 }
664
665 static inline void rt_drop(struct rtable *rt)
666 {
667         ip_rt_put(rt);
668         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
669 }
670
671 static inline int rt_fast_clean(struct rtable *rth)
672 {
673         /* Kill broadcast/multicast entries very aggressively, if they
674            collide in the hash table with more useful entries */
675         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
676                 rt_is_input_route(rth) && rth->dst.rt_next;
677 }
678
679 static inline int rt_valuable(struct rtable *rth)
680 {
681         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
682                 (rth->peer && rth->peer->pmtu_expires);
683 }
684
685 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
686 {
687         unsigned long age;
688         int ret = 0;
689
690         if (atomic_read(&rth->dst.__refcnt))
691                 goto out;
692
693         age = jiffies - rth->dst.lastuse;
694         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
695             (age <= tmo2 && rt_valuable(rth)))
696                 goto out;
697         ret = 1;
698 out:    return ret;
699 }
700
701 /* Bits of score are:
702  * 31: very valuable
703  * 30: not quite useless
704  * 29..0: usage counter
705  */
706 static inline u32 rt_score(struct rtable *rt)
707 {
708         u32 score = jiffies - rt->dst.lastuse;
709
710         score = ~score & ~(3<<30);
711
712         if (rt_valuable(rt))
713                 score |= (1<<31);
714
715         if (rt_is_output_route(rt) ||
716             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
717                 score |= (1<<30);
718
719         return score;
720 }
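/*
 * Note: the usage counter is the bitwise complement of the age, so older
 * unreferenced entries end up with lower scores; rt_intern_hash() evicts
 * the lowest-scoring entry when a chain grows beyond ip_rt_gc_elasticity.
 */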
721
722 static inline bool rt_caching(const struct net *net)
723 {
724         return net->ipv4.current_rt_cache_rebuild_count <=
725                 net->ipv4.sysctl_rt_cache_rebuild_count;
726 }
727
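/*
 * The two comparison helpers below use a branchless trick: XOR each pair
 * of fields and OR the results together; the whole expression is zero if
 * and only if every field matches.
 */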
728 static inline bool compare_hash_inputs(const struct rtable *rt1,
729                                        const struct rtable *rt2)
730 {
731         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
732                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
733                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
734 }
735
736 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
737 {
738         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
739                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
740                 (rt1->rt_mark ^ rt2->rt_mark) |
741                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
742                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
743                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
744 }
745
746 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
747 {
748         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
749 }
750
751 static inline int rt_is_expired(struct rtable *rth)
752 {
753         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
754 }
755
756 /*
757  * Perform a full scan of the hash table and free all entries.
758  * Can be called by a softirq or a process.
759  * In the latter case, we want to reschedule if necessary.
760  */
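/*
 * Under the bucket lock, matching entries are only unlinked onto a private
 * list; rt_free() (and thus call_rcu_bh) is invoked on them only after the
 * lock has been dropped.
 */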
761 static void rt_do_flush(struct net *net, int process_context)
762 {
763         unsigned int i;
764         struct rtable *rth, *next;
765
766         for (i = 0; i <= rt_hash_mask; i++) {
767                 struct rtable __rcu **pprev;
768                 struct rtable *list;
769
770                 if (process_context && need_resched())
771                         cond_resched();
772                 rth = rcu_access_pointer(rt_hash_table[i].chain);
773                 if (!rth)
774                         continue;
775
776                 spin_lock_bh(rt_hash_lock_addr(i));
777
778                 list = NULL;
779                 pprev = &rt_hash_table[i].chain;
780                 rth = rcu_dereference_protected(*pprev,
781                         lockdep_is_held(rt_hash_lock_addr(i)));
782
783                 while (rth) {
784                         next = rcu_dereference_protected(rth->dst.rt_next,
785                                 lockdep_is_held(rt_hash_lock_addr(i)));
786
787                         if (!net ||
788                             net_eq(dev_net(rth->dst.dev), net)) {
789                                 rcu_assign_pointer(*pprev, next);
790                                 rcu_assign_pointer(rth->dst.rt_next, list);
791                                 list = rth;
792                         } else {
793                                 pprev = &rth->dst.rt_next;
794                         }
795                         rth = next;
796                 }
797
798                 spin_unlock_bh(rt_hash_lock_addr(i));
799
800                 for (; list; list = next) {
801                         next = rcu_dereference_protected(list->dst.rt_next, 1);
802                         rt_free(list);
803                 }
804         }
805 }
806
807 /*
808  * While freeing expired entries, we compute average chain length
809  * and standard deviation, using fixed-point arithmetic.
810  * This gives an estimate for rt_chain_length_max:
811  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
812  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
813  */
814
815 #define FRACT_BITS 3
816 #define ONE (1UL << FRACT_BITS)
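/*
 * With FRACT_BITS == 3, ONE is 8: has_noalias() below returns 8 for each
 * entry it counts, sums are accumulated in this fixed-point form, and the
 * final average is shifted right by FRACT_BITS again.
 */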
817
818 /*
819  * Given a hash chain and an item in this hash chain,
820  * find if a previous entry has the same hash_inputs
821  * (but differs on tos, mark or oif)
822  * Returns 0 if an alias is found.
823  * Returns ONE if rth has no alias before itself.
824  */
825 static int has_noalias(const struct rtable *head, const struct rtable *rth)
826 {
827         const struct rtable *aux = head;
828
829         while (aux != rth) {
830                 if (compare_hash_inputs(aux, rth))
831                         return 0;
832                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
833         }
834         return ONE;
835 }
836
837 static void rt_check_expire(void)
838 {
839         static unsigned int rover;
840         unsigned int i = rover, goal;
841         struct rtable *rth;
842         struct rtable __rcu **rthp;
843         unsigned long samples = 0;
844         unsigned long sum = 0, sum2 = 0;
845         unsigned long delta;
846         u64 mult;
847
848         delta = jiffies - expires_ljiffies;
849         expires_ljiffies = jiffies;
850         mult = ((u64)delta) << rt_hash_log;
851         if (ip_rt_gc_timeout > 1)
852                 do_div(mult, ip_rt_gc_timeout);
853         goal = (unsigned int)mult;
854         if (goal > rt_hash_mask)
855                 goal = rt_hash_mask + 1;
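        /*
         * goal ~= elapsed_time * table_size / ip_rt_gc_timeout buckets are
         * scanned per invocation, so the whole table gets walked roughly
         * once every ip_rt_gc_timeout jiffies regardless of how often the
         * worker actually ran.
         */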
856         for (; goal > 0; goal--) {
857                 unsigned long tmo = ip_rt_gc_timeout;
858                 unsigned long length;
859
860                 i = (i + 1) & rt_hash_mask;
861                 rthp = &rt_hash_table[i].chain;
862
863                 if (need_resched())
864                         cond_resched();
865
866                 samples++;
867
868                 if (rcu_dereference_raw(*rthp) == NULL)
869                         continue;
870                 length = 0;
871                 spin_lock_bh(rt_hash_lock_addr(i));
872                 while ((rth = rcu_dereference_protected(*rthp,
873                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
874                         prefetch(rth->dst.rt_next);
875                         if (rt_is_expired(rth)) {
876                                 *rthp = rth->dst.rt_next;
877                                 rt_free(rth);
878                                 continue;
879                         }
880                         if (rth->dst.expires) {
881                                 /* Entry is expired even if it is in use */
882                                 if (time_before_eq(jiffies, rth->dst.expires)) {
883 nofree:
884                                         tmo >>= 1;
885                                         rthp = &rth->dst.rt_next;
886                                         /*
887                                          * We only count entries on
888                                          * a chain with equal hash inputs once
889                                          * so that entries for different QOS
890                                          * levels, and other non-hash input
891                                          * attributes don't unfairly skew
892                                          * the length computation
893                                          */
894                                         length += has_noalias(rt_hash_table[i].chain, rth);
895                                         continue;
896                                 }
897                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
898                                 goto nofree;
899
900                         /* Cleanup aged off entries. */
901                         *rthp = rth->dst.rt_next;
902                         rt_free(rth);
903                 }
904                 spin_unlock_bh(rt_hash_lock_addr(i));
905                 sum += length;
906                 sum2 += length*length;
907         }
908         if (samples) {
909                 unsigned long avg = sum / samples;
910                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
911                 rt_chain_length_max = max_t(unsigned long,
912                                         ip_rt_gc_elasticity,
913                                         (avg + 4*sd) >> FRACT_BITS);
914         }
915         rover = i;
916 }
917
918 /*
919  * rt_worker_func() is run in process context.
920  * we call rt_check_expire() to scan part of the hash table
921  */
922 static void rt_worker_func(struct work_struct *work)
923 {
924         rt_check_expire();
925         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
926 }
927
928 /*
929  * Perturbation of rt_genid by a small quantity [1..256]
930  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
931  * many times (2^24) without reusing a recent rt_genid.
932  * Jenkins hash is strong enough that little changes of rt_genid are OK.
933  */
934 static void rt_cache_invalidate(struct net *net)
935 {
936         unsigned char shuffle;
937
938         get_random_bytes(&shuffle, sizeof(shuffle));
939         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
940         redirect_genid++;
941 }
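/*
 * redirect_genid is bumped together with rt_genid so that redirect
 * information cached in inet_peer entries (peer->redirect_genid) is
 * revalidated against the new generation as well.
 */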
942
943 /*
944  * delay < 0  : invalidate cache (fast : entries will be deleted later)
945  * delay >= 0 : invalidate & flush cache (can be long)
946  */
947 void rt_cache_flush(struct net *net, int delay)
948 {
949         rt_cache_invalidate(net);
950         if (delay >= 0)
951                 rt_do_flush(net, !in_softirq());
952 }
953
954 /* Flush previous cache invalidated entries from the cache */
955 void rt_cache_flush_batch(struct net *net)
956 {
957         rt_do_flush(net, !in_softirq());
958 }
959
960 static void rt_emergency_hash_rebuild(struct net *net)
961 {
962         if (net_ratelimit())
963                 printk(KERN_WARNING "Route hash chain too long!\n");
964         rt_cache_invalidate(net);
965 }
966
967 /*
968    Short description of GC goals.
969
970    We want an algorithm that keeps the routing cache at an
971    equilibrium point, where the number of aged-off entries stays
972    approximately equal to the number of newly generated ones.
973
974    The current expiration strength is the variable "expire".
975    We try to adjust it dynamically, so that when the network is
976    idle "expire" is large enough to keep plenty of warm entries,
977    and when load increases it shrinks to limit the cache size.
978  */
979
980 static int rt_garbage_collect(struct dst_ops *ops)
981 {
982         static unsigned long expire = RT_GC_TIMEOUT;
983         static unsigned long last_gc;
984         static int rover;
985         static int equilibrium;
986         struct rtable *rth;
987         struct rtable __rcu **rthp;
988         unsigned long now = jiffies;
989         int goal;
990         int entries = dst_entries_get_fast(&ipv4_dst_ops);
991
992         /*
993          * Garbage collection is pretty expensive,
994          * do not make it too frequently.
995          */
996
997         RT_CACHE_STAT_INC(gc_total);
998
999         if (now - last_gc < ip_rt_gc_min_interval &&
1000             entries < ip_rt_max_size) {
1001                 RT_CACHE_STAT_INC(gc_ignored);
1002                 goto out;
1003         }
1004
1005         entries = dst_entries_get_slow(&ipv4_dst_ops);
1006         /* Calculate number of entries, which we want to expire now. */
1007         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1008         if (goal <= 0) {
1009                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1010                         equilibrium = ipv4_dst_ops.gc_thresh;
1011                 goal = entries - equilibrium;
1012                 if (goal > 0) {
1013                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014                         goal = entries - equilibrium;
1015                 }
1016         } else {
1017                 /* We are in a dangerous area. Try to reduce the cache really
1018                  * aggressively.
1019                  */
1020                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021                 equilibrium = entries - goal;
1022         }
1023
1024         if (now - last_gc >= ip_rt_gc_min_interval)
1025                 last_gc = now;
1026
1027         if (goal <= 0) {
1028                 equilibrium += goal;
1029                 goto work_done;
1030         }
1031
1032         do {
1033                 int i, k;
1034
1035                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036                         unsigned long tmo = expire;
1037
1038                         k = (k + 1) & rt_hash_mask;
1039                         rthp = &rt_hash_table[k].chain;
1040                         spin_lock_bh(rt_hash_lock_addr(k));
1041                         while ((rth = rcu_dereference_protected(*rthp,
1042                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1043                                 if (!rt_is_expired(rth) &&
1044                                         !rt_may_expire(rth, tmo, expire)) {
1045                                         tmo >>= 1;
1046                                         rthp = &rth->dst.rt_next;
1047                                         continue;
1048                                 }
1049                                 *rthp = rth->dst.rt_next;
1050                                 rt_free(rth);
1051                                 goal--;
1052                         }
1053                         spin_unlock_bh(rt_hash_lock_addr(k));
1054                         if (goal <= 0)
1055                                 break;
1056                 }
1057                 rover = k;
1058
1059                 if (goal <= 0)
1060                         goto work_done;
1061
1062                 /* The goal was not achieved. We stop the process if:
1063
1064                    - expire was reduced to zero (otherwise expire is halved);
1065                    - the table is not full;
1066                    - we are called from interrupt context;
1067                    - the jiffies check is just a fallback/debug loop breaker.
1068                      We will not spin here for a long time in any case.
1069                  */
1070
1071                 RT_CACHE_STAT_INC(gc_goal_miss);
1072
1073                 if (expire == 0)
1074                         break;
1075
1076                 expire >>= 1;
1077
1078                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1079                         goto out;
1080         } while (!in_softirq() && time_before_eq(jiffies, now));
1081
1082         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083                 goto out;
1084         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085                 goto out;
1086         if (net_ratelimit())
1087                 printk(KERN_WARNING "dst cache overflow\n");
1088         RT_CACHE_STAT_INC(gc_dst_overflow);
1089         return 1;
1090
1091 work_done:
1092         expire += ip_rt_gc_min_interval;
1093         if (expire > ip_rt_gc_timeout ||
1094             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1095             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1096                 expire = ip_rt_gc_timeout;
1097 out:    return 0;
1098 }
1099
1100 /*
1101  * Returns number of entries in a hash chain that have different hash_inputs
1102  */
1103 static int slow_chain_length(const struct rtable *head)
1104 {
1105         int length = 0;
1106         const struct rtable *rth = head;
1107
1108         while (rth) {
1109                 length += has_noalias(head, rth);
1110                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1111         }
1112         return length >> FRACT_BITS;
1113 }
1114
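/*
 * Resolve (or create) the neighbour entry for a next hop.  On loopback and
 * point-to-point devices the peer's address is irrelevant, so a single
 * entry keyed on 0.0.0.0 is shared; ATM interfaces use the CLIP neighbour
 * table when that code is built in.
 */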
1115 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1116 {
1117         struct neigh_table *tbl = &arp_tbl;
1118         static const __be32 inaddr_any = 0;
1119         struct net_device *dev = dst->dev;
1120         const __be32 *pkey = daddr;
1121         struct neighbour *n;
1122
1123 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1124         if (dev->type == ARPHRD_ATM)
1125                 tbl = clip_tbl_hook;
1126 #endif
1127         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1128                 pkey = &inaddr_any;
1129
1130         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1131         if (n)
1132                 return n;
1133         return neigh_create(tbl, pkey, dev);
1134 }
1135
1136 static int rt_bind_neighbour(struct rtable *rt)
1137 {
1138         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1139         if (IS_ERR(n))
1140                 return PTR_ERR(n);
1141         dst_set_neighbour(&rt->dst, n);
1142
1143         return 0;
1144 }
1145
1146 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1147                                      struct sk_buff *skb, int ifindex)
1148 {
1149         struct rtable   *rth, *cand;
1150         struct rtable __rcu **rthp, **candp;
1151         unsigned long   now;
1152         u32             min_score;
1153         int             chain_length;
1154         int attempts = !in_softirq();
1155
1156 restart:
1157         chain_length = 0;
1158         min_score = ~(u32)0;
1159         cand = NULL;
1160         candp = NULL;
1161         now = jiffies;
1162
1163         if (!rt_caching(dev_net(rt->dst.dev))) {
1164                 /*
1165                  * If we're not caching, just tell the caller we
1166                  * were successful and don't touch the route.  The
1167                  * caller holds the sole reference to the cache entry, and
1168                  * it will be released when the caller is done with it.
1169                  * If we drop it here, the callers have no way to resolve routes
1170                  * when we're not caching.  Instead, just point *rp at rt, so
1171                  * the caller gets a single use out of the route
1172                  * Note that we do rt_free on this new route entry, so that
1173                  * once its refcount hits zero, we are still able to reap it
1174                  * (Thanks Alexey)
1175                  * Note: To avoid expensive rcu stuff for this uncached dst,
1176                  * we set DST_NOCACHE so that dst_release() can free dst without
1177                  * waiting a grace period.
1178                  */
1179
1180                 rt->dst.flags |= DST_NOCACHE;
1181                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1182                         int err = rt_bind_neighbour(rt);
1183                         if (err) {
1184                                 if (net_ratelimit())
1185                                         printk(KERN_WARNING
1186                                             "Neighbour table failure & not caching routes.\n");
1187                                 ip_rt_put(rt);
1188                                 return ERR_PTR(err);
1189                         }
1190                 }
1191
1192                 goto skip_hashing;
1193         }
1194
1195         rthp = &rt_hash_table[hash].chain;
1196
1197         spin_lock_bh(rt_hash_lock_addr(hash));
1198         while ((rth = rcu_dereference_protected(*rthp,
1199                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1200                 if (rt_is_expired(rth)) {
1201                         *rthp = rth->dst.rt_next;
1202                         rt_free(rth);
1203                         continue;
1204                 }
1205                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1206                         /* Put it first */
1207                         *rthp = rth->dst.rt_next;
1208                         /*
1209                          * Since lookup is lockfree, the deletion
1210                          * must be visible to another weakly ordered CPU before
1211                          * the insertion at the start of the hash chain.
1212                          */
1213                         rcu_assign_pointer(rth->dst.rt_next,
1214                                            rt_hash_table[hash].chain);
1215                         /*
1216                          * Since lookup is lockfree, the update writes
1217                          * must be ordered for consistency on SMP.
1218                          */
1219                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1220
1221                         dst_use(&rth->dst, now);
1222                         spin_unlock_bh(rt_hash_lock_addr(hash));
1223
1224                         rt_drop(rt);
1225                         if (skb)
1226                                 skb_dst_set(skb, &rth->dst);
1227                         return rth;
1228                 }
1229
1230                 if (!atomic_read(&rth->dst.__refcnt)) {
1231                         u32 score = rt_score(rth);
1232
1233                         if (score <= min_score) {
1234                                 cand = rth;
1235                                 candp = rthp;
1236                                 min_score = score;
1237                         }
1238                 }
1239
1240                 chain_length++;
1241
1242                 rthp = &rth->dst.rt_next;
1243         }
1244
1245         if (cand) {
1246                 /* ip_rt_gc_elasticity used to be the average chain length;
1247                  * when exceeded, gc becomes really aggressive.
1248                  *
1249                  * The second limit is less certain. At the moment it allows
1250                  * only 2 entries per bucket. We will see.
1251                  */
1252                 if (chain_length > ip_rt_gc_elasticity) {
1253                         *candp = cand->dst.rt_next;
1254                         rt_free(cand);
1255                 }
1256         } else {
1257                 if (chain_length > rt_chain_length_max &&
1258                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1259                         struct net *net = dev_net(rt->dst.dev);
1260                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1261                         if (!rt_caching(net)) {
1262                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1263                                         rt->dst.dev->name, num);
1264                         }
1265                         rt_emergency_hash_rebuild(net);
1266                         spin_unlock_bh(rt_hash_lock_addr(hash));
1267
1268                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1269                                         ifindex, rt_genid(net));
1270                         goto restart;
1271                 }
1272         }
1273
1274         /* Try to bind the route to an arp neighbour only if it is an
1275            output route or on the unicast forwarding path.
1276          */
1277         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1278                 int err = rt_bind_neighbour(rt);
1279                 if (err) {
1280                         spin_unlock_bh(rt_hash_lock_addr(hash));
1281
1282                         if (err != -ENOBUFS) {
1283                                 rt_drop(rt);
1284                                 return ERR_PTR(err);
1285                         }
1286
1287                         /* The neighbour table is full and nothing
1288                            can be released. Try to shrink the route cache;
1289                            it most likely holds some neighbour records.
1290                          */
1291                         if (attempts-- > 0) {
1292                                 int saved_elasticity = ip_rt_gc_elasticity;
1293                                 int saved_int = ip_rt_gc_min_interval;
1294                                 ip_rt_gc_elasticity     = 1;
1295                                 ip_rt_gc_min_interval   = 0;
1296                                 rt_garbage_collect(&ipv4_dst_ops);
1297                                 ip_rt_gc_min_interval   = saved_int;
1298                                 ip_rt_gc_elasticity     = saved_elasticity;
1299                                 goto restart;
1300                         }
1301
1302                         if (net_ratelimit())
1303                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1304                         rt_drop(rt);
1305                         return ERR_PTR(-ENOBUFS);
1306                 }
1307         }
1308
1309         rt->dst.rt_next = rt_hash_table[hash].chain;
1310
1311         /*
1312          * Since lookup is lockfree, we must make sure
1313          * previous writes to rt are committed to memory
1314          * before making rt visible to other CPUs.
1315          */
1316         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1317
1318         spin_unlock_bh(rt_hash_lock_addr(hash));
1319
1320 skip_hashing:
1321         if (skb)
1322                 skb_dst_set(skb, &rt->dst);
1323         return rt;
1324 }
1325
1326 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1327
1328 static u32 rt_peer_genid(void)
1329 {
1330         return atomic_read(&__rt_peer_genid);
1331 }
1332
1333 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1334 {
1335         struct inet_peer *peer;
1336
1337         peer = inet_getpeer_v4(daddr, create);
1338
1339         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1340                 inet_putpeer(peer);
1341         else
1342                 rt->rt_peer_genid = rt_peer_genid();
1343 }
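/*
 * The cmpxchg() above means only one CPU can install a peer on the route;
 * the loser of the race drops its own reference with inet_putpeer() and
 * the already-installed peer is kept.
 */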
1344
1345 /*
1346  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1347  * we can still generate some output.
1348  * Random ID selection looks a bit dangerous because we have no chance of
1349  * selecting an ID that is unique over a reasonable period of time.
1350  * But a broken packet identifier may be better than no packet at all.
1351  */
1352 static void ip_select_fb_ident(struct iphdr *iph)
1353 {
1354         static DEFINE_SPINLOCK(ip_fb_id_lock);
1355         static u32 ip_fallback_id;
1356         u32 salt;
1357
1358         spin_lock_bh(&ip_fb_id_lock);
1359         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1360         iph->id = htons(salt & 0xFFFF);
1361         ip_fallback_id = salt;
1362         spin_unlock_bh(&ip_fb_id_lock);
1363 }
1364
1365 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1366 {
1367         struct rtable *rt = (struct rtable *) dst;
1368
1369         if (rt) {
1370                 if (rt->peer == NULL)
1371                         rt_bind_peer(rt, rt->rt_dst, 1);
1372
1373                 /* If a peer is attached to the destination, it is never detached,
1374                    so we need not grab a lock to dereference it.
1375                  */
1376                 if (rt->peer) {
1377                         iph->id = htons(inet_getid(rt->peer, more));
1378                         return;
1379                 }
1380         } else
1381                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1382                        __builtin_return_address(0));
1383
1384         ip_select_fb_ident(iph);
1385 }
1386 EXPORT_SYMBOL(__ip_select_ident);
1387
1388 static void rt_del(unsigned hash, struct rtable *rt)
1389 {
1390         struct rtable __rcu **rthp;
1391         struct rtable *aux;
1392
1393         rthp = &rt_hash_table[hash].chain;
1394         spin_lock_bh(rt_hash_lock_addr(hash));
1395         ip_rt_put(rt);
1396         while ((aux = rcu_dereference_protected(*rthp,
1397                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1398                 if (aux == rt || rt_is_expired(aux)) {
1399                         *rthp = aux->dst.rt_next;
1400                         rt_free(aux);
1401                         continue;
1402                 }
1403                 rthp = &aux->dst.rt_next;
1404         }
1405         spin_unlock_bh(rt_hash_lock_addr(hash));
1406 }
1407
1408 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1409 {
1410         struct rtable *rt = (struct rtable *) dst;
1411         __be32 orig_gw = rt->rt_gateway;
1412         struct neighbour *n, *old_n;
1413
1414         dst_confirm(&rt->dst);
1415
1416         rt->rt_gateway = peer->redirect_learned.a4;
1417
1418         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1419         if (IS_ERR(n)) {
1420                 rt->rt_gateway = orig_gw;
1421                 return;
1422         }
1423         old_n = xchg(&rt->dst._neighbour, n);
1424         if (old_n)
1425                 neigh_release(old_n);
1426         if (!(n->nud_state & NUD_VALID)) {
1427                 neigh_event_send(n, NULL);
1428         } else {
1429                 rt->rt_flags |= RTCF_REDIRECTED;
1430                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1431         }
1432 }
1433
1434 /* called in rcu_read_lock() section */
1435 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1436                     __be32 saddr, struct net_device *dev)
1437 {
1438         int s, i;
1439         struct in_device *in_dev = __in_dev_get_rcu(dev);
1440         __be32 skeys[2] = { saddr, 0 };
1441         int    ikeys[2] = { dev->ifindex, 0 };
1442         struct inet_peer *peer;
1443         struct net *net;
1444
1445         if (!in_dev)
1446                 return;
1447
1448         net = dev_net(dev);
1449         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1450             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1451             ipv4_is_zeronet(new_gw))
1452                 goto reject_redirect;
1453
1454         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1455                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1456                         goto reject_redirect;
1457                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1458                         goto reject_redirect;
1459         } else {
1460                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1461                         goto reject_redirect;
1462         }
1463
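        /*
         * Cached entries may have been keyed either on the exact source
         * address and output interface or on the 0 wildcards, so walk the
         * hash chains for all four (saddr, oif) combinations.
         */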
1464         for (s = 0; s < 2; s++) {
1465                 for (i = 0; i < 2; i++) {
1466                         unsigned int hash;
1467                         struct rtable __rcu **rthp;
1468                         struct rtable *rt;
1469
1470                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1471
1472                         rthp = &rt_hash_table[hash].chain;
1473
1474                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1475                                 rthp = &rt->dst.rt_next;
1476
1477                                 if (rt->rt_key_dst != daddr ||
1478                                     rt->rt_key_src != skeys[s] ||
1479                                     rt->rt_oif != ikeys[i] ||
1480                                     rt_is_input_route(rt) ||
1481                                     rt_is_expired(rt) ||
1482                                     !net_eq(dev_net(rt->dst.dev), net) ||
1483                                     rt->dst.error ||
1484                                     rt->dst.dev != dev ||
1485                                     rt->rt_gateway != old_gw)
1486                                         continue;
1487
1488                                 if (!rt->peer)
1489                                         rt_bind_peer(rt, rt->rt_dst, 1);
1490
1491                                 peer = rt->peer;
1492                                 if (peer) {
1493                                         if (peer->redirect_learned.a4 != new_gw ||
1494                                             peer->redirect_genid != redirect_genid) {
1495                                                 peer->redirect_learned.a4 = new_gw;
1496                                                 peer->redirect_genid = redirect_genid;
1497                                                 atomic_inc(&__rt_peer_genid);
1498                                         }
1499                                         check_peer_redir(&rt->dst, peer);
1500                                 }
1501                         }
1502                 }
1503         }
1504         return;
1505
1506 reject_redirect:
1507 #ifdef CONFIG_IP_ROUTE_VERBOSE
1508         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1509                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1510                         "  Advised path = %pI4 -> %pI4\n",
1511                        &old_gw, dev->name, &new_gw,
1512                        &saddr, &daddr);
1513 #endif
1514         ;
1515 }
1516
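/* Both helpers below use cmpxchg() so that, when several contexts race on the
 * same peer, exactly one of them observes the non-zero pmtu_expires value and
 * wins the right to restore the original MTU; the losers see 0 and do nothing.
 */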
1517 static bool peer_pmtu_expired(struct inet_peer *peer)
1518 {
1519         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1520
1521         return orig &&
1522                time_after_eq(jiffies, orig) &&
1523                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1524 }
1525
1526 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1527 {
1528         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1529
1530         return orig &&
1531                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1532 }
1533
1534 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1535 {
1536         struct rtable *rt = (struct rtable *)dst;
1537         struct dst_entry *ret = dst;
1538
1539         if (rt) {
1540                 if (dst->obsolete > 0) {
1541                         ip_rt_put(rt);
1542                         ret = NULL;
1543                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1544                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1545                                                 rt->rt_oif,
1546                                                 rt_genid(dev_net(dst->dev)));
1547                         rt_del(hash, rt);
1548                         ret = NULL;
1549                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1550                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1551                 }
1552         }
1553         return ret;
1554 }
1555
1556 /*
1557  * Algorithm:
1558  *      1. The first ip_rt_redirect_number redirects are sent
1559  *         with exponential backoff, then we stop sending them at all,
1560  *         assuming that the host ignores our redirects.
1561  *      2. If we did not see packets requiring redirects
1562  *         during ip_rt_redirect_silence, we assume that the host
1563  *         forgot the redirected route, and we start sending redirects again.
1564  *
1565  * This algorithm is much cheaper and more intelligent than dumb load limiting
1566  * in icmp.c.
1567  *
1568  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1569  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1570  */
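/*
 * Worked example (a sketch, assuming the default sysctls defined earlier in
 * this file, ip_rt_redirect_number = 9 and ip_rt_redirect_load = HZ/50):
 * with k = peer->rate_tokens redirects already sent, the next one is emitted
 * only when
 *
 *        time_after(jiffies, peer->rate_last + (ip_rt_redirect_load << k))
 *
 * is true, so the allowed interval doubles after each unanswered redirect;
 * once 9 have been ignored we stay silent until ip_rt_redirect_silence has
 * elapsed and rate_tokens is reset to 0.
 */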
1571
1572 void ip_rt_send_redirect(struct sk_buff *skb)
1573 {
1574         struct rtable *rt = skb_rtable(skb);
1575         struct in_device *in_dev;
1576         struct inet_peer *peer;
1577         int log_martians;
1578
1579         rcu_read_lock();
1580         in_dev = __in_dev_get_rcu(rt->dst.dev);
1581         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1582                 rcu_read_unlock();
1583                 return;
1584         }
1585         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1586         rcu_read_unlock();
1587
1588         if (!rt->peer)
1589                 rt_bind_peer(rt, rt->rt_dst, 1);
1590         peer = rt->peer;
1591         if (!peer) {
1592                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1593                 return;
1594         }
1595
1596         /* No redirected packets during ip_rt_redirect_silence;
1597          * reset the algorithm.
1598          */
1599         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1600                 peer->rate_tokens = 0;
1601
1602         /* Too many ignored redirects; do not send anything.
1603          * Set peer->rate_last to the time of the last seen redirected packet.
1604          */
1605         if (peer->rate_tokens >= ip_rt_redirect_number) {
1606                 peer->rate_last = jiffies;
1607                 return;
1608         }
1609
1610         /* Check for load limit; set rate_last to the latest sent
1611          * redirect.
1612          */
1613         if (peer->rate_tokens == 0 ||
1614             time_after(jiffies,
1615                        (peer->rate_last +
1616                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1617                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1618                 peer->rate_last = jiffies;
1619                 ++peer->rate_tokens;
1620 #ifdef CONFIG_IP_ROUTE_VERBOSE
1621                 if (log_martians &&
1622                     peer->rate_tokens == ip_rt_redirect_number &&
1623                     net_ratelimit())
1624                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1625                                &ip_hdr(skb)->saddr, rt->rt_iif,
1626                                 &rt->rt_dst, &rt->rt_gateway);
1627 #endif
1628         }
1629 }
1630
1631 static int ip_error(struct sk_buff *skb)
1632 {
1633         struct rtable *rt = skb_rtable(skb);
1634         struct inet_peer *peer;
1635         unsigned long now;
1636         bool send;
1637         int code;
1638
1639         switch (rt->dst.error) {
1640         case EINVAL:
1641         default:
1642                 goto out;
1643         case EHOSTUNREACH:
1644                 code = ICMP_HOST_UNREACH;
1645                 break;
1646         case ENETUNREACH:
1647                 code = ICMP_NET_UNREACH;
1648                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1649                                 IPSTATS_MIB_INNOROUTES);
1650                 break;
1651         case EACCES:
1652                 code = ICMP_PKT_FILTERED;
1653                 break;
1654         }
1655
1656         if (!rt->peer)
1657                 rt_bind_peer(rt, rt->rt_dst, 1);
1658         peer = rt->peer;
1659
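        /* Classic token bucket: tokens accrue one per jiffy since the last
         * event, capped at ip_rt_error_burst; sending an ICMP error costs
         * ip_rt_error_cost tokens, otherwise the error is suppressed.
         */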
1660         send = true;
1661         if (peer) {
1662                 now = jiffies;
1663                 peer->rate_tokens += now - peer->rate_last;
1664                 if (peer->rate_tokens > ip_rt_error_burst)
1665                         peer->rate_tokens = ip_rt_error_burst;
1666                 peer->rate_last = now;
1667                 if (peer->rate_tokens >= ip_rt_error_cost)
1668                         peer->rate_tokens -= ip_rt_error_cost;
1669                 else
1670                         send = false;
1671         }
1672         if (send)
1673                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1674
1675 out:    kfree_skb(skb);
1676         return 0;
1677 }
1678
1679 /*
1680  *      The last two values are not from the RFC but
1681  *      are needed for AMPRnet AX.25 paths.
1682  */
1683
1684 static const unsigned short mtu_plateau[] =
1685 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1686
1687 static inline unsigned short guess_mtu(unsigned short old_mtu)
1688 {
1689         int i;
1690
1691         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1692                 if (old_mtu > mtu_plateau[i])
1693                         return mtu_plateau[i];
1694         return 68;
1695 }
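/*
 * Illustrative only: given an ICMP "frag needed" that reports a zero MTU for
 * a 1500 byte datagram, guess_mtu(1500) walks the plateau table above and
 * returns 1492, the first plateau strictly below the old MTU; an old MTU at
 * or below 128 falls through to the 68 byte minimum.
 */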
1696
1697 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1698                                  unsigned short new_mtu,
1699                                  struct net_device *dev)
1700 {
1701         unsigned short old_mtu = ntohs(iph->tot_len);
1702         unsigned short est_mtu = 0;
1703         struct inet_peer *peer;
1704
1705         peer = inet_getpeer_v4(iph->daddr, 1);
1706         if (peer) {
1707                 unsigned short mtu = new_mtu;
1708
1709                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1710                         /* BSD 4.2 derived systems incorrectly adjust
1711                          * tot_len by the IP header length, and report
1712                          * a zero MTU in the ICMP message.
1713                          */
1714                         if (mtu == 0 &&
1715                             old_mtu >= 68 + (iph->ihl << 2))
1716                                 old_mtu -= iph->ihl << 2;
1717                         mtu = guess_mtu(old_mtu);
1718                 }
1719
1720                 if (mtu < ip_rt_min_pmtu)
1721                         mtu = ip_rt_min_pmtu;
1722                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1723                         unsigned long pmtu_expires;
1724
1725                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1726                         if (!pmtu_expires)
1727                                 pmtu_expires = 1UL;
1728
1729                         est_mtu = mtu;
1730                         peer->pmtu_learned = mtu;
1731                         peer->pmtu_expires = pmtu_expires;
1732                         atomic_inc(&__rt_peer_genid);
1733                 }
1734
1735                 inet_putpeer(peer);
1736         }
1737         return est_mtu ? : new_mtu;
1738 }
1739
1740 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1741 {
1742         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1743
1744         if (!expires)
1745                 return;
1746         if (time_before(jiffies, expires)) {
1747                 u32 orig_dst_mtu = dst_mtu(dst);
1748                 if (peer->pmtu_learned < orig_dst_mtu) {
1749                         if (!peer->pmtu_orig)
1750                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1751                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1752                 }
1753         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1754                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1755 }
1756
1757 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1758 {
1759         struct rtable *rt = (struct rtable *) dst;
1760         struct inet_peer *peer;
1761
1762         dst_confirm(dst);
1763
1764         if (!rt->peer)
1765                 rt_bind_peer(rt, rt->rt_dst, 1);
1766         peer = rt->peer;
1767         if (peer) {
1768                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1769
1770                 if (mtu < ip_rt_min_pmtu)
1771                         mtu = ip_rt_min_pmtu;
1772                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1773
1774                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1775                         if (!pmtu_expires)
1776                                 pmtu_expires = 1UL;
1777
1778                         peer->pmtu_learned = mtu;
1779                         peer->pmtu_expires = pmtu_expires;
1780
1781                         atomic_inc(&__rt_peer_genid);
1782                         rt->rt_peer_genid = rt_peer_genid();
1783                 }
1784                 check_peer_pmtu(dst, peer);
1785         }
1786 }
1787
1788
1789 static void ipv4_validate_peer(struct rtable *rt)
1790 {
1791         if (rt->rt_peer_genid != rt_peer_genid()) {
1792                 struct inet_peer *peer;
1793
1794                 if (!rt->peer)
1795                         rt_bind_peer(rt, rt->rt_dst, 0);
1796
1797                 peer = rt->peer;
1798                 if (peer) {
1799                         check_peer_pmtu(&rt->dst, peer);
1800
1801                         if (peer->redirect_genid != redirect_genid)
1802                                 peer->redirect_learned.a4 = 0;
1803                         if (peer->redirect_learned.a4 &&
1804                             peer->redirect_learned.a4 != rt->rt_gateway)
1805                                 check_peer_redir(&rt->dst, peer);
1806                 }
1807
1808                 rt->rt_peer_genid = rt_peer_genid();
1809         }
1810 }
1811
1812 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1813 {
1814         struct rtable *rt = (struct rtable *) dst;
1815
1816         if (rt_is_expired(rt))
1817                 return NULL;
1818         ipv4_validate_peer(rt);
1819         return dst;
1820 }
1821
1822 static void ipv4_dst_destroy(struct dst_entry *dst)
1823 {
1824         struct rtable *rt = (struct rtable *) dst;
1825         struct inet_peer *peer = rt->peer;
1826
1827         if (rt->fi) {
1828                 fib_info_put(rt->fi);
1829                 rt->fi = NULL;
1830         }
1831         if (peer) {
1832                 rt->peer = NULL;
1833                 inet_putpeer(peer);
1834         }
1835 }
1836
1837
1838 static void ipv4_link_failure(struct sk_buff *skb)
1839 {
1840         struct rtable *rt;
1841
1842         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1843
1844         rt = skb_rtable(skb);
1845         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1846                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1847 }
1848
1849 static int ip_rt_bug(struct sk_buff *skb)
1850 {
1851         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1852                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1853                 skb->dev ? skb->dev->name : "?");
1854         kfree_skb(skb);
1855         WARN_ON(1);
1856         return 0;
1857 }
1858
1859 /*
1860    We do not cache the source address of the outgoing interface,
1861    because it is used only by the IP RR, TS and SRR options,
1862    so it is out of the fast path.
1863
1864    BTW remember: "addr" is allowed to be unaligned
1865    in IP options!
1866  */
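/*
 * That is why the result is written with memcpy() below instead of a direct
 * 32-bit store; something like
 *
 *        *(__be32 *)addr = src;
 *
 * could fault on strict-alignment architectures when "addr" points into the
 * middle of an IP option.
 */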
1867
1868 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1869 {
1870         __be32 src;
1871
1872         if (rt_is_output_route(rt))
1873                 src = ip_hdr(skb)->saddr;
1874         else {
1875                 struct fib_result res;
1876                 struct flowi4 fl4;
1877                 struct iphdr *iph;
1878
1879                 iph = ip_hdr(skb);
1880
1881                 memset(&fl4, 0, sizeof(fl4));
1882                 fl4.daddr = iph->daddr;
1883                 fl4.saddr = iph->saddr;
1884                 fl4.flowi4_tos = RT_TOS(iph->tos);
1885                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1886                 fl4.flowi4_iif = skb->dev->ifindex;
1887                 fl4.flowi4_mark = skb->mark;
1888
1889                 rcu_read_lock();
1890                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1891                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1892                 else
1893                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1894                                         RT_SCOPE_UNIVERSE);
1895                 rcu_read_unlock();
1896         }
1897         memcpy(addr, &src, 4);
1898 }
1899
1900 #ifdef CONFIG_IP_ROUTE_CLASSID
1901 static void set_class_tag(struct rtable *rt, u32 tag)
1902 {
1903         if (!(rt->dst.tclassid & 0xFFFF))
1904                 rt->dst.tclassid |= tag & 0xFFFF;
1905         if (!(rt->dst.tclassid & 0xFFFF0000))
1906                 rt->dst.tclassid |= tag & 0xFFFF0000;
1907 }
1908 #endif
1909
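/*
 * Default advertised MSS: device MTU minus 40 bytes of IPv4 + TCP headers,
 * clamped to at least ip_rt_min_advmss and at most 65535 - 40.  A 1500 byte
 * Ethernet MTU thus yields an advmss of 1460 unless an explicit RTAX_ADVMSS
 * metric is set.
 */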
1910 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1911 {
1912         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1913
1914         if (advmss == 0) {
1915                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1916                                ip_rt_min_advmss);
1917                 if (advmss > 65535 - 40)
1918                         advmss = 65535 - 40;
1919         }
1920         return advmss;
1921 }
1922
1923 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1924 {
1925         const struct rtable *rt = (const struct rtable *) dst;
1926         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1927
1928         if (mtu && rt_is_output_route(rt))
1929                 return mtu;
1930
1931         mtu = dst->dev->mtu;
1932
1933         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1934
1935                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1936                         mtu = 576;
1937         }
1938
1939         if (mtu > IP_MAX_MTU)
1940                 mtu = IP_MAX_MTU;
1941
1942         return mtu;
1943 }
1944
1945 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1946                             struct fib_info *fi)
1947 {
1948         struct inet_peer *peer;
1949         int create = 0;
1950
1951         /* If a peer entry exists for this destination, we must hook
1952          * it up in order to get at cached metrics.
1953          */
1954         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1955                 create = 1;
1956
1957         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1958         if (peer) {
1959                 rt->rt_peer_genid = rt_peer_genid();
1960                 if (inet_metrics_new(peer))
1961                         memcpy(peer->metrics, fi->fib_metrics,
1962                                sizeof(u32) * RTAX_MAX);
1963                 dst_init_metrics(&rt->dst, peer->metrics, false);
1964
1965                 check_peer_pmtu(&rt->dst, peer);
1966                 if (peer->redirect_genid != redirect_genid)
1967                         peer->redirect_learned.a4 = 0;
1968                 if (peer->redirect_learned.a4 &&
1969                     peer->redirect_learned.a4 != rt->rt_gateway) {
1970                         rt->rt_gateway = peer->redirect_learned.a4;
1971                         rt->rt_flags |= RTCF_REDIRECTED;
1972                 }
1973         } else {
1974                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1975                         rt->fi = fi;
1976                         atomic_inc(&fi->fib_clntref);
1977                 }
1978                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1979         }
1980 }
1981
1982 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1983                            const struct fib_result *res,
1984                            struct fib_info *fi, u16 type, u32 itag)
1985 {
1986         struct dst_entry *dst = &rt->dst;
1987
1988         if (fi) {
1989                 if (FIB_RES_GW(*res) &&
1990                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1991                         rt->rt_gateway = FIB_RES_GW(*res);
1992                 rt_init_metrics(rt, fl4, fi);
1993 #ifdef CONFIG_IP_ROUTE_CLASSID
1994                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1995 #endif
1996         }
1997
1998         if (dst_mtu(dst) > IP_MAX_MTU)
1999                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
2000         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
2001                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
2002
2003 #ifdef CONFIG_IP_ROUTE_CLASSID
2004 #ifdef CONFIG_IP_MULTIPLE_TABLES
2005         set_class_tag(rt, fib_rules_tclass(res));
2006 #endif
2007         set_class_tag(rt, itag);
2008 #endif
2009 }
2010
2011 static struct rtable *rt_dst_alloc(struct net_device *dev,
2012                                    bool nopolicy, bool noxfrm)
2013 {
2014         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2015                          DST_HOST |
2016                          (nopolicy ? DST_NOPOLICY : 0) |
2017                          (noxfrm ? DST_NOXFRM : 0));
2018 }
2019
2020 /* called in rcu_read_lock() section */
2021 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2022                                 u8 tos, struct net_device *dev, int our)
2023 {
2024         unsigned int hash;
2025         struct rtable *rth;
2026         __be32 spec_dst;
2027         struct in_device *in_dev = __in_dev_get_rcu(dev);
2028         u32 itag = 0;
2029         int err;
2030
2031         /* Primary sanity checks. */
2032
2033         if (in_dev == NULL)
2034                 return -EINVAL;
2035
2036         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2037             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2038                 goto e_inval;
2039
2040         if (ipv4_is_zeronet(saddr)) {
2041                 if (!ipv4_is_local_multicast(daddr))
2042                         goto e_inval;
2043                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2044         } else {
2045                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2046                                           &itag);
2047                 if (err < 0)
2048                         goto e_err;
2049         }
2050         rth = rt_dst_alloc(init_net.loopback_dev,
2051                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2052         if (!rth)
2053                 goto e_nobufs;
2054
2055 #ifdef CONFIG_IP_ROUTE_CLASSID
2056         rth->dst.tclassid = itag;
2057 #endif
2058         rth->dst.output = ip_rt_bug;
2059
2060         rth->rt_key_dst = daddr;
2061         rth->rt_key_src = saddr;
2062         rth->rt_genid   = rt_genid(dev_net(dev));
2063         rth->rt_flags   = RTCF_MULTICAST;
2064         rth->rt_type    = RTN_MULTICAST;
2065         rth->rt_key_tos = tos;
2066         rth->rt_dst     = daddr;
2067         rth->rt_src     = saddr;
2068         rth->rt_route_iif = dev->ifindex;
2069         rth->rt_iif     = dev->ifindex;
2070         rth->rt_oif     = 0;
2071         rth->rt_mark    = skb->mark;
2072         rth->rt_gateway = daddr;
2073         rth->rt_spec_dst= spec_dst;
2074         rth->rt_peer_genid = 0;
2075         rth->peer = NULL;
2076         rth->fi = NULL;
2077         if (our) {
2078                 rth->dst.input= ip_local_deliver;
2079                 rth->rt_flags |= RTCF_LOCAL;
2080         }
2081
2082 #ifdef CONFIG_IP_MROUTE
2083         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2084                 rth->dst.input = ip_mr_input;
2085 #endif
2086         RT_CACHE_STAT_INC(in_slow_mc);
2087
2088         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2089         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2090         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2091
2092 e_nobufs:
2093         return -ENOBUFS;
2094 e_inval:
2095         return -EINVAL;
2096 e_err:
2097         return err;
2098 }
2099
2100
2101 static void ip_handle_martian_source(struct net_device *dev,
2102                                      struct in_device *in_dev,
2103                                      struct sk_buff *skb,
2104                                      __be32 daddr,
2105                                      __be32 saddr)
2106 {
2107         RT_CACHE_STAT_INC(in_martian_src);
2108 #ifdef CONFIG_IP_ROUTE_VERBOSE
2109         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2110                 /*
2111                  *      RFC1812 recommendation: if the source is martian,
2112                  *      the only hint is the MAC header.
2113                  */
2114                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2115                         &daddr, &saddr, dev->name);
2116                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2117                         int i;
2118                         const unsigned char *p = skb_mac_header(skb);
2119                         printk(KERN_WARNING "ll header: ");
2120                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2121                                 printk("%02x", *p);
2122                                 if (i < (dev->hard_header_len - 1))
2123                                         printk(":");
2124                         }
2125                         printk("\n");
2126                 }
2127         }
2128 #endif
2129 }
2130
2131 /* called in rcu_read_lock() section */
2132 static int __mkroute_input(struct sk_buff *skb,
2133                            const struct fib_result *res,
2134                            struct in_device *in_dev,
2135                            __be32 daddr, __be32 saddr, u32 tos,
2136                            struct rtable **result)
2137 {
2138         struct rtable *rth;
2139         int err;
2140         struct in_device *out_dev;
2141         unsigned int flags = 0;
2142         __be32 spec_dst;
2143         u32 itag;
2144
2145         /* get a working reference to the output device */
2146         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2147         if (out_dev == NULL) {
2148                 if (net_ratelimit())
2149                         printk(KERN_CRIT "Bug in ip_route_input" \
2150                                "_slow(). Please, report\n");
2151                 return -EINVAL;
2152         }
2153
2154
2155         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2156                                   in_dev->dev, &spec_dst, &itag);
2157         if (err < 0) {
2158                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2159                                          saddr);
2160
2161                 goto cleanup;
2162         }
2163
2164         if (err)
2165                 flags |= RTCF_DIRECTSRC;
2166
2167         if (out_dev == in_dev && err &&
2168             (IN_DEV_SHARED_MEDIA(out_dev) ||
2169              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2170                 flags |= RTCF_DOREDIRECT;
2171
2172         if (skb->protocol != htons(ETH_P_IP)) {
2173                 /* Not IP (i.e. ARP). Do not create a route if it is
2174                  * invalid for proxy arp. DNAT routes are always valid.
2175                  *
2176                  * The proxy arp feature has been extended to allow ARP
2177                  * replies back out the same interface, to support
2178                  * Private VLAN switch technologies. See arp.c.
2179                  */
2180                 if (out_dev == in_dev &&
2181                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2182                         err = -EINVAL;
2183                         goto cleanup;
2184                 }
2185         }
2186
2187         rth = rt_dst_alloc(out_dev->dev,
2188                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2189                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2190         if (!rth) {
2191                 err = -ENOBUFS;
2192                 goto cleanup;
2193         }
2194
2195         rth->rt_key_dst = daddr;
2196         rth->rt_key_src = saddr;
2197         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2198         rth->rt_flags = flags;
2199         rth->rt_type = res->type;
2200         rth->rt_key_tos = tos;
2201         rth->rt_dst     = daddr;
2202         rth->rt_src     = saddr;
2203         rth->rt_route_iif = in_dev->dev->ifindex;
2204         rth->rt_iif     = in_dev->dev->ifindex;
2205         rth->rt_oif     = 0;
2206         rth->rt_mark    = skb->mark;
2207         rth->rt_gateway = daddr;
2208         rth->rt_spec_dst= spec_dst;
2209         rth->rt_peer_genid = 0;
2210         rth->peer = NULL;
2211         rth->fi = NULL;
2212
2213         rth->dst.input = ip_forward;
2214         rth->dst.output = ip_output;
2215
2216         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2217
2218         *result = rth;
2219         err = 0;
2220  cleanup:
2221         return err;
2222 }
2223
2224 static int ip_mkroute_input(struct sk_buff *skb,
2225                             struct fib_result *res,
2226                             const struct flowi4 *fl4,
2227                             struct in_device *in_dev,
2228                             __be32 daddr, __be32 saddr, u32 tos)
2229 {
2230         struct rtable* rth = NULL;
2231         int err;
2232         unsigned hash;
2233
2234 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2235         if (res->fi && res->fi->fib_nhs > 1)
2236                 fib_select_multipath(res);
2237 #endif
2238
2239         /* create a routing cache entry */
2240         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2241         if (err)
2242                 return err;
2243
2244         /* put it into the cache */
2245         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2246                        rt_genid(dev_net(rth->dst.dev)));
2247         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2248         if (IS_ERR(rth))
2249                 return PTR_ERR(rth);
2250         return 0;
2251 }
2252
2253 /*
2254  *      NOTE. We drop all packets that have a local source
2255  *      address, because every properly looped-back packet
2256  *      must already have the correct destination attached by the output routine.
2257  *
2258  *      This approach solves two big problems:
2259  *      1. Non-simplex devices are handled properly.
2260  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2261  *      Called with rcu_read_lock().
2262  */
2263
2264 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2265                                u8 tos, struct net_device *dev)
2266 {
2267         struct fib_result res;
2268         struct in_device *in_dev = __in_dev_get_rcu(dev);
2269         struct flowi4   fl4;
2270         unsigned        flags = 0;
2271         u32             itag = 0;
2272         struct rtable * rth;
2273         unsigned        hash;
2274         __be32          spec_dst;
2275         int             err = -EINVAL;
2276         struct net    * net = dev_net(dev);
2277
2278         /* IP on this device is disabled. */
2279
2280         if (!in_dev)
2281                 goto out;
2282
2283         /* Check for the most weird martians, which cannot be detected
2284            by fib_lookup.
2285          */
2286
2287         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2288             ipv4_is_loopback(saddr))
2289                 goto martian_source;
2290
2291         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2292                 goto brd_input;
2293
2294         /* Accept zero addresses only to limited broadcast;
2295          * I do not even know whether to fix it or not. Waiting for complaints :-)
2296          */
2297         if (ipv4_is_zeronet(saddr))
2298                 goto martian_source;
2299
2300         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2301                 goto martian_destination;
2302
2303         /*
2304          *      Now we are ready to route the packet.
2305          */
2306         fl4.flowi4_oif = 0;
2307         fl4.flowi4_iif = dev->ifindex;
2308         fl4.flowi4_mark = skb->mark;
2309         fl4.flowi4_tos = tos;
2310         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2311         fl4.daddr = daddr;
2312         fl4.saddr = saddr;
2313         err = fib_lookup(net, &fl4, &res);
2314         if (err != 0) {
2315                 if (!IN_DEV_FORWARD(in_dev))
2316                         goto e_hostunreach;
2317                 goto no_route;
2318         }
2319
2320         RT_CACHE_STAT_INC(in_slow_tot);
2321
2322         if (res.type == RTN_BROADCAST)
2323                 goto brd_input;
2324
2325         if (res.type == RTN_LOCAL) {
2326                 err = fib_validate_source(skb, saddr, daddr, tos,
2327                                           net->loopback_dev->ifindex,
2328                                           dev, &spec_dst, &itag);
2329                 if (err < 0)
2330                         goto martian_source_keep_err;
2331                 if (err)
2332                         flags |= RTCF_DIRECTSRC;
2333                 spec_dst = daddr;
2334                 goto local_input;
2335         }
2336
2337         if (!IN_DEV_FORWARD(in_dev))
2338                 goto e_hostunreach;
2339         if (res.type != RTN_UNICAST)
2340                 goto martian_destination;
2341
2342         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2343 out:    return err;
2344
2345 brd_input:
2346         if (skb->protocol != htons(ETH_P_IP))
2347                 goto e_inval;
2348
2349         if (ipv4_is_zeronet(saddr))
2350                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2351         else {
2352                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2353                                           &itag);
2354                 if (err < 0)
2355                         goto martian_source_keep_err;
2356                 if (err)
2357                         flags |= RTCF_DIRECTSRC;
2358         }
2359         flags |= RTCF_BROADCAST;
2360         res.type = RTN_BROADCAST;
2361         RT_CACHE_STAT_INC(in_brd);
2362
2363 local_input:
2364         rth = rt_dst_alloc(net->loopback_dev,
2365                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2366         if (!rth)
2367                 goto e_nobufs;
2368
2369         rth->dst.input= ip_local_deliver;
2370         rth->dst.output= ip_rt_bug;
2371 #ifdef CONFIG_IP_ROUTE_CLASSID
2372         rth->dst.tclassid = itag;
2373 #endif
2374
2375         rth->rt_key_dst = daddr;
2376         rth->rt_key_src = saddr;
2377         rth->rt_genid = rt_genid(net);
2378         rth->rt_flags   = flags|RTCF_LOCAL;
2379         rth->rt_type    = res.type;
2380         rth->rt_key_tos = tos;
2381         rth->rt_dst     = daddr;
2382         rth->rt_src     = saddr;
2383 #ifdef CONFIG_IP_ROUTE_CLASSID
2384         rth->dst.tclassid = itag;
2385 #endif
2386         rth->rt_route_iif = dev->ifindex;
2387         rth->rt_iif     = dev->ifindex;
2388         rth->rt_oif     = 0;
2389         rth->rt_mark    = skb->mark;
2390         rth->rt_gateway = daddr;
2391         rth->rt_spec_dst= spec_dst;
2392         rth->rt_peer_genid = 0;
2393         rth->peer = NULL;
2394         rth->fi = NULL;
2395         if (res.type == RTN_UNREACHABLE) {
2396                 rth->dst.input= ip_error;
2397                 rth->dst.error= -err;
2398                 rth->rt_flags   &= ~RTCF_LOCAL;
2399         }
2400         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2401         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2402         err = 0;
2403         if (IS_ERR(rth))
2404                 err = PTR_ERR(rth);
2405         goto out;
2406
2407 no_route:
2408         RT_CACHE_STAT_INC(in_no_route);
2409         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2410         res.type = RTN_UNREACHABLE;
2411         if (err == -ESRCH)
2412                 err = -ENETUNREACH;
2413         goto local_input;
2414
2415         /*
2416          *      Do not cache martian addresses: they should be logged (RFC1812)
2417          */
2418 martian_destination:
2419         RT_CACHE_STAT_INC(in_martian_dst);
2420 #ifdef CONFIG_IP_ROUTE_VERBOSE
2421         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2422                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2423                         &daddr, &saddr, dev->name);
2424 #endif
2425
2426 e_hostunreach:
2427         err = -EHOSTUNREACH;
2428         goto out;
2429
2430 e_inval:
2431         err = -EINVAL;
2432         goto out;
2433
2434 e_nobufs:
2435         err = -ENOBUFS;
2436         goto out;
2437
2438 martian_source:
2439         err = -EINVAL;
2440 martian_source_keep_err:
2441         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2442         goto out;
2443 }
2444
2445 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2446                            u8 tos, struct net_device *dev, bool noref)
2447 {
2448         struct rtable * rth;
2449         unsigned        hash;
2450         int iif = dev->ifindex;
2451         struct net *net;
2452         int res;
2453
2454         net = dev_net(dev);
2455
2456         rcu_read_lock();
2457
2458         if (!rt_caching(net))
2459                 goto skip_cache;
2460
2461         tos &= IPTOS_RT_MASK;
2462         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2463
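        /* One branch-free key compare: each XOR term is zero only when that
         * field matches, so OR-ing them together and testing against zero
         * checks dst, src, incoming ifindex and tos in a single comparison.
         */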
2464         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2465              rth = rcu_dereference(rth->dst.rt_next)) {
2466                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2467                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2468                      (rth->rt_route_iif ^ iif) |
2469                      (rth->rt_key_tos ^ tos)) == 0 &&
2470                     rth->rt_mark == skb->mark &&
2471                     net_eq(dev_net(rth->dst.dev), net) &&
2472                     !rt_is_expired(rth)) {
2473                         ipv4_validate_peer(rth);
2474                         if (noref) {
2475                                 dst_use_noref(&rth->dst, jiffies);
2476                                 skb_dst_set_noref(skb, &rth->dst);
2477                         } else {
2478                                 dst_use(&rth->dst, jiffies);
2479                                 skb_dst_set(skb, &rth->dst);
2480                         }
2481                         RT_CACHE_STAT_INC(in_hit);
2482                         rcu_read_unlock();
2483                         return 0;
2484                 }
2485                 RT_CACHE_STAT_INC(in_hlist_search);
2486         }
2487
2488 skip_cache:
2489         /* Multicast recognition logic is moved from the route cache to here.
2490            The problem was that too many Ethernet cards have broken/missing
2491            hardware multicast filters :-( As a result, a host on a multicast
2492            network acquires a lot of useless route cache entries, e.g. from
2493            SDR messages from all over the world. Now we try to get rid of them.
2494            Really, provided the software IP multicast filter is organized
2495            reasonably (at least, hashed), it does not result in a slowdown
2496            compared with route cache reject entries.
2497            Note that multicast routers are not affected, because a
2498            route cache entry is created eventually.
2499          */
2500         if (ipv4_is_multicast(daddr)) {
2501                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2502
2503                 if (in_dev) {
2504                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2505                                                   ip_hdr(skb)->protocol);
2506                         if (our
2507 #ifdef CONFIG_IP_MROUTE
2508                                 ||
2509                             (!ipv4_is_local_multicast(daddr) &&
2510                              IN_DEV_MFORWARD(in_dev))
2511 #endif
2512                            ) {
2513                                 int res = ip_route_input_mc(skb, daddr, saddr,
2514                                                             tos, dev, our);
2515                                 rcu_read_unlock();
2516                                 return res;
2517                         }
2518                 }
2519                 rcu_read_unlock();
2520                 return -EINVAL;
2521         }
2522         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2523         rcu_read_unlock();
2524         return res;
2525 }
2526 EXPORT_SYMBOL(ip_route_input_common);
2527
2528 /* called with rcu_read_lock() */
2529 static struct rtable *__mkroute_output(const struct fib_result *res,
2530                                        const struct flowi4 *fl4,
2531                                        __be32 orig_daddr, __be32 orig_saddr,
2532                                        int orig_oif, __u8 orig_rtos,
2533                                        struct net_device *dev_out,
2534                                        unsigned int flags)
2535 {
2536         struct fib_info *fi = res->fi;
2537         struct in_device *in_dev;
2538         u16 type = res->type;
2539         struct rtable *rth;
2540
2541         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2542                 return ERR_PTR(-EINVAL);
2543
2544         if (ipv4_is_lbcast(fl4->daddr))
2545                 type = RTN_BROADCAST;
2546         else if (ipv4_is_multicast(fl4->daddr))
2547                 type = RTN_MULTICAST;
2548         else if (ipv4_is_zeronet(fl4->daddr))
2549                 return ERR_PTR(-EINVAL);
2550
2551         if (dev_out->flags & IFF_LOOPBACK)
2552                 flags |= RTCF_LOCAL;
2553
2554         in_dev = __in_dev_get_rcu(dev_out);
2555         if (!in_dev)
2556                 return ERR_PTR(-EINVAL);
2557
2558         if (type == RTN_BROADCAST) {
2559                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2560                 fi = NULL;
2561         } else if (type == RTN_MULTICAST) {
2562                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2563                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2564                                      fl4->flowi4_proto))
2565                         flags &= ~RTCF_LOCAL;
2566                 /* If a multicast route does not exist, use the
2567                  * default one, but do not gateway in this case.
2568                  * Yes, it is a hack.
2569                  */
2570                 if (fi && res->prefixlen < 4)
2571                         fi = NULL;
2572         }
2573
2574         rth = rt_dst_alloc(dev_out,
2575                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2576                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2577         if (!rth)
2578                 return ERR_PTR(-ENOBUFS);
2579
2580         rth->dst.output = ip_output;
2581
2582         rth->rt_key_dst = orig_daddr;
2583         rth->rt_key_src = orig_saddr;
2584         rth->rt_genid = rt_genid(dev_net(dev_out));
2585         rth->rt_flags   = flags;
2586         rth->rt_type    = type;
2587         rth->rt_key_tos = orig_rtos;
2588         rth->rt_dst     = fl4->daddr;
2589         rth->rt_src     = fl4->saddr;
2590         rth->rt_route_iif = 0;
2591         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2592         rth->rt_oif     = orig_oif;
2593         rth->rt_mark    = fl4->flowi4_mark;
2594         rth->rt_gateway = fl4->daddr;
2595         rth->rt_spec_dst= fl4->saddr;
2596         rth->rt_peer_genid = 0;
2597         rth->peer = NULL;
2598         rth->fi = NULL;
2599
2600         RT_CACHE_STAT_INC(out_slow_tot);
2601
2602         if (flags & RTCF_LOCAL) {
2603                 rth->dst.input = ip_local_deliver;
2604                 rth->rt_spec_dst = fl4->daddr;
2605         }
2606         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2607                 rth->rt_spec_dst = fl4->saddr;
2608                 if (flags & RTCF_LOCAL &&
2609                     !(dev_out->flags & IFF_LOOPBACK)) {
2610                         rth->dst.output = ip_mc_output;
2611                         RT_CACHE_STAT_INC(out_slow_mc);
2612                 }
2613 #ifdef CONFIG_IP_MROUTE
2614                 if (type == RTN_MULTICAST) {
2615                         if (IN_DEV_MFORWARD(in_dev) &&
2616                             !ipv4_is_local_multicast(fl4->daddr)) {
2617                                 rth->dst.input = ip_mr_input;
2618                                 rth->dst.output = ip_mc_output;
2619                         }
2620                 }
2621 #endif
2622         }
2623
2624         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2625
2626         return rth;
2627 }
2628
2629 /*
2630  * Major route resolver routine.
2631  * called with rcu_read_lock();
2632  */
2633
2634 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2635 {
2636         struct net_device *dev_out = NULL;
2637         __u8 tos = RT_FL_TOS(fl4);
2638         unsigned int flags = 0;
2639         struct fib_result res;
2640         struct rtable *rth;
2641         __be32 orig_daddr;
2642         __be32 orig_saddr;
2643         int orig_oif;
2644
2645         res.fi          = NULL;
2646 #ifdef CONFIG_IP_MULTIPLE_TABLES
2647         res.r           = NULL;
2648 #endif
2649
2650         orig_daddr = fl4->daddr;
2651         orig_saddr = fl4->saddr;
2652         orig_oif = fl4->flowi4_oif;
2653
2654         fl4->flowi4_iif = net->loopback_dev->ifindex;
2655         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2656         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2657                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2658
2659         rcu_read_lock();
2660         if (fl4->saddr) {
2661                 rth = ERR_PTR(-EINVAL);
2662                 if (ipv4_is_multicast(fl4->saddr) ||
2663                     ipv4_is_lbcast(fl4->saddr) ||
2664                     ipv4_is_zeronet(fl4->saddr))
2665                         goto out;
2666
2667                 /* I removed the check for oif == dev_out->oif here.
2668                    It was wrong for two reasons:
2669                    1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2670                       is assigned to multiple interfaces.
2671                    2. Moreover, we are allowed to send packets with a saddr
2672                       of another iface. --ANK
2673                  */
2674
2675                 if (fl4->flowi4_oif == 0 &&
2676                     (ipv4_is_multicast(fl4->daddr) ||
2677                      ipv4_is_lbcast(fl4->daddr))) {
2678                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2679                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2680                         if (dev_out == NULL)
2681                                 goto out;
2682
2683                         /* Special hack: the user can direct multicasts
2684                            and limited broadcast via the necessary interface
2685                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2686                            This hack is not just for fun, it allows
2687                            vic, vat and friends to work.
2688                            They bind the socket to loopback, set ttl to zero
2689                            and expect that it will work.
2690                            From the viewpoint of the routing cache they are broken,
2691                            because we are not allowed to build a multicast path
2692                            with a loopback source addr (look, the routing cache
2693                            cannot know that ttl is zero, so the packet
2694                            will not leave this host and the route is valid).
2695                            Luckily, this hack is a good workaround.
2696                          */
2697
2698                         fl4->flowi4_oif = dev_out->ifindex;
2699                         goto make_route;
2700                 }
2701
2702                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2703                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2704                         if (!__ip_dev_find(net, fl4->saddr, false))
2705                                 goto out;
2706                 }
2707         }
2708
2709
2710         if (fl4->flowi4_oif) {
2711                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2712                 rth = ERR_PTR(-ENODEV);
2713                 if (dev_out == NULL)
2714                         goto out;
2715
2716                 /* RACE: Check return value of inet_select_addr instead. */
2717                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2718                         rth = ERR_PTR(-ENETUNREACH);
2719                         goto out;
2720                 }
2721                 if (ipv4_is_local_multicast(fl4->daddr) ||
2722                     ipv4_is_lbcast(fl4->daddr)) {
2723                         if (!fl4->saddr)
2724                                 fl4->saddr = inet_select_addr(dev_out, 0,
2725                                                               RT_SCOPE_LINK);
2726                         goto make_route;
2727                 }
2728                 if (fl4->saddr) {
2729                         if (ipv4_is_multicast(fl4->daddr))
2730                                 fl4->saddr = inet_select_addr(dev_out, 0,
2731                                                               fl4->flowi4_scope);
2732                         else if (!fl4->daddr)
2733                                 fl4->saddr = inet_select_addr(dev_out, 0,
2734                                                               RT_SCOPE_HOST);
2735                 }
2736         }
2737
2738         if (!fl4->daddr) {
2739                 fl4->daddr = fl4->saddr;
2740                 if (!fl4->daddr)
2741                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2742                 dev_out = net->loopback_dev;
2743                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2744                 res.type = RTN_LOCAL;
2745                 flags |= RTCF_LOCAL;
2746                 goto make_route;
2747         }
2748
2749         if (fib_lookup(net, fl4, &res)) {
2750                 res.fi = NULL;
2751                 if (fl4->flowi4_oif) {
2752                         /* Apparently, the routing tables are wrong. Assume
2753                            that the destination is on link.
2754
2755                            WHY? DW.
2756                            Because we are allowed to send to an iface
2757                            even if it has NO routes and NO assigned
2758                            addresses. When oif is specified, the routing
2759                            tables are looked up with only one purpose:
2760                            to catch whether the destination is gatewayed rather than
2761                            direct. Moreover, if MSG_DONTROUTE is set,
2762                            we send the packet, ignoring both the routing tables
2763                            and the ifaddr state. --ANK
2764
2765
2766                            We could make it even if oif is unknown,
2767                            likely IPv6, but we do not.
2768                          */
2769
2770                         if (fl4->saddr == 0)
2771                                 fl4->saddr = inet_select_addr(dev_out, 0,
2772                                                               RT_SCOPE_LINK);
2773                         res.type = RTN_UNICAST;
2774                         goto make_route;
2775                 }
2776                 rth = ERR_PTR(-ENETUNREACH);
2777                 goto out;
2778         }
2779
2780         if (res.type == RTN_LOCAL) {
2781                 if (!fl4->saddr) {
2782                         if (res.fi->fib_prefsrc)
2783                                 fl4->saddr = res.fi->fib_prefsrc;
2784                         else
2785                                 fl4->saddr = fl4->daddr;
2786                 }
2787                 dev_out = net->loopback_dev;
2788                 fl4->flowi4_oif = dev_out->ifindex;
2789                 res.fi = NULL;
2790                 flags |= RTCF_LOCAL;
2791                 goto make_route;
2792         }
2793
2794 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2795         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2796                 fib_select_multipath(&res);
2797         else
2798 #endif
2799         if (!res.prefixlen &&
2800             res.table->tb_num_default > 1 &&
2801             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2802                 fib_select_default(&res);
2803
2804         if (!fl4->saddr)
2805                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2806
2807         dev_out = FIB_RES_DEV(res);
2808         fl4->flowi4_oif = dev_out->ifindex;
2809
2810
2811 make_route:
2812         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2813                                tos, dev_out, flags);
2814         if (!IS_ERR(rth)) {
2815                 unsigned int hash;
2816
2817                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2818                                rt_genid(dev_net(dev_out)));
2819                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2820         }
2821
2822 out:
2823         rcu_read_unlock();
2824         return rth;
2825 }
2826
2827 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2828 {
2829         struct rtable *rth;
2830         unsigned int hash;
2831
2832         if (!rt_caching(net))
2833                 goto slow_output;
2834
2835         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2836
2837         rcu_read_lock_bh();
2838         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2839                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2840                 if (rth->rt_key_dst == flp4->daddr &&
2841                     rth->rt_key_src == flp4->saddr &&
2842                     rt_is_output_route(rth) &&
2843                     rth->rt_oif == flp4->flowi4_oif &&
2844                     rth->rt_mark == flp4->flowi4_mark &&
2845                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2846                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2847                     net_eq(dev_net(rth->dst.dev), net) &&
2848                     !rt_is_expired(rth)) {
2849                         ipv4_validate_peer(rth);
2850                         dst_use(&rth->dst, jiffies);
2851                         RT_CACHE_STAT_INC(out_hit);
2852                         rcu_read_unlock_bh();
2853                         if (!flp4->saddr)
2854                                 flp4->saddr = rth->rt_src;
2855                         if (!flp4->daddr)
2856                                 flp4->daddr = rth->rt_dst;
2857                         return rth;
2858                 }
2859                 RT_CACHE_STAT_INC(out_hlist_search);
2860         }
2861         rcu_read_unlock_bh();
2862
2863 slow_output:
2864         return ip_route_output_slow(net, flp4);
2865 }
2866 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2867
2868 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2869 {
2870         return NULL;
2871 }
2872
2873 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2874 {
2875         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2876
2877         return mtu ? : dst->dev->mtu;
2878 }
2879
2880 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2881 {
2882 }
2883
2884 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2885                                           unsigned long old)
2886 {
2887         return NULL;
2888 }
2889
2890 static struct dst_ops ipv4_dst_blackhole_ops = {
2891         .family                 =       AF_INET,
2892         .protocol               =       cpu_to_be16(ETH_P_IP),
2893         .destroy                =       ipv4_dst_destroy,
2894         .check                  =       ipv4_blackhole_dst_check,
2895         .mtu                    =       ipv4_blackhole_mtu,
2896         .default_advmss         =       ipv4_default_advmss,
2897         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2898         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2899         .neigh_lookup           =       ipv4_neigh_lookup,
2900 };
2901
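/* Build a "blackhole" copy of an existing route: same keys, metrics and peer,
 * but with dst_discard as both input and output handler, so anything routed
 * through it is silently dropped (the xfrm code, for instance, hands such a
 * stand-in back while no usable bundle exists yet).
 */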
2902 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2903 {
2904         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2905         struct rtable *ort = (struct rtable *) dst_orig;
2906
2907         if (rt) {
2908                 struct dst_entry *new = &rt->dst;
2909
2910                 new->__use = 1;
2911                 new->input = dst_discard;
2912                 new->output = dst_discard;
2913                 dst_copy_metrics(new, &ort->dst);
2914
2915                 new->dev = ort->dst.dev;
2916                 if (new->dev)
2917                         dev_hold(new->dev);
2918
2919                 rt->rt_key_dst = ort->rt_key_dst;
2920                 rt->rt_key_src = ort->rt_key_src;
2921                 rt->rt_key_tos = ort->rt_key_tos;
2922                 rt->rt_route_iif = ort->rt_route_iif;
2923                 rt->rt_iif = ort->rt_iif;
2924                 rt->rt_oif = ort->rt_oif;
2925                 rt->rt_mark = ort->rt_mark;
2926
2927                 rt->rt_genid = rt_genid(net);
2928                 rt->rt_flags = ort->rt_flags;
2929                 rt->rt_type = ort->rt_type;
2930                 rt->rt_dst = ort->rt_dst;
2931                 rt->rt_src = ort->rt_src;
2932                 rt->rt_gateway = ort->rt_gateway;
2933                 rt->rt_spec_dst = ort->rt_spec_dst;
2934                 rt->peer = ort->peer;
2935                 if (rt->peer)
2936                         atomic_inc(&rt->peer->refcnt);
2937                 rt->fi = ort->fi;
2938                 if (rt->fi)
2939                         atomic_inc(&rt->fi->fib_clntref);
2940
2941                 dst_free(new);
2942         }
2943
2944         dst_release(dst_orig);
2945
2946         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2947 }
2948
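/*
 * Like __ip_route_output_key(), but if the flow carries a protocol the
 * result is also passed through xfrm_lookup() so a matching IPsec policy
 * can transform the route.
 */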
2949 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2950                                     struct sock *sk)
2951 {
2952         struct rtable *rt = __ip_route_output_key(net, flp4);
2953
2954         if (IS_ERR(rt))
2955                 return rt;
2956
2957         if (flp4->flowi4_proto)
2958                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2959                                                    flowi4_to_flowi(flp4),
2960                                                    sk, 0);
2961
2962         return rt;
2963 }
2964 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2965
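/*
 * Build an RTM_NEWROUTE netlink message for a cached route: routing key,
 * output device, preferred source, gateway, metrics and mark, plus cache
 * info (IP id, TCP timestamp, PMTU expiry) taken from the bound inet_peer.
 * Multicast input routes may be resolved via ipmr_get_route() when
 * multicast forwarding is enabled.
 */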
2966 static int rt_fill_info(struct net *net,
2967                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2968                         int nowait, unsigned int flags)
2969 {
2970         struct rtable *rt = skb_rtable(skb);
2971         struct rtmsg *r;
2972         struct nlmsghdr *nlh;
2973         unsigned long expires = 0;
2974         const struct inet_peer *peer = rt->peer;
2975         u32 id = 0, ts = 0, tsage = 0, error;
2976
2977         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2978         if (nlh == NULL)
2979                 return -EMSGSIZE;
2980
2981         r = nlmsg_data(nlh);
2982         r->rtm_family    = AF_INET;
2983         r->rtm_dst_len  = 32;
2984         r->rtm_src_len  = 0;
2985         r->rtm_tos      = rt->rt_key_tos;
2986         r->rtm_table    = RT_TABLE_MAIN;
2987         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2988         r->rtm_type     = rt->rt_type;
2989         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2990         r->rtm_protocol = RTPROT_UNSPEC;
2991         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2992         if (rt->rt_flags & RTCF_NOTIFY)
2993                 r->rtm_flags |= RTM_F_NOTIFY;
2994
2995         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2996
2997         if (rt->rt_key_src) {
2998                 r->rtm_src_len = 32;
2999                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
3000         }
3001         if (rt->dst.dev)
3002                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
3003 #ifdef CONFIG_IP_ROUTE_CLASSID
3004         if (rt->dst.tclassid)
3005                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3006 #endif
3007         if (rt_is_input_route(rt))
3008                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3009         else if (rt->rt_src != rt->rt_key_src)
3010                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3011
3012         if (rt->rt_dst != rt->rt_gateway)
3013                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3014
3015         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3016                 goto nla_put_failure;
3017
3018         if (rt->rt_mark)
3019                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3020
3021         error = rt->dst.error;
3022         if (peer) {
3023                 inet_peer_refcheck(rt->peer);
3024                 id = atomic_read(&peer->ip_id_count) & 0xffff;
3025                 if (peer->tcp_ts_stamp) {
3026                         ts = peer->tcp_ts;
3027                         tsage = get_seconds() - peer->tcp_ts_stamp;
3028                 }
3029                 expires = ACCESS_ONCE(peer->pmtu_expires);
3030                 if (expires) {
3031                         if (time_before(jiffies, expires))
3032                                 expires -= jiffies;
3033                         else
3034                                 expires = 0;
3035                 }
3036         }
3037
3038         if (rt_is_input_route(rt)) {
3039 #ifdef CONFIG_IP_MROUTE
3040                 __be32 dst = rt->rt_dst;
3041
3042                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3043                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3044                         int err = ipmr_get_route(net, skb,
3045                                                  rt->rt_src, rt->rt_dst,
3046                                                  r, nowait);
3047                         if (err <= 0) {
3048                                 if (!nowait) {
3049                                         if (err == 0)
3050                                                 return 0;
3051                                         goto nla_put_failure;
3052                                 } else {
3053                                         if (err == -EMSGSIZE)
3054                                                 goto nla_put_failure;
3055                                         error = err;
3056                                 }
3057                         }
3058                 } else
3059 #endif
3060                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3061         }
3062
3063         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3064                                expires, error) < 0)
3065                 goto nla_put_failure;
3066
3067         return nlmsg_end(skb, nlh);
3068
3069 nla_put_failure:
3070         nlmsg_cancel(skb, nlh);
3071         return -EMSGSIZE;
3072 }
3073
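/*
 * RTM_GETROUTE handler: parse the request attributes, build a minimal
 * dummy skb, perform either an input route lookup (when RTA_IIF is given)
 * or an output lookup, and unicast the result back to the requester via
 * rt_fill_info().
 */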
3074 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3075 {
3076         struct net *net = sock_net(in_skb->sk);
3077         struct rtmsg *rtm;
3078         struct nlattr *tb[RTA_MAX+1];
3079         struct rtable *rt = NULL;
3080         __be32 dst = 0;
3081         __be32 src = 0;
3082         u32 iif;
3083         int err;
3084         int mark;
3085         struct sk_buff *skb;
3086
3087         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3088         if (err < 0)
3089                 goto errout;
3090
3091         rtm = nlmsg_data(nlh);
3092
3093         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3094         if (skb == NULL) {
3095                 err = -ENOBUFS;
3096                 goto errout;
3097         }
3098
3099         /* Reserve room for dummy headers; this skb can pass
3100            through a good chunk of the routing engine.
3101          */
3102         skb_reset_mac_header(skb);
3103         skb_reset_network_header(skb);
3104
3105         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3106         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3107         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3108
3109         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3110         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3111         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3112         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3113
3114         if (iif) {
3115                 struct net_device *dev;
3116
3117                 dev = __dev_get_by_index(net, iif);
3118                 if (dev == NULL) {
3119                         err = -ENODEV;
3120                         goto errout_free;
3121                 }
3122
3123                 skb->protocol   = htons(ETH_P_IP);
3124                 skb->dev        = dev;
3125                 skb->mark       = mark;
3126                 local_bh_disable();
3127                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3128                 local_bh_enable();
3129
3130                 rt = skb_rtable(skb);
3131                 if (err == 0 && rt->dst.error)
3132                         err = -rt->dst.error;
3133         } else {
3134                 struct flowi4 fl4 = {
3135                         .daddr = dst,
3136                         .saddr = src,
3137                         .flowi4_tos = rtm->rtm_tos,
3138                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3139                         .flowi4_mark = mark,
3140                 };
3141                 rt = ip_route_output_key(net, &fl4);
3142
3143                 err = 0;
3144                 if (IS_ERR(rt))
3145                         err = PTR_ERR(rt);
3146         }
3147
3148         if (err)
3149                 goto errout_free;
3150
3151         skb_dst_set(skb, &rt->dst);
3152         if (rtm->rtm_flags & RTM_F_NOTIFY)
3153                 rt->rt_flags |= RTCF_NOTIFY;
3154
3155         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3156                            RTM_NEWROUTE, 0, 0);
3157         if (err <= 0)
3158                 goto errout_free;
3159
3160         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3161 errout:
3162         return err;
3163
3164 errout_free:
3165         kfree_skb(skb);
3166         goto errout;
3167 }
3168
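/*
 * Netlink dump callback: walk every route cache hash chain under
 * rcu_read_lock_bh() and emit one RTM_NEWROUTE message per live entry,
 * resuming from the bucket and index saved in cb->args[].
 */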
3169 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3170 {
3171         struct rtable *rt;
3172         int h, s_h;
3173         int idx, s_idx;
3174         struct net *net;
3175
3176         net = sock_net(skb->sk);
3177
3178         s_h = cb->args[0];
3179         if (s_h < 0)
3180                 s_h = 0;
3181         s_idx = idx = cb->args[1];
3182         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3183                 if (!rt_hash_table[h].chain)
3184                         continue;
3185                 rcu_read_lock_bh();
3186                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3187                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3188                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3189                                 continue;
3190                         if (rt_is_expired(rt))
3191                                 continue;
3192                         skb_dst_set_noref(skb, &rt->dst);
3193                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3194                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3195                                          1, NLM_F_MULTI) <= 0) {
3196                                 skb_dst_drop(skb);
3197                                 rcu_read_unlock_bh();
3198                                 goto done;
3199                         }
3200                         skb_dst_drop(skb);
3201                 }
3202                 rcu_read_unlock_bh();
3203         }
3204
3205 done:
3206         cb->args[0] = h;
3207         cb->args[1] = idx;
3208         return skb->len;
3209 }
3210
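/* Multicast configuration changed on a device: flush the route cache. */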
3211 void ip_rt_multicast_event(struct in_device *in_dev)
3212 {
3213         rt_cache_flush(dev_net(in_dev->dev), 0);
3214 }
3215
3216 #ifdef CONFIG_SYSCTL
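/*
 * Handler for the write-only "flush" sysctl: copy the table so the value
 * written by the user lands in a local flush_delay, then flush the
 * per-namespace route cache with that delay.
 */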
3217 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3218                                         void __user *buffer,
3219                                         size_t *lenp, loff_t *ppos)
3220 {
3221         if (write) {
3222                 int flush_delay;
3223                 ctl_table ctl;
3224                 struct net *net;
3225
3226                 memcpy(&ctl, __ctl, sizeof(ctl));
3227                 ctl.data = &flush_delay;
3228                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3229
3230                 net = (struct net *)__ctl->extra1;
3231                 rt_cache_flush(net, flush_delay);
3232                 return 0;
3233         }
3234
3235         return -EINVAL;
3236 }
3237
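/* Route cache tunables exposed under /proc/sys/net/ipv4/route/. */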
3238 static ctl_table ipv4_route_table[] = {
3239         {
3240                 .procname       = "gc_thresh",
3241                 .data           = &ipv4_dst_ops.gc_thresh,
3242                 .maxlen         = sizeof(int),
3243                 .mode           = 0644,
3244                 .proc_handler   = proc_dointvec,
3245         },
3246         {
3247                 .procname       = "max_size",
3248                 .data           = &ip_rt_max_size,
3249                 .maxlen         = sizeof(int),
3250                 .mode           = 0644,
3251                 .proc_handler   = proc_dointvec,
3252         },
3253         {
3254                 /*  Deprecated. Use gc_min_interval_ms */
3255
3256                 .procname       = "gc_min_interval",
3257                 .data           = &ip_rt_gc_min_interval,
3258                 .maxlen         = sizeof(int),
3259                 .mode           = 0644,
3260                 .proc_handler   = proc_dointvec_jiffies,
3261         },
3262         {
3263                 .procname       = "gc_min_interval_ms",
3264                 .data           = &ip_rt_gc_min_interval,
3265                 .maxlen         = sizeof(int),
3266                 .mode           = 0644,
3267                 .proc_handler   = proc_dointvec_ms_jiffies,
3268         },
3269         {
3270                 .procname       = "gc_timeout",
3271                 .data           = &ip_rt_gc_timeout,
3272                 .maxlen         = sizeof(int),
3273                 .mode           = 0644,
3274                 .proc_handler   = proc_dointvec_jiffies,
3275         },
3276         {
3277                 .procname       = "gc_interval",
3278                 .data           = &ip_rt_gc_interval,
3279                 .maxlen         = sizeof(int),
3280                 .mode           = 0644,
3281                 .proc_handler   = proc_dointvec_jiffies,
3282         },
3283         {
3284                 .procname       = "redirect_load",
3285                 .data           = &ip_rt_redirect_load,
3286                 .maxlen         = sizeof(int),
3287                 .mode           = 0644,
3288                 .proc_handler   = proc_dointvec,
3289         },
3290         {
3291                 .procname       = "redirect_number",
3292                 .data           = &ip_rt_redirect_number,
3293                 .maxlen         = sizeof(int),
3294                 .mode           = 0644,
3295                 .proc_handler   = proc_dointvec,
3296         },
3297         {
3298                 .procname       = "redirect_silence",
3299                 .data           = &ip_rt_redirect_silence,
3300                 .maxlen         = sizeof(int),
3301                 .mode           = 0644,
3302                 .proc_handler   = proc_dointvec,
3303         },
3304         {
3305                 .procname       = "error_cost",
3306                 .data           = &ip_rt_error_cost,
3307                 .maxlen         = sizeof(int),
3308                 .mode           = 0644,
3309                 .proc_handler   = proc_dointvec,
3310         },
3311         {
3312                 .procname       = "error_burst",
3313                 .data           = &ip_rt_error_burst,
3314                 .maxlen         = sizeof(int),
3315                 .mode           = 0644,
3316                 .proc_handler   = proc_dointvec,
3317         },
3318         {
3319                 .procname       = "gc_elasticity",
3320                 .data           = &ip_rt_gc_elasticity,
3321                 .maxlen         = sizeof(int),
3322                 .mode           = 0644,
3323                 .proc_handler   = proc_dointvec,
3324         },
3325         {
3326                 .procname       = "mtu_expires",
3327                 .data           = &ip_rt_mtu_expires,
3328                 .maxlen         = sizeof(int),
3329                 .mode           = 0644,
3330                 .proc_handler   = proc_dointvec_jiffies,
3331         },
3332         {
3333                 .procname       = "min_pmtu",
3334                 .data           = &ip_rt_min_pmtu,
3335                 .maxlen         = sizeof(int),
3336                 .mode           = 0644,
3337                 .proc_handler   = proc_dointvec,
3338         },
3339         {
3340                 .procname       = "min_adv_mss",
3341                 .data           = &ip_rt_min_advmss,
3342                 .maxlen         = sizeof(int),
3343                 .mode           = 0644,
3344                 .proc_handler   = proc_dointvec,
3345         },
3346         { }
3347 };
3348
3349 static struct ctl_table empty[1];
3350
3351 static struct ctl_table ipv4_skeleton[] =
3352 {
3353         { .procname = "route", 
3354           .mode = 0555, .child = ipv4_route_table},
3355         { .procname = "neigh", 
3356           .mode = 0555, .child = empty},
3357         { }
3358 };
3359
3360 static __net_initdata struct ctl_path ipv4_path[] = {
3361         { .procname = "net", },
3362         { .procname = "ipv4", },
3363         { },
3364 };
3365
3366 static struct ctl_table ipv4_route_flush_table[] = {
3367         {
3368                 .procname       = "flush",
3369                 .maxlen         = sizeof(int),
3370                 .mode           = 0200,
3371                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3372         },
3373         { },
3374 };
3375
3376 static __net_initdata struct ctl_path ipv4_route_path[] = {
3377         { .procname = "net", },
3378         { .procname = "ipv4", },
3379         { .procname = "route", },
3380         { },
3381 };
3382
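/*
 * Register the per-namespace "flush" sysctl.  Namespaces other than
 * init_net get their own copy of the table; extra1 is pointed at the
 * owning struct net for use by the flush handler.
 */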
3383 static __net_init int sysctl_route_net_init(struct net *net)
3384 {
3385         struct ctl_table *tbl;
3386
3387         tbl = ipv4_route_flush_table;
3388         if (!net_eq(net, &init_net)) {
3389                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3390                 if (tbl == NULL)
3391                         goto err_dup;
3392         }
3393         tbl[0].extra1 = net;
3394
3395         net->ipv4.route_hdr =
3396                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3397         if (net->ipv4.route_hdr == NULL)
3398                 goto err_reg;
3399         return 0;
3400
3401 err_reg:
3402         if (tbl != ipv4_route_flush_table)
3403                 kfree(tbl);
3404 err_dup:
3405         return -ENOMEM;
3406 }
3407
3408 static __net_exit void sysctl_route_net_exit(struct net *net)
3409 {
3410         struct ctl_table *tbl;
3411
3412         tbl = net->ipv4.route_hdr->ctl_table_arg;
3413         unregister_net_sysctl_table(net->ipv4.route_hdr);
3414         BUG_ON(tbl == ipv4_route_flush_table);
3415         kfree(tbl);
3416 }
3417
3418 static __net_initdata struct pernet_operations sysctl_route_ops = {
3419         .init = sysctl_route_net_init,
3420         .exit = sysctl_route_net_exit,
3421 };
3422 #endif
3423
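/*
 * Per-namespace init: seed the route cache generation id and the device
 * address generation id with random values.
 */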
3424 static __net_init int rt_genid_init(struct net *net)
3425 {
3426         get_random_bytes(&net->ipv4.rt_genid,
3427                          sizeof(net->ipv4.rt_genid));
3428         get_random_bytes(&net->ipv4.dev_addr_genid,
3429                          sizeof(net->ipv4.dev_addr_genid));
3430         return 0;
3431 }
3432
3433 static __net_initdata struct pernet_operations rt_genid_ops = {
3434         .init = rt_genid_init,
3435 };
3436
3437
3438 #ifdef CONFIG_IP_ROUTE_CLASSID
3439 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3440 #endif /* CONFIG_IP_ROUTE_CLASSID */
3441
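/*
 * "rhash_entries=" boot parameter: override the number of route cache
 * hash table entries allocated in ip_rt_init().
 */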
3442 static __initdata unsigned long rhash_entries;
3443 static int __init set_rhash_entries(char *str)
3444 {
3445         if (!str)
3446                 return 0;
3447         rhash_entries = simple_strtoul(str, &str, 0);
3448         return 1;
3449 }
3450 __setup("rhash_entries=", set_rhash_entries);
3451
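/*
 * Subsystem init: create the dst slab cache and entry counters, allocate
 * the route cache hash table (sizing gc_thresh and ip_rt_max_size from it),
 * start the periodic expiry worker, and register the proc files, the
 * RTM_GETROUTE handler and the sysctl tables.
 */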
3452 int __init ip_rt_init(void)
3453 {
3454         int rc = 0;
3455
3456 #ifdef CONFIG_IP_ROUTE_CLASSID
3457         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3458         if (!ip_rt_acct)
3459                 panic("IP: failed to allocate ip_rt_acct\n");
3460 #endif
3461
3462         ipv4_dst_ops.kmem_cachep =
3463                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3464                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3465
3466         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3467
3468         if (dst_entries_init(&ipv4_dst_ops) < 0)
3469                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3470
3471         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3472                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3473
3474         rt_hash_table = (struct rt_hash_bucket *)
3475                 alloc_large_system_hash("IP route cache",
3476                                         sizeof(struct rt_hash_bucket),
3477                                         rhash_entries,
3478                                         (totalram_pages >= 128 * 1024) ?
3479                                         15 : 17,
3480                                         0,
3481                                         &rt_hash_log,
3482                                         &rt_hash_mask,
3483                                         rhash_entries ? 0 : 512 * 1024);
3484         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3485         rt_hash_lock_init();
3486
3487         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3488         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3489
3490         devinet_init();
3491         ip_fib_init();
3492
3493         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3494         expires_ljiffies = jiffies;
3495         schedule_delayed_work(&expires_work,
3496                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3497
3498         if (ip_rt_proc_init())
3499                 printk(KERN_ERR "Unable to create route proc files\n");
3500 #ifdef CONFIG_XFRM
3501         xfrm_init();
3502         xfrm4_init(ip_rt_max_size);
3503 #endif
3504         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3505
3506 #ifdef CONFIG_SYSCTL
3507         register_pernet_subsys(&sysctl_route_ops);
3508 #endif
3509         register_pernet_subsys(&rt_genid_ops);
3510         return rc;
3511 }
3512
3513 #ifdef CONFIG_SYSCTL
3514 /*
3515  * We really need to sanitize the damn ipv4 init order, then all
3516  * this nonsense will go away.
3517  */
3518 void __init ip_static_sysctl_init(void)
3519 {
3520         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3521 }
3522 #endif