ipv4: move route garbage collector to work queue
net/ipv4/route.c (pandora-kernel.git)
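The point of this change is to keep the dst_ops->gc() callback cheap: rt_garbage_collect() now only schedules a work item (and reports cache overflow), while __rt_garbage_collect() performs the expensive hash-table walk from the work queue in process context. A minimal sketch of that deferral pattern, using hypothetical my_* names in place of the real ones below:

    #include <linux/workqueue.h>
    #include <net/dst.h>

    static void my_gc_worker(struct work_struct *work)
    {
            /* the expensive cache walk runs here, in process context */
    }
    static DECLARE_WORK(my_gc_work, my_gc_worker);

    static int my_gc(struct dst_ops *ops)
    {
            /* may be called from atomic context: just kick the worker */
            if (!work_pending(&my_gc_work))
                    schedule_work(&my_gc_work);
            return 0;       /* 0 lets dst_alloc() proceed; nonzero would fail it */
    }
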
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/atmclip.h>
113 #include <net/secure_seq.h>
114
115 #define RT_FL_TOS(oldflp4) \
116         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117
118 #define IP_MAX_MTU      0xFFF0
119
120 #define RT_GC_TIMEOUT (300*HZ)
121
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
125 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
126 static int ip_rt_redirect_number __read_mostly  = 9;
127 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
128 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost __read_mostly       = HZ;
130 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
131 static int ip_rt_gc_elasticity __read_mostly    = 8;
132 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
134 static int ip_rt_min_advmss __read_mostly       = 256;
135 static int rt_chain_length_max __read_mostly    = 20;
136 static int redirect_genid;
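/*
 * Most of the ip_rt_* knobs above are runtime-tunable via sysctls under
 * /proc/sys/net/ipv4/route/ (for example gc_timeout, gc_elasticity,
 * min_pmtu, mtu_expires, max_size); the exact set is defined by the
 * ctl_table registered further down in this file under CONFIG_SYSCTL.
 */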
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
148 static void              ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154 static void __rt_garbage_collect(struct work_struct *w);
155 static DECLARE_WORK(rt_gc_worker, __rt_garbage_collect);
156
157 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
158                             int how)
159 {
160 }
161
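/*
 * Copy-on-write of dst metrics: when a writable metric is needed, bind an
 * inet_peer to the route and switch dst->_metrics from the shared read-only
 * array to the peer's private copy with cmpxchg(); if another CPU wins the
 * race, reuse its pointer (unless that one is still marked read-only).
 */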
162 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
163 {
164         struct rtable *rt = (struct rtable *) dst;
165         struct inet_peer *peer;
166         u32 *p = NULL;
167
168         if (!rt->peer)
169                 rt_bind_peer(rt, rt->rt_dst, 1);
170
171         peer = rt->peer;
172         if (peer) {
173                 u32 *old_p = __DST_METRICS_PTR(old);
174                 unsigned long prev, new;
175
176                 p = peer->metrics;
177                 if (inet_metrics_new(peer))
178                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
179
180                 new = (unsigned long) p;
181                 prev = cmpxchg(&dst->_metrics, old, new);
182
183                 if (prev != old) {
184                         p = __DST_METRICS_PTR(prev);
185                         if (prev & DST_METRICS_READ_ONLY)
186                                 p = NULL;
187                 } else {
188                         if (rt->fi) {
189                                 fib_info_put(rt->fi);
190                                 rt->fi = NULL;
191                         }
192                 }
193         }
194         return p;
195 }
196
197 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
198
199 static struct dst_ops ipv4_dst_ops = {
200         .family =               AF_INET,
201         .protocol =             cpu_to_be16(ETH_P_IP),
202         .gc =                   rt_garbage_collect,
203         .check =                ipv4_dst_check,
204         .default_advmss =       ipv4_default_advmss,
205         .mtu =                  ipv4_mtu,
206         .cow_metrics =          ipv4_cow_metrics,
207         .destroy =              ipv4_dst_destroy,
208         .ifdown =               ipv4_dst_ifdown,
209         .negative_advice =      ipv4_negative_advice,
210         .link_failure =         ipv4_link_failure,
211         .update_pmtu =          ip_rt_update_pmtu,
212         .local_out =            __ip_local_out,
213         .neigh_lookup =         ipv4_neigh_lookup,
214 };
215
216 #define ECN_OR_COST(class)      TC_PRIO_##class
217
218 const __u8 ip_tos2prio[16] = {
219         TC_PRIO_BESTEFFORT,
220         ECN_OR_COST(BESTEFFORT),
221         TC_PRIO_BESTEFFORT,
222         ECN_OR_COST(BESTEFFORT),
223         TC_PRIO_BULK,
224         ECN_OR_COST(BULK),
225         TC_PRIO_BULK,
226         ECN_OR_COST(BULK),
227         TC_PRIO_INTERACTIVE,
228         ECN_OR_COST(INTERACTIVE),
229         TC_PRIO_INTERACTIVE,
230         ECN_OR_COST(INTERACTIVE),
231         TC_PRIO_INTERACTIVE_BULK,
232         ECN_OR_COST(INTERACTIVE_BULK),
233         TC_PRIO_INTERACTIVE_BULK,
234         ECN_OR_COST(INTERACTIVE_BULK)
235 };
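/*
 * Worked example (illustrative): the table is indexed by IPTOS_TOS(tos) >> 1,
 * e.g. via the rt_tos2priority() helper.  IPTOS_LOWDELAY (0x10) gives index 8
 * -> TC_PRIO_INTERACTIVE, IPTOS_THROUGHPUT (0x08) gives index 4 ->
 * TC_PRIO_BULK, and a TOS of 0 maps to TC_PRIO_BESTEFFORT.
 */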
236
237
238 /*
239  * Route cache.
240  */
241
242 /* The locking scheme is rather straightforward:
243  *
244  * 1) Read-Copy Update protects the buckets of the central route hash.
245  * 2) Only writers remove entries, and they hold the lock
246  *    as they look at rtable reference counts.
247  * 3) Only readers acquire references to rtable entries,
248  *    they do so with atomic increments and with the
249  *    lock held.
250  */
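/*
 * Illustrative sketch only (not compiled): under this scheme a lookup walks
 * a bucket with just RCU, while an insert or delete takes the per-bucket
 * spinlock:
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next))
 *		...;			(reader: lockless walk)
 *	rcu_read_unlock_bh();
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	... unlink or insert via rcu_assign_pointer() ...
 *	spin_unlock_bh(rt_hash_lock_addr(hash));	(writer: bucket lock held)
 */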
251
252 struct rt_hash_bucket {
253         struct rtable __rcu     *chain;
254 };
255
256 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
257         defined(CONFIG_PROVE_LOCKING)
258 /*
259  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
260  * The size of this table is a power of two and depends on the number of CPUs.
261  * (under lockdep, spinlock_t is quite big, so keep the size down there)
262  */
263 #ifdef CONFIG_LOCKDEP
264 # define RT_HASH_LOCK_SZ        256
265 #else
266 # if NR_CPUS >= 32
267 #  define RT_HASH_LOCK_SZ       4096
268 # elif NR_CPUS >= 16
269 #  define RT_HASH_LOCK_SZ       2048
270 # elif NR_CPUS >= 8
271 #  define RT_HASH_LOCK_SZ       1024
272 # elif NR_CPUS >= 4
273 #  define RT_HASH_LOCK_SZ       512
274 # else
275 #  define RT_HASH_LOCK_SZ       256
276 # endif
277 #endif
278
279 static spinlock_t       *rt_hash_locks;
280 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
281
282 static __init void rt_hash_lock_init(void)
283 {
284         int i;
285
286         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
287                         GFP_KERNEL);
288         if (!rt_hash_locks)
289                 panic("IP: failed to allocate rt_hash_locks\n");
290
291         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
292                 spin_lock_init(&rt_hash_locks[i]);
293 }
294 #else
295 # define rt_hash_lock_addr(slot) NULL
296
297 static inline void rt_hash_lock_init(void)
298 {
299 }
300 #endif
301
302 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
303 static unsigned                 rt_hash_mask __read_mostly;
304 static unsigned int             rt_hash_log  __read_mostly;
305
306 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
307 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
308
309 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
310                                    int genid)
311 {
312         return jhash_3words((__force u32)daddr, (__force u32)saddr,
313                             idx, genid)
314                 & rt_hash_mask;
315 }
316
317 static inline int rt_genid(struct net *net)
318 {
319         return atomic_read(&net->ipv4.rt_genid);
320 }
321
322 #ifdef CONFIG_PROC_FS
323 struct rt_cache_iter_state {
324         struct seq_net_private p;
325         int bucket;
326         int genid;
327 };
328
329 static struct rtable *rt_cache_get_first(struct seq_file *seq)
330 {
331         struct rt_cache_iter_state *st = seq->private;
332         struct rtable *r = NULL;
333
334         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
335                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
336                         continue;
337                 rcu_read_lock_bh();
338                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
339                 while (r) {
340                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
341                             r->rt_genid == st->genid)
342                                 return r;
343                         r = rcu_dereference_bh(r->dst.rt_next);
344                 }
345                 rcu_read_unlock_bh();
346         }
347         return r;
348 }
349
350 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
351                                           struct rtable *r)
352 {
353         struct rt_cache_iter_state *st = seq->private;
354
355         r = rcu_dereference_bh(r->dst.rt_next);
356         while (!r) {
357                 rcu_read_unlock_bh();
358                 do {
359                         if (--st->bucket < 0)
360                                 return NULL;
361                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
362                 rcu_read_lock_bh();
363                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
364         }
365         return r;
366 }
367
368 static struct rtable *rt_cache_get_next(struct seq_file *seq,
369                                         struct rtable *r)
370 {
371         struct rt_cache_iter_state *st = seq->private;
372         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
373                 if (dev_net(r->dst.dev) != seq_file_net(seq))
374                         continue;
375                 if (r->rt_genid == st->genid)
376                         break;
377         }
378         return r;
379 }
380
381 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
382 {
383         struct rtable *r = rt_cache_get_first(seq);
384
385         if (r)
386                 while (pos && (r = rt_cache_get_next(seq, r)))
387                         --pos;
388         return pos ? NULL : r;
389 }
390
391 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
392 {
393         struct rt_cache_iter_state *st = seq->private;
394         if (*pos)
395                 return rt_cache_get_idx(seq, *pos - 1);
396         st->genid = rt_genid(seq_file_net(seq));
397         return SEQ_START_TOKEN;
398 }
399
400 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
401 {
402         struct rtable *r;
403
404         if (v == SEQ_START_TOKEN)
405                 r = rt_cache_get_first(seq);
406         else
407                 r = rt_cache_get_next(seq, v);
408         ++*pos;
409         return r;
410 }
411
412 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
413 {
414         if (v && v != SEQ_START_TOKEN)
415                 rcu_read_unlock_bh();
416 }
417
418 static int rt_cache_seq_show(struct seq_file *seq, void *v)
419 {
420         if (v == SEQ_START_TOKEN)
421                 seq_printf(seq, "%-127s\n",
422                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
423                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
424                            "HHUptod\tSpecDst");
425         else {
426                 struct rtable *r = v;
427                 struct neighbour *n;
428                 int len, HHUptod;
429
430                 rcu_read_lock();
431                 n = dst_get_neighbour(&r->dst);
432                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
433                 rcu_read_unlock();
434
435                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
436                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
437                         r->dst.dev ? r->dst.dev->name : "*",
438                         (__force u32)r->rt_dst,
439                         (__force u32)r->rt_gateway,
440                         r->rt_flags, atomic_read(&r->dst.__refcnt),
441                         r->dst.__use, 0, (__force u32)r->rt_src,
442                         dst_metric_advmss(&r->dst) + 40,
443                         dst_metric(&r->dst, RTAX_WINDOW),
444                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
445                               dst_metric(&r->dst, RTAX_RTTVAR)),
446                         r->rt_key_tos,
447                         -1,
448                         HHUptod,
449                         r->rt_spec_dst, &len);
450
451                 seq_printf(seq, "%*s\n", 127 - len, "");
452         }
453         return 0;
454 }
455
456 static const struct seq_operations rt_cache_seq_ops = {
457         .start  = rt_cache_seq_start,
458         .next   = rt_cache_seq_next,
459         .stop   = rt_cache_seq_stop,
460         .show   = rt_cache_seq_show,
461 };
462
463 static int rt_cache_seq_open(struct inode *inode, struct file *file)
464 {
465         return seq_open_net(inode, file, &rt_cache_seq_ops,
466                         sizeof(struct rt_cache_iter_state));
467 }
468
469 static const struct file_operations rt_cache_seq_fops = {
470         .owner   = THIS_MODULE,
471         .open    = rt_cache_seq_open,
472         .read    = seq_read,
473         .llseek  = seq_lseek,
474         .release = seq_release_net,
475 };
476
477
478 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
479 {
480         int cpu;
481
482         if (*pos == 0)
483                 return SEQ_START_TOKEN;
484
485         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
486                 if (!cpu_possible(cpu))
487                         continue;
488                 *pos = cpu+1;
489                 return &per_cpu(rt_cache_stat, cpu);
490         }
491         return NULL;
492 }
493
494 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
495 {
496         int cpu;
497
498         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
499                 if (!cpu_possible(cpu))
500                         continue;
501                 *pos = cpu+1;
502                 return &per_cpu(rt_cache_stat, cpu);
503         }
504         return NULL;
505
506 }
507
508 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
509 {
510
511 }
512
513 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
514 {
515         struct rt_cache_stat *st = v;
516
517         if (v == SEQ_START_TOKEN) {
518                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
519                 return 0;
520         }
521
522         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
523                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
524                    dst_entries_get_slow(&ipv4_dst_ops),
525                    st->in_hit,
526                    st->in_slow_tot,
527                    st->in_slow_mc,
528                    st->in_no_route,
529                    st->in_brd,
530                    st->in_martian_dst,
531                    st->in_martian_src,
532
533                    st->out_hit,
534                    st->out_slow_tot,
535                    st->out_slow_mc,
536
537                    st->gc_total,
538                    st->gc_ignored,
539                    st->gc_goal_miss,
540                    st->gc_dst_overflow,
541                    st->in_hlist_search,
542                    st->out_hlist_search
543                 );
544         return 0;
545 }
546
547 static const struct seq_operations rt_cpu_seq_ops = {
548         .start  = rt_cpu_seq_start,
549         .next   = rt_cpu_seq_next,
550         .stop   = rt_cpu_seq_stop,
551         .show   = rt_cpu_seq_show,
552 };
553
554
555 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
556 {
557         return seq_open(file, &rt_cpu_seq_ops);
558 }
559
560 static const struct file_operations rt_cpu_seq_fops = {
561         .owner   = THIS_MODULE,
562         .open    = rt_cpu_seq_open,
563         .read    = seq_read,
564         .llseek  = seq_lseek,
565         .release = seq_release,
566 };
567
568 #ifdef CONFIG_IP_ROUTE_CLASSID
569 static int rt_acct_proc_show(struct seq_file *m, void *v)
570 {
571         struct ip_rt_acct *dst, *src;
572         unsigned int i, j;
573
574         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
575         if (!dst)
576                 return -ENOMEM;
577
578         for_each_possible_cpu(i) {
579                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
580                 for (j = 0; j < 256; j++) {
581                         dst[j].o_bytes   += src[j].o_bytes;
582                         dst[j].o_packets += src[j].o_packets;
583                         dst[j].i_bytes   += src[j].i_bytes;
584                         dst[j].i_packets += src[j].i_packets;
585                 }
586         }
587
588         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
589         kfree(dst);
590         return 0;
591 }
592
593 static int rt_acct_proc_open(struct inode *inode, struct file *file)
594 {
595         return single_open(file, rt_acct_proc_show, NULL);
596 }
597
598 static const struct file_operations rt_acct_proc_fops = {
599         .owner          = THIS_MODULE,
600         .open           = rt_acct_proc_open,
601         .read           = seq_read,
602         .llseek         = seq_lseek,
603         .release        = single_release,
604 };
605 #endif
606
607 static int __net_init ip_rt_do_proc_init(struct net *net)
608 {
609         struct proc_dir_entry *pde;
610
611         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
612                         &rt_cache_seq_fops);
613         if (!pde)
614                 goto err1;
615
616         pde = proc_create("rt_cache", S_IRUGO,
617                           net->proc_net_stat, &rt_cpu_seq_fops);
618         if (!pde)
619                 goto err2;
620
621 #ifdef CONFIG_IP_ROUTE_CLASSID
622         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
623         if (!pde)
624                 goto err3;
625 #endif
626         return 0;
627
628 #ifdef CONFIG_IP_ROUTE_CLASSID
629 err3:
630         remove_proc_entry("rt_cache", net->proc_net_stat);
631 #endif
632 err2:
633         remove_proc_entry("rt_cache", net->proc_net);
634 err1:
635         return -ENOMEM;
636 }
637
638 static void __net_exit ip_rt_do_proc_exit(struct net *net)
639 {
640         remove_proc_entry("rt_cache", net->proc_net_stat);
641         remove_proc_entry("rt_cache", net->proc_net);
642 #ifdef CONFIG_IP_ROUTE_CLASSID
643         remove_proc_entry("rt_acct", net->proc_net);
644 #endif
645 }
646
647 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
648         .init = ip_rt_do_proc_init,
649         .exit = ip_rt_do_proc_exit,
650 };
651
652 static int __init ip_rt_proc_init(void)
653 {
654         return register_pernet_subsys(&ip_rt_proc_ops);
655 }
656
657 #else
658 static inline int ip_rt_proc_init(void)
659 {
660         return 0;
661 }
662 #endif /* CONFIG_PROC_FS */
663
664 static inline void rt_free(struct rtable *rt)
665 {
666         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
667 }
668
669 static inline void rt_drop(struct rtable *rt)
670 {
671         ip_rt_put(rt);
672         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
673 }
674
675 static inline int rt_fast_clean(struct rtable *rth)
676 {
677         /* Kill broadcast/multicast entries very aggressively, if they
678            collide in the hash table with more useful entries */
679         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
680                 rt_is_input_route(rth) && rth->dst.rt_next;
681 }
682
683 static inline int rt_valuable(struct rtable *rth)
684 {
685         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
686                 (rth->peer && rth->peer->pmtu_expires);
687 }
688
689 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
690 {
691         unsigned long age;
692         int ret = 0;
693
694         if (atomic_read(&rth->dst.__refcnt))
695                 goto out;
696
697         age = jiffies - rth->dst.lastuse;
698         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
699             (age <= tmo2 && rt_valuable(rth)))
700                 goto out;
701         ret = 1;
702 out:    return ret;
703 }
704
705 /* Bits of score are:
706  * 31: very valuable
707  * 30: not quite useless
708  * 29..0: usage counter
709  */
710 static inline u32 rt_score(struct rtable *rt)
711 {
712         u32 score = jiffies - rt->dst.lastuse;
713
714         score = ~score & ~(3<<30);
715
716         if (rt_valuable(rt))
717                 score |= (1<<31);
718
719         if (rt_is_output_route(rt) ||
720             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
721                 score |= (1<<30);
722
723         return score;
724 }
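/*
 * Illustrative example: an idle broadcast entry that has not been used for a
 * long time keeps bits 31:30 clear and ends up with a small score, while a
 * recently used unicast output route scores high (bit 30 set, small age).
 * rt_intern_hash() evicts the lowest-scoring unreferenced entry when a
 * chain grows beyond ip_rt_gc_elasticity.
 */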
725
726 static inline bool rt_caching(const struct net *net)
727 {
728         return net->ipv4.current_rt_cache_rebuild_count <=
729                 net->ipv4.sysctl_rt_cache_rebuild_count;
730 }
731
732 static inline bool compare_hash_inputs(const struct rtable *rt1,
733                                        const struct rtable *rt2)
734 {
735         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
736                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
737                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
738 }
739
740 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
741 {
742         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
743                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
744                 (rt1->rt_mark ^ rt2->rt_mark) |
745                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
746                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
747                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
748 }
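/*
 * Note on the comparison style above: (a ^ b) is zero iff a == b, so OR-ing
 * the per-field XORs yields zero only when every field matches.  This lets
 * compare_hash_inputs() and compare_keys() test all keys with a single
 * branch instead of one branch per field.
 */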
749
750 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
751 {
752         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
753 }
754
755 static inline int rt_is_expired(struct rtable *rth)
756 {
757         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
758 }
759
760 /*
761  * Perform a full scan of hash table and free all entries.
762  * Can be called by a softirq or a process.
763  * In the later case, we want to be reschedule if necessary
764  */
765 static void rt_do_flush(struct net *net, int process_context)
766 {
767         unsigned int i;
768         struct rtable *rth, *next;
769
770         for (i = 0; i <= rt_hash_mask; i++) {
771                 struct rtable __rcu **pprev;
772                 struct rtable *list;
773
774                 if (process_context && need_resched())
775                         cond_resched();
776                 rth = rcu_access_pointer(rt_hash_table[i].chain);
777                 if (!rth)
778                         continue;
779
780                 spin_lock_bh(rt_hash_lock_addr(i));
781
782                 list = NULL;
783                 pprev = &rt_hash_table[i].chain;
784                 rth = rcu_dereference_protected(*pprev,
785                         lockdep_is_held(rt_hash_lock_addr(i)));
786
787                 while (rth) {
788                         next = rcu_dereference_protected(rth->dst.rt_next,
789                                 lockdep_is_held(rt_hash_lock_addr(i)));
790
791                         if (!net ||
792                             net_eq(dev_net(rth->dst.dev), net)) {
793                                 rcu_assign_pointer(*pprev, next);
794                                 rcu_assign_pointer(rth->dst.rt_next, list);
795                                 list = rth;
796                         } else {
797                                 pprev = &rth->dst.rt_next;
798                         }
799                         rth = next;
800                 }
801
802                 spin_unlock_bh(rt_hash_lock_addr(i));
803
804                 for (; list; list = next) {
805                         next = rcu_dereference_protected(list->dst.rt_next, 1);
806                         rt_free(list);
807                 }
808         }
809 }
810
811 /*
812  * While freeing expired entries, we compute average chain length
813  * and standard deviation, using fixed-point arithmetic.
814  * This is to get an estimation of rt_chain_length_max:
815  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
816  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
817  */
818
819 #define FRACT_BITS 3
820 #define ONE (1UL << FRACT_BITS)
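/*
 * Worked example (illustrative): has_noalias() contributes ONE (= 8, i.e. 1.0
 * in this fixed-point format) per entry with distinct hash inputs.  If every
 * scanned bucket holds two such entries, each chain length is 2 * ONE = 16,
 * so avg = 16 and sd = 0 in rt_check_expire(), (avg + 4*sd) >> FRACT_BITS = 2,
 * and rt_chain_length_max stays at max(ip_rt_gc_elasticity, 2) = 8 with the
 * default elasticity.
 */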
821
822 /*
823  * Given a hash chain and an item in this hash chain,
824  * find if a previous entry has the same hash_inputs
825  * (but differs on tos, mark or oif)
826  * Returns 0 if an alias is found.
827  * Returns ONE if rth has no alias before itself.
828  */
829 static int has_noalias(const struct rtable *head, const struct rtable *rth)
830 {
831         const struct rtable *aux = head;
832
833         while (aux != rth) {
834                 if (compare_hash_inputs(aux, rth))
835                         return 0;
836                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
837         }
838         return ONE;
839 }
840
841 static void rt_check_expire(void)
842 {
843         static unsigned int rover;
844         unsigned int i = rover, goal;
845         struct rtable *rth;
846         struct rtable __rcu **rthp;
847         unsigned long samples = 0;
848         unsigned long sum = 0, sum2 = 0;
849         unsigned long delta;
850         u64 mult;
851
852         delta = jiffies - expires_ljiffies;
853         expires_ljiffies = jiffies;
854         mult = ((u64)delta) << rt_hash_log;
855         if (ip_rt_gc_timeout > 1)
856                 do_div(mult, ip_rt_gc_timeout);
857         goal = (unsigned int)mult;
858         if (goal > rt_hash_mask)
859                 goal = rt_hash_mask + 1;
860         for (; goal > 0; goal--) {
861                 unsigned long tmo = ip_rt_gc_timeout;
862                 unsigned long length;
863
864                 i = (i + 1) & rt_hash_mask;
865                 rthp = &rt_hash_table[i].chain;
866
867                 if (need_resched())
868                         cond_resched();
869
870                 samples++;
871
872                 if (rcu_dereference_raw(*rthp) == NULL)
873                         continue;
874                 length = 0;
875                 spin_lock_bh(rt_hash_lock_addr(i));
876                 while ((rth = rcu_dereference_protected(*rthp,
877                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
878                         prefetch(rth->dst.rt_next);
879                         if (rt_is_expired(rth)) {
880                                 *rthp = rth->dst.rt_next;
881                                 rt_free(rth);
882                                 continue;
883                         }
884                         if (rth->dst.expires) {
885                                 /* Entry is expired even if it is in use */
886                                 if (time_before_eq(jiffies, rth->dst.expires)) {
887 nofree:
888                                         tmo >>= 1;
889                                         rthp = &rth->dst.rt_next;
890                                         /*
891                                          * We only count entries on
892                                          * a chain with equal hash inputs once
893                                          * so that entries for different QOS
894                                          * levels, and other non-hash input
895                                          * attributes don't unfairly skew
896                                          * the length computation
897                                          */
898                                         length += has_noalias(rt_hash_table[i].chain, rth);
899                                         continue;
900                                 }
901                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
902                                 goto nofree;
903
904                         /* Cleanup aged off entries. */
905                         *rthp = rth->dst.rt_next;
906                         rt_free(rth);
907                 }
908                 spin_unlock_bh(rt_hash_lock_addr(i));
909                 sum += length;
910                 sum2 += length*length;
911         }
912         if (samples) {
913                 unsigned long avg = sum / samples;
914                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
915                 rt_chain_length_max = max_t(unsigned long,
916                                         ip_rt_gc_elasticity,
917                                         (avg + 4*sd) >> FRACT_BITS);
918         }
919         rover = i;
920 }
921
922 /*
923  * rt_worker_func() is run in process context.
924  * We call rt_check_expire() to scan part of the hash table.
925  */
926 static void rt_worker_func(struct work_struct *work)
927 {
928         rt_check_expire();
929         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
930 }
931
932 /*
933  * Perturbation of rt_genid by a small quantity [1..256]
934  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
935  * many times (2^24) without reusing a recent rt_genid.
936  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
937  */
938 static void rt_cache_invalidate(struct net *net)
939 {
940         unsigned char shuffle;
941
942         get_random_bytes(&shuffle, sizeof(shuffle));
943         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
944         redirect_genid++;
945         inetpeer_invalidate_tree(AF_INET);
946 }
947
948 /*
949  * delay < 0  : invalidate cache (fast : entries will be deleted later)
950  * delay >= 0 : invalidate & flush cache (can be long)
951  */
952 void rt_cache_flush(struct net *net, int delay)
953 {
954         rt_cache_invalidate(net);
955         if (delay >= 0)
956                 rt_do_flush(net, !in_softirq());
957 }
958
959 /* Flush previous cache invalidated entries from the cache */
960 void rt_cache_flush_batch(struct net *net)
961 {
962         rt_do_flush(net, !in_softirq());
963 }
964
965 static void rt_emergency_hash_rebuild(struct net *net)
966 {
967         if (net_ratelimit())
968                 printk(KERN_WARNING "Route hash chain too long!\n");
969         rt_cache_invalidate(net);
970 }
971
972 /*
973    Short description of GC goals.
974
975    We want to build an algorithm which keeps the routing cache
976    at some equilibrium point, where the number of aged-off entries
977    is kept approximately equal to the number of newly generated ones.
978
979    The current expiration strength is the variable "expire".
980    We try to adjust it dynamically, so that when networking
981    is idle "expire" is large enough to keep enough warm entries,
982    and when load increases it shrinks to limit the cache size.
983  */
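/*
 * Worked example (illustrative): __do_rt_garbage_collect() first computes
 * goal = entries - (elasticity << rt_hash_log).  With a 1024-bucket hash
 * (rt_hash_log = 10) and the default elasticity of 8, aggressive trimming
 * only starts once the cache exceeds 8192 entries; below that, the goal is
 * derived from the gc_thresh based equilibrium instead.
 */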
984
985 static void __do_rt_garbage_collect(int elasticity, int min_interval)
986 {
987         static unsigned long expire = RT_GC_TIMEOUT;
988         static unsigned long last_gc;
989         static int rover;
990         static int equilibrium;
991         struct rtable *rth;
992         struct rtable __rcu **rthp;
993         unsigned long now = jiffies;
994         int goal;
995         int entries = dst_entries_get_fast(&ipv4_dst_ops);
996
997         /*
998          * Garbage collection is pretty expensive,
999          * do not make it too frequently.
1000          */
1001
1002         RT_CACHE_STAT_INC(gc_total);
1003
1004         if (now - last_gc < min_interval &&
1005             entries < ip_rt_max_size) {
1006                 RT_CACHE_STAT_INC(gc_ignored);
1007                 goto out;
1008         }
1009
1010         entries = dst_entries_get_slow(&ipv4_dst_ops);
1011         /* Calculate the number of entries which we want to expire now. */
1012         goal = entries - (elasticity << rt_hash_log);
1013         if (goal <= 0) {
1014                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1015                         equilibrium = ipv4_dst_ops.gc_thresh;
1016                 goal = entries - equilibrium;
1017                 if (goal > 0) {
1018                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1019                         goal = entries - equilibrium;
1020                 }
1021         } else {
1022                 /* We are in a dangerous area. Try to reduce the cache really
1023                  * aggressively.
1024                  */
1025                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1026                 equilibrium = entries - goal;
1027         }
1028
1029         if (now - last_gc >= min_interval)
1030                 last_gc = now;
1031
1032         if (goal <= 0) {
1033                 equilibrium += goal;
1034                 goto work_done;
1035         }
1036
1037         do {
1038                 int i, k;
1039
1040                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1041                         unsigned long tmo = expire;
1042
1043                         k = (k + 1) & rt_hash_mask;
1044                         rthp = &rt_hash_table[k].chain;
1045                         spin_lock_bh(rt_hash_lock_addr(k));
1046                         while ((rth = rcu_dereference_protected(*rthp,
1047                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1048                                 if (!rt_is_expired(rth) &&
1049                                         !rt_may_expire(rth, tmo, expire)) {
1050                                         tmo >>= 1;
1051                                         rthp = &rth->dst.rt_next;
1052                                         continue;
1053                                 }
1054                                 *rthp = rth->dst.rt_next;
1055                                 rt_free(rth);
1056                                 goal--;
1057                         }
1058                         spin_unlock_bh(rt_hash_lock_addr(k));
1059                         if (goal <= 0)
1060                                 break;
1061                 }
1062                 rover = k;
1063
1064                 if (goal <= 0)
1065                         goto work_done;
1066
1067                 /* The goal was not achieved. We stop the process if:
1068
1069                    - expire has been reduced to zero (otherwise expire is halved),
1070                    - the table is not full,
1071                    - we are called from interrupt context.
1072                    The jiffies check is just a fallback/debug loop breaker;
1073                      we will not spin here for a long time in any case.
1074                  */
1075
1076                 RT_CACHE_STAT_INC(gc_goal_miss);
1077
1078                 if (expire == 0)
1079                         break;
1080
1081                 expire >>= 1;
1082
1083                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1084                         goto out;
1085         } while (!in_softirq() && time_before_eq(jiffies, now));
1086
1087         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1088                 goto out;
1089         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1090                 goto out;
1091         if (net_ratelimit())
1092                 printk(KERN_WARNING "dst cache overflow\n");
1093         RT_CACHE_STAT_INC(gc_dst_overflow);
1094         return;
1095
1096 work_done:
1097         expire += min_interval;
1098         if (expire > ip_rt_gc_timeout ||
1099             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1100             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1101                 expire = ip_rt_gc_timeout;
1102 out:    return;
1103 }
1104
1105 static void __rt_garbage_collect(struct work_struct *w)
1106 {
1107         __do_rt_garbage_collect(ip_rt_gc_elasticity, ip_rt_gc_min_interval);
1108 }
1109
1110 static int rt_garbage_collect(struct dst_ops *ops)
1111 {
1112         if (!work_pending(&rt_gc_worker))
1113                 schedule_work(&rt_gc_worker);
1114
1115         if (dst_entries_get_fast(&ipv4_dst_ops) >= ip_rt_max_size ||
1116             dst_entries_get_slow(&ipv4_dst_ops) >= ip_rt_max_size) {
1117                 RT_CACHE_STAT_INC(gc_dst_overflow);
1118                 return 1;
1119         }
1120         return 0;
1121 }
1122
1123 /*
1124  * Returns the number of entries in a hash chain that have different hash_inputs
1125  */
1126 static int slow_chain_length(const struct rtable *head)
1127 {
1128         int length = 0;
1129         const struct rtable *rth = head;
1130
1131         while (rth) {
1132                 length += has_noalias(head, rth);
1133                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1134         }
1135         return length >> FRACT_BITS;
1136 }
1137
1138 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1139 {
1140         struct neigh_table *tbl = &arp_tbl;
1141         static const __be32 inaddr_any = 0;
1142         struct net_device *dev = dst->dev;
1143         const __be32 *pkey = daddr;
1144         struct neighbour *n;
1145
1146 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1147         if (dev->type == ARPHRD_ATM)
1148                 tbl = clip_tbl_hook;
1149 #endif
1150         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1151                 pkey = &inaddr_any;
1152
1153         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1154         if (n)
1155                 return n;
1156         return neigh_create(tbl, pkey, dev);
1157 }
1158
1159 static int rt_bind_neighbour(struct rtable *rt)
1160 {
1161         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1162         if (IS_ERR(n))
1163                 return PTR_ERR(n);
1164         dst_set_neighbour(&rt->dst, n);
1165
1166         return 0;
1167 }
1168
1169 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1170                                      struct sk_buff *skb, int ifindex)
1171 {
1172         struct rtable   *rth, *cand;
1173         struct rtable __rcu **rthp, **candp;
1174         unsigned long   now;
1175         u32             min_score;
1176         int             chain_length;
1177         int attempts = !in_softirq();
1178
1179 restart:
1180         chain_length = 0;
1181         min_score = ~(u32)0;
1182         cand = NULL;
1183         candp = NULL;
1184         now = jiffies;
1185
1186         if (!rt_caching(dev_net(rt->dst.dev))) {
1187                 /*
1188                  * If we're not caching, just tell the caller we
1189                  * were successful and don't touch the route.  The
1190                  * caller holds the sole reference to the cache entry, and
1191                  * it will be released when the caller is done with it.
1192                  * If we drop it here, the callers have no way to resolve routes
1193                  * when we're not caching.  Instead, just point *rp at rt, so
1194                  * the caller gets a single use out of the route
1195                  * Note that we do rt_free on this new route entry, so that
1196                  * once its refcount hits zero, we are still able to reap it
1197                  * (Thanks Alexey)
1198                  * Note: To avoid expensive rcu stuff for this uncached dst,
1199                  * we set DST_NOCACHE so that dst_release() can free dst without
1200                  * waiting a grace period.
1201                  */
1202
1203                 rt->dst.flags |= DST_NOCACHE;
1204                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1205                         int err = rt_bind_neighbour(rt);
1206                         if (err) {
1207                                 if (net_ratelimit())
1208                                         printk(KERN_WARNING
1209                                             "Neighbour table failure & not caching routes.\n");
1210                                 ip_rt_put(rt);
1211                                 return ERR_PTR(err);
1212                         }
1213                 }
1214
1215                 goto skip_hashing;
1216         }
1217
1218         rthp = &rt_hash_table[hash].chain;
1219
1220         spin_lock_bh(rt_hash_lock_addr(hash));
1221         while ((rth = rcu_dereference_protected(*rthp,
1222                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1223                 if (rt_is_expired(rth)) {
1224                         *rthp = rth->dst.rt_next;
1225                         rt_free(rth);
1226                         continue;
1227                 }
1228                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1229                         /* Put it first */
1230                         *rthp = rth->dst.rt_next;
1231                         /*
1232                          * Since lookup is lockfree, the deletion
1233                          * must be visible to another weakly ordered CPU before
1234                          * the insertion at the start of the hash chain.
1235                          */
1236                         rcu_assign_pointer(rth->dst.rt_next,
1237                                            rt_hash_table[hash].chain);
1238                         /*
1239                          * Since lookup is lockfree, the update writes
1240                          * must be ordered for consistency on SMP.
1241                          */
1242                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1243
1244                         dst_use(&rth->dst, now);
1245                         spin_unlock_bh(rt_hash_lock_addr(hash));
1246
1247                         rt_drop(rt);
1248                         if (skb)
1249                                 skb_dst_set(skb, &rth->dst);
1250                         return rth;
1251                 }
1252
1253                 if (!atomic_read(&rth->dst.__refcnt)) {
1254                         u32 score = rt_score(rth);
1255
1256                         if (score <= min_score) {
1257                                 cand = rth;
1258                                 candp = rthp;
1259                                 min_score = score;
1260                         }
1261                 }
1262
1263                 chain_length++;
1264
1265                 rthp = &rth->dst.rt_next;
1266         }
1267
1268         if (cand) {
1269                 /* ip_rt_gc_elasticity used to be the average chain
1270                  * length; when it is exceeded, gc becomes really aggressive.
1271                  *
1272                  * The second limit is less certain. At the moment it allows
1273                  * only 2 entries per bucket. We will see.
1274                  */
1275                 if (chain_length > ip_rt_gc_elasticity) {
1276                         *candp = cand->dst.rt_next;
1277                         rt_free(cand);
1278                 }
1279         } else {
1280                 if (chain_length > rt_chain_length_max &&
1281                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1282                         struct net *net = dev_net(rt->dst.dev);
1283                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1284                         if (!rt_caching(net)) {
1285                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1286                                         rt->dst.dev->name, num);
1287                         }
1288                         rt_emergency_hash_rebuild(net);
1289                         spin_unlock_bh(rt_hash_lock_addr(hash));
1290
1291                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1292                                         ifindex, rt_genid(net));
1293                         goto restart;
1294                 }
1295         }
1296
1297         /* Try to bind the route to ARP only if it is an output
1298            route or on the unicast forwarding path.
1299          */
1300         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1301                 int err = rt_bind_neighbour(rt);
1302                 if (err) {
1303                         spin_unlock_bh(rt_hash_lock_addr(hash));
1304
1305                         if (err != -ENOBUFS) {
1306                                 rt_drop(rt);
1307                                 return ERR_PTR(err);
1308                         }
1309
1310                         /* Neighbour tables are full and nothing
1311                            can be released. Try to shrink the route cache;
1312                            it most likely holds some neighbour records.
1313                          */
1314                         if (attempts-- > 0) {
1315                                 __do_rt_garbage_collect(1, 0);
1316                                 goto restart;
1317                         }
1318
1319                         if (net_ratelimit())
1320                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1321                         rt_drop(rt);
1322                         return ERR_PTR(-ENOBUFS);
1323                 }
1324         }
1325
1326         rt->dst.rt_next = rt_hash_table[hash].chain;
1327
1328         /*
1329          * Since lookup is lockfree, we must make sure
1330          * previous writes to rt are committed to memory
1331          * before making rt visible to other CPUs.
1332          */
1333         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1334
1335         spin_unlock_bh(rt_hash_lock_addr(hash));
1336
1337 skip_hashing:
1338         if (skb)
1339                 skb_dst_set(skb, &rt->dst);
1340         return rt;
1341 }
1342
1343 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1344
1345 static u32 rt_peer_genid(void)
1346 {
1347         return atomic_read(&__rt_peer_genid);
1348 }
1349
1350 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1351 {
1352         struct inet_peer *peer;
1353
1354         peer = inet_getpeer_v4(daddr, create);
1355
1356         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1357                 inet_putpeer(peer);
1358         else
1359                 rt->rt_peer_genid = rt_peer_genid();
1360 }
1361
1362 #define IP_IDENTS_SZ 2048u
1363 struct ip_ident_bucket {
1364         atomic_t        id;
1365         u32             stamp32;
1366 };
1367
1368 static struct ip_ident_bucket *ip_idents __read_mostly;
1369
1370 /* In order to protect privacy, we add a perturbation to identifiers
1371  * if one generator is seldom used. This makes it hard for an attacker
1372  * to infer how many packets were sent between two points in time.
1373  */
1374 u32 ip_idents_reserve(u32 hash, int segs)
1375 {
1376         struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
1377         u32 old = ACCESS_ONCE(bucket->stamp32);
1378         u32 now = (u32)jiffies;
1379         u32 delta = 0;
1380
1381         if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
1382                 u64 x = random32();
1383
1384                 x *= (now - old);
1385                 delta = (u32)(x >> 32);
1386         }
1387
1388         return atomic_add_return(segs + delta, &bucket->id) - segs;
1389 }
1390 EXPORT_SYMBOL(ip_idents_reserve);
1391
1392 void __ip_select_ident(struct iphdr *iph, int segs)
1393 {
1394         static u32 ip_idents_hashrnd __read_mostly;
1395         static bool hashrnd_initialized = false;
1396         u32 hash, id;
1397
1398         if (unlikely(!hashrnd_initialized)) {
1399                 hashrnd_initialized = true;
1400                 get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
1401         }
1402
1403         hash = jhash_3words((__force u32)iph->daddr,
1404                             (__force u32)iph->saddr,
1405                             iph->protocol,
1406                             ip_idents_hashrnd);
1407         id = ip_idents_reserve(hash, segs);
1408         iph->id = htons(id);
1409 }
1410 EXPORT_SYMBOL(__ip_select_ident);
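/*
 * Usage sketch (illustrative, not a call site in this file): a sender that
 * has just built an IPv4 header could reserve IDs for all GSO segments at
 * once with something like:
 *
 *	__ip_select_ident(ip_hdr(skb), skb_shinfo(skb)->gso_segs ?: 1);
 */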
1411
1412 static void rt_del(unsigned hash, struct rtable *rt)
1413 {
1414         struct rtable __rcu **rthp;
1415         struct rtable *aux;
1416
1417         rthp = &rt_hash_table[hash].chain;
1418         spin_lock_bh(rt_hash_lock_addr(hash));
1419         ip_rt_put(rt);
1420         while ((aux = rcu_dereference_protected(*rthp,
1421                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1422                 if (aux == rt || rt_is_expired(aux)) {
1423                         *rthp = aux->dst.rt_next;
1424                         rt_free(aux);
1425                         continue;
1426                 }
1427                 rthp = &aux->dst.rt_next;
1428         }
1429         spin_unlock_bh(rt_hash_lock_addr(hash));
1430 }
1431
1432 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1433 {
1434         struct rtable *rt = (struct rtable *) dst;
1435         __be32 orig_gw = rt->rt_gateway;
1436         struct neighbour *n, *old_n;
1437
1438         dst_confirm(&rt->dst);
1439
1440         rt->rt_gateway = peer->redirect_learned.a4;
1441
1442         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1443         if (IS_ERR(n)) {
1444                 rt->rt_gateway = orig_gw;
1445                 return;
1446         }
1447         old_n = xchg(&rt->dst._neighbour, n);
1448         if (old_n)
1449                 neigh_release(old_n);
1450         if (!(n->nud_state & NUD_VALID)) {
1451                 neigh_event_send(n, NULL);
1452         } else {
1453                 rt->rt_flags |= RTCF_REDIRECTED;
1454                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1455         }
1456 }
1457
1458 /* called in rcu_read_lock() section */
1459 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1460                     __be32 saddr, struct net_device *dev)
1461 {
1462         int s, i;
1463         struct in_device *in_dev = __in_dev_get_rcu(dev);
1464         __be32 skeys[2] = { saddr, 0 };
1465         int    ikeys[2] = { dev->ifindex, 0 };
1466         struct inet_peer *peer;
1467         struct net *net;
1468
1469         if (!in_dev)
1470                 return;
1471
1472         net = dev_net(dev);
1473         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1474             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1475             ipv4_is_zeronet(new_gw))
1476                 goto reject_redirect;
1477
1478         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1479                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1480                         goto reject_redirect;
1481                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1482                         goto reject_redirect;
1483         } else {
1484                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1485                         goto reject_redirect;
1486         }
1487
1488         for (s = 0; s < 2; s++) {
1489                 for (i = 0; i < 2; i++) {
1490                         unsigned int hash;
1491                         struct rtable __rcu **rthp;
1492                         struct rtable *rt;
1493
1494                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1495
1496                         rthp = &rt_hash_table[hash].chain;
1497
1498                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1499                                 rthp = &rt->dst.rt_next;
1500
1501                                 if (rt->rt_key_dst != daddr ||
1502                                     rt->rt_key_src != skeys[s] ||
1503                                     rt->rt_oif != ikeys[i] ||
1504                                     rt_is_input_route(rt) ||
1505                                     rt_is_expired(rt) ||
1506                                     !net_eq(dev_net(rt->dst.dev), net) ||
1507                                     rt->dst.error ||
1508                                     rt->dst.dev != dev ||
1509                                     rt->rt_gateway != old_gw)
1510                                         continue;
1511
1512                                 if (!rt->peer)
1513                                         rt_bind_peer(rt, rt->rt_dst, 1);
1514
1515                                 peer = rt->peer;
1516                                 if (peer) {
1517                                         if (peer->redirect_learned.a4 != new_gw ||
1518                                             peer->redirect_genid != redirect_genid) {
1519                                                 peer->redirect_learned.a4 = new_gw;
1520                                                 peer->redirect_genid = redirect_genid;
1521                                                 atomic_inc(&__rt_peer_genid);
1522                                         }
1523                                         check_peer_redir(&rt->dst, peer);
1524                                 }
1525                         }
1526                 }
1527         }
1528         return;
1529
1530 reject_redirect:
1531 #ifdef CONFIG_IP_ROUTE_VERBOSE
1532         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1533                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1534                         "  Advised path = %pI4 -> %pI4\n",
1535                        &old_gw, dev->name, &new_gw,
1536                        &saddr, &daddr);
1537 #endif
1538         ;
1539 }
1540
1541 static bool peer_pmtu_expired(struct inet_peer *peer)
1542 {
1543         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1544
1545         return orig &&
1546                time_after_eq(jiffies, orig) &&
1547                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1548 }
1549
1550 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1551 {
1552         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1553
1554         return orig &&
1555                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1556 }
1557
1558 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1559 {
1560         struct rtable *rt = (struct rtable *)dst;
1561         struct dst_entry *ret = dst;
1562
1563         if (rt) {
1564                 if (dst->obsolete > 0) {
1565                         ip_rt_put(rt);
1566                         ret = NULL;
1567                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1568                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1569                                                 rt->rt_oif,
1570                                                 rt_genid(dev_net(dst->dev)));
1571                         rt_del(hash, rt);
1572                         ret = NULL;
1573                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1574                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1575                 }
1576         }
1577         return ret;
1578 }
1579
1580 /*
1581  * Algorithm:
1582  *      1. The first ip_rt_redirect_number redirects are sent
1583  *         with exponential backoff, then we stop sending them altogether,
1584  *         assuming that the host ignores our redirects.
1585  *      2. If we did not see packets requiring redirects
1586  *         during ip_rt_redirect_silence, we assume that the host
1587  *         forgot the redirected route, and start sending redirects again.
1588  *
1589  * This algorithm is much cheaper and more intelligent than dumb load limiting
1590  * in icmp.c.
1591  *
1592  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1593  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1594  */
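/*
 * Illustrative sketch (editorial addition, not part of the build): the shape
 * of the backoff test used by ip_rt_send_redirect() below.  The helper name
 * and the bare "unsigned long"/"u32" parameters are hypothetical; only the
 * use of ip_rt_redirect_number, ip_rt_redirect_load and the shift by
 * rate_tokens mirrors the real code.
 */
#if 0
static bool redirect_allowed_sketch(unsigned long now, unsigned long rate_last,
				    u32 rate_tokens)
{
	/* After ip_rt_redirect_number ignored redirects, give up entirely. */
	if (rate_tokens >= ip_rt_redirect_number)
		return false;

	/* Otherwise the gap doubles with every redirect already sent:
	 * load, 2*load, 4*load, ... where load == ip_rt_redirect_load.
	 */
	return rate_tokens == 0 ||
	       time_after(now, rate_last +
			       (ip_rt_redirect_load << rate_tokens));
}
#endif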
1595
1596 void ip_rt_send_redirect(struct sk_buff *skb)
1597 {
1598         struct rtable *rt = skb_rtable(skb);
1599         struct in_device *in_dev;
1600         struct inet_peer *peer;
1601         int log_martians;
1602
1603         rcu_read_lock();
1604         in_dev = __in_dev_get_rcu(rt->dst.dev);
1605         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1606                 rcu_read_unlock();
1607                 return;
1608         }
1609         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1610         rcu_read_unlock();
1611
1612         if (!rt->peer)
1613                 rt_bind_peer(rt, rt->rt_dst, 1);
1614         peer = rt->peer;
1615         if (!peer) {
1616                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1617                 return;
1618         }
1619
1620         /* No redirected packets during ip_rt_redirect_silence;
1621          * reset the algorithm.
1622          */
1623         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1624                 peer->rate_tokens = 0;
1625
1626         /* Too many ignored redirects; do not send anything.
1627          * Just set peer->rate_last to mark the last redirect-worthy packet we saw.
1628          */
1629         if (peer->rate_tokens >= ip_rt_redirect_number) {
1630                 peer->rate_last = jiffies;
1631                 return;
1632         }
1633
1634         /* Check for load limit; set rate_last to the latest sent
1635          * redirect.
1636          */
1637         if (peer->rate_tokens == 0 ||
1638             time_after(jiffies,
1639                        (peer->rate_last +
1640                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1641                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1642                 peer->rate_last = jiffies;
1643                 ++peer->rate_tokens;
1644 #ifdef CONFIG_IP_ROUTE_VERBOSE
1645                 if (log_martians &&
1646                     peer->rate_tokens == ip_rt_redirect_number &&
1647                     net_ratelimit())
1648                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1649                                &ip_hdr(skb)->saddr, rt->rt_iif,
1650                                 &rt->rt_dst, &rt->rt_gateway);
1651 #endif
1652         }
1653 }
1654
1655 static int ip_error(struct sk_buff *skb)
1656 {
1657         struct rtable *rt = skb_rtable(skb);
1658         struct inet_peer *peer;
1659         unsigned long now;
1660         bool send;
1661         int code;
1662
1663         switch (rt->dst.error) {
1664         case EINVAL:
1665         default:
1666                 goto out;
1667         case EHOSTUNREACH:
1668                 code = ICMP_HOST_UNREACH;
1669                 break;
1670         case ENETUNREACH:
1671                 code = ICMP_NET_UNREACH;
1672                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1673                                 IPSTATS_MIB_INNOROUTES);
1674                 break;
1675         case EACCES:
1676                 code = ICMP_PKT_FILTERED;
1677                 break;
1678         }
1679
1680         if (!rt->peer)
1681                 rt_bind_peer(rt, rt->rt_dst, 1);
1682         peer = rt->peer;
1683
1684         send = true;
1685         if (peer) {
1686                 now = jiffies;
1687                 peer->rate_tokens += now - peer->rate_last;
1688                 if (peer->rate_tokens > ip_rt_error_burst)
1689                         peer->rate_tokens = ip_rt_error_burst;
1690                 peer->rate_last = now;
1691                 if (peer->rate_tokens >= ip_rt_error_cost)
1692                         peer->rate_tokens -= ip_rt_error_cost;
1693                 else
1694                         send = false;
1695         }
1696         if (send)
1697                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1698
1699 out:    kfree_skb(skb);
1700         return 0;
1701 }
1702
1703 /*
1704  *      The last two values are not from the RFC but
1705  *      are needed for AMPRnet AX.25 paths.
1706  */
1707
1708 static const unsigned short mtu_plateau[] =
1709 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1710
1711 static inline unsigned short guess_mtu(unsigned short old_mtu)
1712 {
1713         int i;
1714
1715         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1716                 if (old_mtu > mtu_plateau[i])
1717                         return mtu_plateau[i];
1718         return 68;
1719 }
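/*
 * Illustrative sketch (editorial addition, not part of the build): what the
 * plateau walk above returns for a few sample inputs.  The values are worked
 * examples only.
 */
#if 0
static void guess_mtu_sketch(void)
{
	unsigned short a = guess_mtu(1500);	/* -> 1492, first plateau below 1500 */
	unsigned short b = guess_mtu(1492);	/* -> 576 */
	unsigned short c = guess_mtu(296);	/* -> 216 */
	unsigned short d = guess_mtu(100);	/* -> 68, the floor */

	(void)a; (void)b; (void)c; (void)d;
}
#endif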
1720
1721 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1722                                  unsigned short new_mtu,
1723                                  struct net_device *dev)
1724 {
1725         unsigned short old_mtu = ntohs(iph->tot_len);
1726         unsigned short est_mtu = 0;
1727         struct inet_peer *peer;
1728
1729         peer = inet_getpeer_v4(iph->daddr, 1);
1730         if (peer) {
1731                 unsigned short mtu = new_mtu;
1732
1733                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1734                         /* BSD 4.2 derived systems incorrectly adjust
1735                          * tot_len by the IP header length, and report
1736                          * a zero MTU in the ICMP message.
1737                          */
1738                         if (mtu == 0 &&
1739                             old_mtu >= 68 + (iph->ihl << 2))
1740                                 old_mtu -= iph->ihl << 2;
1741                         mtu = guess_mtu(old_mtu);
1742                 }
1743
1744                 if (mtu < ip_rt_min_pmtu)
1745                         mtu = ip_rt_min_pmtu;
1746                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1747                         unsigned long pmtu_expires;
1748
1749                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1750                         if (!pmtu_expires)
1751                                 pmtu_expires = 1UL;
1752
1753                         est_mtu = mtu;
1754                         peer->pmtu_learned = mtu;
1755                         peer->pmtu_expires = pmtu_expires;
1756                         atomic_inc(&__rt_peer_genid);
1757                 }
1758
1759                 inet_putpeer(peer);
1760         }
1761         return est_mtu ? : new_mtu;
1762 }
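/*
 * Illustrative sketch (editorial addition, not part of the build): the
 * zero-MTU workaround inside ip_rt_frag_needed() above.  When a BSD 4.2
 * derived host reports mtu == 0, the code compensates by subtracting the IP
 * header length from the echoed tot_len and guessing a plateau from the
 * result.  The numbers here are a made-up worked example.
 */
#if 0
static unsigned short frag_needed_sketch(void)
{
	unsigned short old_mtu = 1500;	/* ntohs(iph->tot_len) echoed in the ICMP error */
	unsigned short new_mtu = 0;	/* bogus next-hop MTU reported by the broken host */
	unsigned int ihl = 5;		/* 20-byte IP header */

	if (new_mtu == 0 && old_mtu >= 68 + (ihl << 2))
		old_mtu -= ihl << 2;	/* 1500 - 20 = 1480 */

	return guess_mtu(old_mtu);	/* first plateau below 1480 -> 576 */
}
#endif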
1763
1764 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1765 {
1766         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1767
1768         if (!expires)
1769                 return;
1770         if (time_before(jiffies, expires)) {
1771                 u32 orig_dst_mtu = dst_mtu(dst);
1772                 if (peer->pmtu_learned < orig_dst_mtu) {
1773                         if (!peer->pmtu_orig)
1774                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1775                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1776                 }
1777         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1778                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1779 }
1780
1781 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1782 {
1783         struct rtable *rt = (struct rtable *) dst;
1784         struct inet_peer *peer;
1785
1786         dst_confirm(dst);
1787
1788         if (!rt->peer)
1789                 rt_bind_peer(rt, rt->rt_dst, 1);
1790         peer = rt->peer;
1791         if (peer) {
1792                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1793
1794                 if (mtu < ip_rt_min_pmtu)
1795                         mtu = ip_rt_min_pmtu;
1796                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1797
1798                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1799                         if (!pmtu_expires)
1800                                 pmtu_expires = 1UL;
1801
1802                         peer->pmtu_learned = mtu;
1803                         peer->pmtu_expires = pmtu_expires;
1804
1805                         atomic_inc(&__rt_peer_genid);
1806                         rt->rt_peer_genid = rt_peer_genid();
1807                 }
1808                 check_peer_pmtu(dst, peer);
1809         }
1810 }
1811
1812
1813 static void ipv4_validate_peer(struct rtable *rt)
1814 {
1815         if (rt->rt_peer_genid != rt_peer_genid()) {
1816                 struct inet_peer *peer;
1817
1818                 if (!rt->peer)
1819                         rt_bind_peer(rt, rt->rt_dst, 0);
1820
1821                 peer = rt->peer;
1822                 if (peer) {
1823                         check_peer_pmtu(&rt->dst, peer);
1824
1825                         if (peer->redirect_genid != redirect_genid)
1826                                 peer->redirect_learned.a4 = 0;
1827                         if (peer->redirect_learned.a4 &&
1828                             peer->redirect_learned.a4 != rt->rt_gateway)
1829                                 check_peer_redir(&rt->dst, peer);
1830                 }
1831
1832                 rt->rt_peer_genid = rt_peer_genid();
1833         }
1834 }
1835
1836 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1837 {
1838         struct rtable *rt = (struct rtable *) dst;
1839
1840         if (rt_is_expired(rt))
1841                 return NULL;
1842         ipv4_validate_peer(rt);
1843         return dst;
1844 }
1845
1846 static void ipv4_dst_destroy(struct dst_entry *dst)
1847 {
1848         struct rtable *rt = (struct rtable *) dst;
1849         struct inet_peer *peer = rt->peer;
1850
1851         if (rt->fi) {
1852                 fib_info_put(rt->fi);
1853                 rt->fi = NULL;
1854         }
1855         if (peer) {
1856                 rt->peer = NULL;
1857                 inet_putpeer(peer);
1858         }
1859 }
1860
1861
1862 static void ipv4_link_failure(struct sk_buff *skb)
1863 {
1864         struct rtable *rt;
1865
1866         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1867
1868         rt = skb_rtable(skb);
1869         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1870                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1871 }
1872
1873 static int ip_rt_bug(struct sk_buff *skb)
1874 {
1875         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1876                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1877                 skb->dev ? skb->dev->name : "?");
1878         kfree_skb(skb);
1879         WARN_ON(1);
1880         return 0;
1881 }
1882
1883 /*
1884    We do not cache the source address of the outgoing interface,
1885    because it is used only by the IP RR, TS and SRR options,
1886    so it is out of the fast path.
1887
1888    BTW remember: "addr" is allowed to be unaligned
1889    in IP options!
1890  */
1891
1892 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1893 {
1894         __be32 src;
1895
1896         if (rt_is_output_route(rt))
1897                 src = ip_hdr(skb)->saddr;
1898         else {
1899                 struct fib_result res;
1900                 struct flowi4 fl4;
1901                 struct iphdr *iph;
1902
1903                 iph = ip_hdr(skb);
1904
1905                 memset(&fl4, 0, sizeof(fl4));
1906                 fl4.daddr = iph->daddr;
1907                 fl4.saddr = iph->saddr;
1908                 fl4.flowi4_tos = RT_TOS(iph->tos);
1909                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1910                 fl4.flowi4_iif = skb->dev->ifindex;
1911                 fl4.flowi4_mark = skb->mark;
1912
1913                 rcu_read_lock();
1914                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1915                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1916                 else
1917                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1918                                         RT_SCOPE_UNIVERSE);
1919                 rcu_read_unlock();
1920         }
1921         memcpy(addr, &src, 4);
1922 }
1923
1924 #ifdef CONFIG_IP_ROUTE_CLASSID
1925 static void set_class_tag(struct rtable *rt, u32 tag)
1926 {
1927         if (!(rt->dst.tclassid & 0xFFFF))
1928                 rt->dst.tclassid |= tag & 0xFFFF;
1929         if (!(rt->dst.tclassid & 0xFFFF0000))
1930                 rt->dst.tclassid |= tag & 0xFFFF0000;
1931 }
1932 #endif
1933
1934 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1935 {
1936         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1937
1938         if (advmss == 0) {
1939                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1940                                ip_rt_min_advmss);
1941                 if (advmss > 65535 - 40)
1942                         advmss = 65535 - 40;
1943         }
1944         return advmss;
1945 }
1946
1947 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1948 {
1949         const struct rtable *rt = (const struct rtable *) dst;
1950         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1951
1952         if (mtu && rt_is_output_route(rt))
1953                 return mtu;
1954
1955         mtu = dst->dev->mtu;
1956
1957         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1958
1959                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1960                         mtu = 576;
1961         }
1962
1963         if (mtu > IP_MAX_MTU)
1964                 mtu = IP_MAX_MTU;
1965
1966         return mtu;
1967 }
1968
1969 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1970                             struct fib_info *fi)
1971 {
1972         struct inet_peer *peer;
1973         int create = 0;
1974
1975         /* If a peer entry exists for this destination, we must hook
1976          * it up in order to get at cached metrics.
1977          */
1978         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1979                 create = 1;
1980
1981         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1982         if (peer) {
1983                 rt->rt_peer_genid = rt_peer_genid();
1984                 if (inet_metrics_new(peer))
1985                         memcpy(peer->metrics, fi->fib_metrics,
1986                                sizeof(u32) * RTAX_MAX);
1987                 dst_init_metrics(&rt->dst, peer->metrics, false);
1988
1989                 check_peer_pmtu(&rt->dst, peer);
1990                 if (peer->redirect_genid != redirect_genid)
1991                         peer->redirect_learned.a4 = 0;
1992                 if (peer->redirect_learned.a4 &&
1993                     peer->redirect_learned.a4 != rt->rt_gateway) {
1994                         rt->rt_gateway = peer->redirect_learned.a4;
1995                         rt->rt_flags |= RTCF_REDIRECTED;
1996                 }
1997         } else {
1998                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1999                         rt->fi = fi;
2000                         atomic_inc(&fi->fib_clntref);
2001                 }
2002                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
2003         }
2004 }
2005
2006 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
2007                            const struct fib_result *res,
2008                            struct fib_info *fi, u16 type, u32 itag)
2009 {
2010         struct dst_entry *dst = &rt->dst;
2011
2012         if (fi) {
2013                 if (FIB_RES_GW(*res) &&
2014                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
2015                         rt->rt_gateway = FIB_RES_GW(*res);
2016                 rt_init_metrics(rt, fl4, fi);
2017 #ifdef CONFIG_IP_ROUTE_CLASSID
2018                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
2019 #endif
2020         }
2021
2022         if (dst_mtu(dst) > IP_MAX_MTU)
2023                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
2024         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
2025                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
2026
2027 #ifdef CONFIG_IP_ROUTE_CLASSID
2028 #ifdef CONFIG_IP_MULTIPLE_TABLES
2029         set_class_tag(rt, fib_rules_tclass(res));
2030 #endif
2031         set_class_tag(rt, itag);
2032 #endif
2033 }
2034
2035 static struct rtable *rt_dst_alloc(struct net_device *dev,
2036                                    bool nopolicy, bool noxfrm)
2037 {
2038         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2039                          DST_HOST |
2040                          (nopolicy ? DST_NOPOLICY : 0) |
2041                          (noxfrm ? DST_NOXFRM : 0));
2042 }
2043
2044 /* called in rcu_read_lock() section */
2045 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2046                                 u8 tos, struct net_device *dev, int our)
2047 {
2048         unsigned int hash;
2049         struct rtable *rth;
2050         __be32 spec_dst;
2051         struct in_device *in_dev = __in_dev_get_rcu(dev);
2052         u32 itag = 0;
2053         int err;
2054
2055         /* Primary sanity checks. */
2056
2057         if (in_dev == NULL)
2058                 return -EINVAL;
2059
2060         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2061             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2062                 goto e_inval;
2063
2064         if (ipv4_is_zeronet(saddr)) {
2065                 if (!ipv4_is_local_multicast(daddr))
2066                         goto e_inval;
2067                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2068         } else {
2069                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2070                                           &itag);
2071                 if (err < 0)
2072                         goto e_err;
2073         }
2074         rth = rt_dst_alloc(init_net.loopback_dev,
2075                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2076         if (!rth)
2077                 goto e_nobufs;
2078
2079 #ifdef CONFIG_IP_ROUTE_CLASSID
2080         rth->dst.tclassid = itag;
2081 #endif
2082         rth->dst.output = ip_rt_bug;
2083
2084         rth->rt_key_dst = daddr;
2085         rth->rt_key_src = saddr;
2086         rth->rt_genid   = rt_genid(dev_net(dev));
2087         rth->rt_flags   = RTCF_MULTICAST;
2088         rth->rt_type    = RTN_MULTICAST;
2089         rth->rt_key_tos = tos;
2090         rth->rt_dst     = daddr;
2091         rth->rt_src     = saddr;
2092         rth->rt_route_iif = dev->ifindex;
2093         rth->rt_iif     = dev->ifindex;
2094         rth->rt_oif     = 0;
2095         rth->rt_mark    = skb->mark;
2096         rth->rt_gateway = daddr;
2097         rth->rt_spec_dst= spec_dst;
2098         rth->rt_peer_genid = 0;
2099         rth->peer = NULL;
2100         rth->fi = NULL;
2101         if (our) {
2102                 rth->dst.input= ip_local_deliver;
2103                 rth->rt_flags |= RTCF_LOCAL;
2104         }
2105
2106 #ifdef CONFIG_IP_MROUTE
2107         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2108                 rth->dst.input = ip_mr_input;
2109 #endif
2110         RT_CACHE_STAT_INC(in_slow_mc);
2111
2112         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2113         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2114         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2115
2116 e_nobufs:
2117         return -ENOBUFS;
2118 e_inval:
2119         return -EINVAL;
2120 e_err:
2121         return err;
2122 }
2123
2124
2125 static void ip_handle_martian_source(struct net_device *dev,
2126                                      struct in_device *in_dev,
2127                                      struct sk_buff *skb,
2128                                      __be32 daddr,
2129                                      __be32 saddr)
2130 {
2131         RT_CACHE_STAT_INC(in_martian_src);
2132 #ifdef CONFIG_IP_ROUTE_VERBOSE
2133         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2134                 /*
2135                  *      RFC1812 recommendation: if the source is martian,
2136                  *      the only hint is the MAC header.
2137                  */
2138                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2139                         &daddr, &saddr, dev->name);
2140                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2141                         int i;
2142                         const unsigned char *p = skb_mac_header(skb);
2143                         printk(KERN_WARNING "ll header: ");
2144                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2145                                 printk("%02x", *p);
2146                                 if (i < (dev->hard_header_len - 1))
2147                                         printk(":");
2148                         }
2149                         printk("\n");
2150                 }
2151         }
2152 #endif
2153 }
2154
2155 /* called in rcu_read_lock() section */
2156 static int __mkroute_input(struct sk_buff *skb,
2157                            const struct fib_result *res,
2158                            struct in_device *in_dev,
2159                            __be32 daddr, __be32 saddr, u32 tos,
2160                            struct rtable **result)
2161 {
2162         struct rtable *rth;
2163         int err;
2164         struct in_device *out_dev;
2165         unsigned int flags = 0;
2166         __be32 spec_dst;
2167         u32 itag = 0;
2168
2169         /* get a working reference to the output device */
2170         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2171         if (out_dev == NULL) {
2172                 if (net_ratelimit())
2173                         printk(KERN_CRIT "Bug in ip_route_input" \
2174                                "_slow(). Please, report\n");
2175                 return -EINVAL;
2176         }
2177
2178
2179         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2180                                   in_dev->dev, &spec_dst, &itag);
2181         if (err < 0) {
2182                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2183                                          saddr);
2184
2185                 goto cleanup;
2186         }
2187
2188         if (err)
2189                 flags |= RTCF_DIRECTSRC;
2190
2191         if (out_dev == in_dev && err &&
2192             (IN_DEV_SHARED_MEDIA(out_dev) ||
2193              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2194                 flags |= RTCF_DOREDIRECT;
2195
2196         if (skb->protocol != htons(ETH_P_IP)) {
2197                 /* Not IP (i.e. ARP). Do not create a route if it is
2198                  * invalid for proxy arp. DNAT routes are always valid.
2199                  *
2200                  * The proxy arp feature has been extended to allow ARP
2201                  * replies back to the same interface, to support
2202                  * Private VLAN switch technologies. See arp.c.
2203                  */
2204                 if (out_dev == in_dev &&
2205                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2206                         err = -EINVAL;
2207                         goto cleanup;
2208                 }
2209         }
2210
2211         rth = rt_dst_alloc(out_dev->dev,
2212                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2213                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2214         if (!rth) {
2215                 err = -ENOBUFS;
2216                 goto cleanup;
2217         }
2218
2219         rth->rt_key_dst = daddr;
2220         rth->rt_key_src = saddr;
2221         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2222         rth->rt_flags = flags;
2223         rth->rt_type = res->type;
2224         rth->rt_key_tos = tos;
2225         rth->rt_dst     = daddr;
2226         rth->rt_src     = saddr;
2227         rth->rt_route_iif = in_dev->dev->ifindex;
2228         rth->rt_iif     = in_dev->dev->ifindex;
2229         rth->rt_oif     = 0;
2230         rth->rt_mark    = skb->mark;
2231         rth->rt_gateway = daddr;
2232         rth->rt_spec_dst= spec_dst;
2233         rth->rt_peer_genid = 0;
2234         rth->peer = NULL;
2235         rth->fi = NULL;
2236
2237         rth->dst.input = ip_forward;
2238         rth->dst.output = ip_output;
2239
2240         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2241
2242         *result = rth;
2243         err = 0;
2244  cleanup:
2245         return err;
2246 }
2247
2248 static int ip_mkroute_input(struct sk_buff *skb,
2249                             struct fib_result *res,
2250                             const struct flowi4 *fl4,
2251                             struct in_device *in_dev,
2252                             __be32 daddr, __be32 saddr, u32 tos)
2253 {
2254         struct rtable* rth = NULL;
2255         int err;
2256         unsigned hash;
2257
2258 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2259         if (res->fi && res->fi->fib_nhs > 1)
2260                 fib_select_multipath(res);
2261 #endif
2262
2263         /* create a routing cache entry */
2264         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2265         if (err)
2266                 return err;
2267
2268         /* put it into the cache */
2269         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2270                        rt_genid(dev_net(rth->dst.dev)));
2271         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2272         if (IS_ERR(rth))
2273                 return PTR_ERR(rth);
2274         return 0;
2275 }
2276
2277 /*
2278  *      NOTE. We drop all packets that have local source
2279  *      addresses, because every properly looped-back packet
2280  *      must already have the correct destination attached by the output routine.
2281  *
2282  *      Such an approach solves two big problems:
2283  *      1. Non-simplex devices are handled properly.
2284  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2285  *      Called with rcu_read_lock().
2286  */
2287
2288 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2289                                u8 tos, struct net_device *dev)
2290 {
2291         struct fib_result res;
2292         struct in_device *in_dev = __in_dev_get_rcu(dev);
2293         struct flowi4   fl4;
2294         unsigned        flags = 0;
2295         u32             itag = 0;
2296         struct rtable * rth;
2297         unsigned        hash;
2298         __be32          spec_dst;
2299         int             err = -EINVAL;
2300         struct net    * net = dev_net(dev);
2301
2302         /* IP on this device is disabled. */
2303
2304         if (!in_dev)
2305                 goto out;
2306
2307         /* Check for the most weird martians, which may not be detected
2308            by fib_lookup.
2309          */
2310
2311         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2312             ipv4_is_loopback(saddr))
2313                 goto martian_source;
2314
2315         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2316                 goto brd_input;
2317
2318         /* Accept zero addresses only for limited broadcast;
2319          * I do not even know whether to fix it or not. Waiting for complaints :-)
2320          */
2321         if (ipv4_is_zeronet(saddr))
2322                 goto martian_source;
2323
2324         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2325                 goto martian_destination;
2326
2327         /*
2328          *      Now we are ready to route the packet.
2329          */
2330         fl4.flowi4_oif = 0;
2331         fl4.flowi4_iif = dev->ifindex;
2332         fl4.flowi4_mark = skb->mark;
2333         fl4.flowi4_tos = tos;
2334         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2335         fl4.daddr = daddr;
2336         fl4.saddr = saddr;
2337         err = fib_lookup(net, &fl4, &res);
2338         if (err != 0) {
2339                 if (!IN_DEV_FORWARD(in_dev))
2340                         goto e_hostunreach;
2341                 goto no_route;
2342         }
2343
2344         RT_CACHE_STAT_INC(in_slow_tot);
2345
2346         if (res.type == RTN_BROADCAST)
2347                 goto brd_input;
2348
2349         if (res.type == RTN_LOCAL) {
2350                 err = fib_validate_source(skb, saddr, daddr, tos,
2351                                           net->loopback_dev->ifindex,
2352                                           dev, &spec_dst, &itag);
2353                 if (err < 0)
2354                         goto martian_source_keep_err;
2355                 if (err)
2356                         flags |= RTCF_DIRECTSRC;
2357                 spec_dst = daddr;
2358                 goto local_input;
2359         }
2360
2361         if (!IN_DEV_FORWARD(in_dev))
2362                 goto e_hostunreach;
2363         if (res.type != RTN_UNICAST)
2364                 goto martian_destination;
2365
2366         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2367 out:    return err;
2368
2369 brd_input:
2370         if (skb->protocol != htons(ETH_P_IP))
2371                 goto e_inval;
2372
2373         if (ipv4_is_zeronet(saddr))
2374                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2375         else {
2376                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2377                                           &itag);
2378                 if (err < 0)
2379                         goto martian_source_keep_err;
2380                 if (err)
2381                         flags |= RTCF_DIRECTSRC;
2382         }
2383         flags |= RTCF_BROADCAST;
2384         res.type = RTN_BROADCAST;
2385         RT_CACHE_STAT_INC(in_brd);
2386
2387 local_input:
2388         rth = rt_dst_alloc(net->loopback_dev,
2389                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2390         if (!rth)
2391                 goto e_nobufs;
2392
2393         rth->dst.input= ip_local_deliver;
2394         rth->dst.output= ip_rt_bug;
2395 #ifdef CONFIG_IP_ROUTE_CLASSID
2396         rth->dst.tclassid = itag;
2397 #endif
2398
2399         rth->rt_key_dst = daddr;
2400         rth->rt_key_src = saddr;
2401         rth->rt_genid = rt_genid(net);
2402         rth->rt_flags   = flags|RTCF_LOCAL;
2403         rth->rt_type    = res.type;
2404         rth->rt_key_tos = tos;
2405         rth->rt_dst     = daddr;
2406         rth->rt_src     = saddr;
2407 #ifdef CONFIG_IP_ROUTE_CLASSID
2408         rth->dst.tclassid = itag;
2409 #endif
2410         rth->rt_route_iif = dev->ifindex;
2411         rth->rt_iif     = dev->ifindex;
2412         rth->rt_oif     = 0;
2413         rth->rt_mark    = skb->mark;
2414         rth->rt_gateway = daddr;
2415         rth->rt_spec_dst= spec_dst;
2416         rth->rt_peer_genid = 0;
2417         rth->peer = NULL;
2418         rth->fi = NULL;
2419         if (res.type == RTN_UNREACHABLE) {
2420                 rth->dst.input= ip_error;
2421                 rth->dst.error= -err;
2422                 rth->rt_flags   &= ~RTCF_LOCAL;
2423         }
2424         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2425         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2426         err = 0;
2427         if (IS_ERR(rth))
2428                 err = PTR_ERR(rth);
2429         goto out;
2430
2431 no_route:
2432         RT_CACHE_STAT_INC(in_no_route);
2433         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2434         res.type = RTN_UNREACHABLE;
2435         if (err == -ESRCH)
2436                 err = -ENETUNREACH;
2437         goto local_input;
2438
2439         /*
2440          *      Do not cache martian addresses: they should be logged (RFC1812)
2441          */
2442 martian_destination:
2443         RT_CACHE_STAT_INC(in_martian_dst);
2444 #ifdef CONFIG_IP_ROUTE_VERBOSE
2445         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2446                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2447                         &daddr, &saddr, dev->name);
2448 #endif
2449
2450 e_hostunreach:
2451         err = -EHOSTUNREACH;
2452         goto out;
2453
2454 e_inval:
2455         err = -EINVAL;
2456         goto out;
2457
2458 e_nobufs:
2459         err = -ENOBUFS;
2460         goto out;
2461
2462 martian_source:
2463         err = -EINVAL;
2464 martian_source_keep_err:
2465         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2466         goto out;
2467 }
2468
2469 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2470                            u8 tos, struct net_device *dev, bool noref)
2471 {
2472         struct rtable * rth;
2473         unsigned        hash;
2474         int iif = dev->ifindex;
2475         struct net *net;
2476         int res;
2477
2478         net = dev_net(dev);
2479
2480         rcu_read_lock();
2481
2482         if (!rt_caching(net))
2483                 goto skip_cache;
2484
2485         tos &= IPTOS_RT_MASK;
2486         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2487
2488         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2489              rth = rcu_dereference(rth->dst.rt_next)) {
2490                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2491                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2492                      (rth->rt_route_iif ^ iif) |
2493                      (rth->rt_key_tos ^ tos)) == 0 &&
2494                     rth->rt_mark == skb->mark &&
2495                     net_eq(dev_net(rth->dst.dev), net) &&
2496                     !rt_is_expired(rth)) {
2497                         ipv4_validate_peer(rth);
2498                         if (noref) {
2499                                 dst_use_noref(&rth->dst, jiffies);
2500                                 skb_dst_set_noref(skb, &rth->dst);
2501                         } else {
2502                                 dst_use(&rth->dst, jiffies);
2503                                 skb_dst_set(skb, &rth->dst);
2504                         }
2505                         RT_CACHE_STAT_INC(in_hit);
2506                         rcu_read_unlock();
2507                         return 0;
2508                 }
2509                 RT_CACHE_STAT_INC(in_hlist_search);
2510         }
2511
2512 skip_cache:
2513         /* Multicast recognition logic was moved from the route cache to here.
2514            The problem was that too many Ethernet cards have broken/missing
2515            hardware multicast filters :-( As a result, a host on a multicast
2516            network acquires a lot of useless route cache entries, e.g. from
2517            SDR messages from all over the world. Now we try to get rid of them.
2518            Really, provided the software IP multicast filter is organized
2519            reasonably (at least, hashed), it does not result in a slowdown
2520            compared with route cache reject entries.
2521            Note that multicast routers are not affected, because a
2522            route cache entry is created eventually.
2523          */
2524         if (ipv4_is_multicast(daddr)) {
2525                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2526
2527                 if (in_dev) {
2528                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2529                                                   ip_hdr(skb)->protocol);
2530                         if (our
2531 #ifdef CONFIG_IP_MROUTE
2532                                 ||
2533                             (!ipv4_is_local_multicast(daddr) &&
2534                              IN_DEV_MFORWARD(in_dev))
2535 #endif
2536                            ) {
2537                                 int res = ip_route_input_mc(skb, daddr, saddr,
2538                                                             tos, dev, our);
2539                                 rcu_read_unlock();
2540                                 return res;
2541                         }
2542                 }
2543                 rcu_read_unlock();
2544                 return -EINVAL;
2545         }
2546         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2547         rcu_read_unlock();
2548         return res;
2549 }
2550 EXPORT_SYMBOL(ip_route_input_common);
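/*
 * Illustrative sketch (editorial addition, not part of the build): how a
 * receive-path caller might use the lookup above.  "example_rcv_finish" and
 * its arguments are hypothetical; the point is the noref flag: with
 * noref == true the skb only borrows the cached dst (skb_dst_set_noref),
 * which is safe only while the caller stays inside its rcu_read_lock()
 * section or later pins the entry with skb_dst_force().
 */
#if 0
static int example_rcv_finish(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);

	/* Borrowed (non-refcounted) dst: cheaper for packets that are fully
	 * processed before the current RCU read-side section ends.
	 */
	return ip_route_input_common(skb, iph->daddr, iph->saddr,
				     iph->tos, dev, true);
}
#endif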
2551
2552 /* called with rcu_read_lock() */
2553 static struct rtable *__mkroute_output(const struct fib_result *res,
2554                                        const struct flowi4 *fl4,
2555                                        __be32 orig_daddr, __be32 orig_saddr,
2556                                        int orig_oif, __u8 orig_rtos,
2557                                        struct net_device *dev_out,
2558                                        unsigned int flags)
2559 {
2560         struct fib_info *fi = res->fi;
2561         struct in_device *in_dev;
2562         u16 type = res->type;
2563         struct rtable *rth;
2564
2565         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2566                 return ERR_PTR(-EINVAL);
2567
2568         if (ipv4_is_lbcast(fl4->daddr))
2569                 type = RTN_BROADCAST;
2570         else if (ipv4_is_multicast(fl4->daddr))
2571                 type = RTN_MULTICAST;
2572         else if (ipv4_is_zeronet(fl4->daddr))
2573                 return ERR_PTR(-EINVAL);
2574
2575         if (dev_out->flags & IFF_LOOPBACK)
2576                 flags |= RTCF_LOCAL;
2577
2578         in_dev = __in_dev_get_rcu(dev_out);
2579         if (!in_dev)
2580                 return ERR_PTR(-EINVAL);
2581
2582         if (type == RTN_BROADCAST) {
2583                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2584                 fi = NULL;
2585         } else if (type == RTN_MULTICAST) {
2586                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2587                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2588                                      fl4->flowi4_proto))
2589                         flags &= ~RTCF_LOCAL;
2590                 /* If a multicast route does not exist, use the
2591                  * default one, but do not gateway in this case.
2592                  * Yes, it is a hack.
2593                  */
2594                 if (fi && res->prefixlen < 4)
2595                         fi = NULL;
2596         }
2597
2598         rth = rt_dst_alloc(dev_out,
2599                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2600                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2601         if (!rth)
2602                 return ERR_PTR(-ENOBUFS);
2603
2604         rth->dst.output = ip_output;
2605
2606         rth->rt_key_dst = orig_daddr;
2607         rth->rt_key_src = orig_saddr;
2608         rth->rt_genid = rt_genid(dev_net(dev_out));
2609         rth->rt_flags   = flags;
2610         rth->rt_type    = type;
2611         rth->rt_key_tos = orig_rtos;
2612         rth->rt_dst     = fl4->daddr;
2613         rth->rt_src     = fl4->saddr;
2614         rth->rt_route_iif = 0;
2615         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2616         rth->rt_oif     = orig_oif;
2617         rth->rt_mark    = fl4->flowi4_mark;
2618         rth->rt_gateway = fl4->daddr;
2619         rth->rt_spec_dst= fl4->saddr;
2620         rth->rt_peer_genid = 0;
2621         rth->peer = NULL;
2622         rth->fi = NULL;
2623
2624         RT_CACHE_STAT_INC(out_slow_tot);
2625
2626         if (flags & RTCF_LOCAL) {
2627                 rth->dst.input = ip_local_deliver;
2628                 rth->rt_spec_dst = fl4->daddr;
2629         }
2630         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2631                 rth->rt_spec_dst = fl4->saddr;
2632                 if (flags & RTCF_LOCAL &&
2633                     !(dev_out->flags & IFF_LOOPBACK)) {
2634                         rth->dst.output = ip_mc_output;
2635                         RT_CACHE_STAT_INC(out_slow_mc);
2636                 }
2637 #ifdef CONFIG_IP_MROUTE
2638                 if (type == RTN_MULTICAST) {
2639                         if (IN_DEV_MFORWARD(in_dev) &&
2640                             !ipv4_is_local_multicast(fl4->daddr)) {
2641                                 rth->dst.input = ip_mr_input;
2642                                 rth->dst.output = ip_mc_output;
2643                         }
2644                 }
2645 #endif
2646         }
2647
2648         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2649
2650         return rth;
2651 }
2652
2653 /*
2654  * Major route resolver routine.
2655  * called with rcu_read_lock();
2656  */
2657
2658 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2659 {
2660         struct net_device *dev_out = NULL;
2661         __u8 tos = RT_FL_TOS(fl4);
2662         unsigned int flags = 0;
2663         struct fib_result res;
2664         struct rtable *rth;
2665         __be32 orig_daddr;
2666         __be32 orig_saddr;
2667         int orig_oif;
2668
2669         res.fi          = NULL;
2670 #ifdef CONFIG_IP_MULTIPLE_TABLES
2671         res.r           = NULL;
2672 #endif
2673
2674         orig_daddr = fl4->daddr;
2675         orig_saddr = fl4->saddr;
2676         orig_oif = fl4->flowi4_oif;
2677
2678         fl4->flowi4_iif = net->loopback_dev->ifindex;
2679         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2680         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2681                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2682
2683         rcu_read_lock();
2684         if (fl4->saddr) {
2685                 rth = ERR_PTR(-EINVAL);
2686                 if (ipv4_is_multicast(fl4->saddr) ||
2687                     ipv4_is_lbcast(fl4->saddr) ||
2688                     ipv4_is_zeronet(fl4->saddr))
2689                         goto out;
2690
2691                 /* I removed the check for oif == dev_out->oif here.
2692                    It was wrong for two reasons:
2693                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2694                       is assigned to multiple interfaces.
2695                    2. Moreover, we are allowed to send packets with the saddr
2696                       of another iface. --ANK
2697                  */
2698
2699                 if (fl4->flowi4_oif == 0 &&
2700                     (ipv4_is_multicast(fl4->daddr) ||
2701                      ipv4_is_lbcast(fl4->daddr))) {
2702                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2703                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2704                         if (dev_out == NULL)
2705                                 goto out;
2706
2707                         /* Special hack: the user can direct multicasts
2708                            and limited broadcast via the necessary interface
2709                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2710                            This hack is not just for fun, it allows
2711                            vic, vat and friends to work.
2712                            They bind a socket to loopback, set the ttl to zero
2713                            and expect that it will work.
2714                            From the viewpoint of the routing cache they are broken,
2715                            because we are not allowed to build a multicast path
2716                            with a loopback source addr (look, the routing cache
2717                            cannot know that the ttl is zero, so the packet
2718                            will not leave this host and the route is valid).
2719                            Luckily, this hack is a good workaround.
2720                          */
2721
2722                         fl4->flowi4_oif = dev_out->ifindex;
2723                         goto make_route;
2724                 }
2725
2726                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2727                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2728                         if (!__ip_dev_find(net, fl4->saddr, false))
2729                                 goto out;
2730                 }
2731         }
2732
2733
2734         if (fl4->flowi4_oif) {
2735                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2736                 rth = ERR_PTR(-ENODEV);
2737                 if (dev_out == NULL)
2738                         goto out;
2739
2740                 /* RACE: Check return value of inet_select_addr instead. */
2741                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2742                         rth = ERR_PTR(-ENETUNREACH);
2743                         goto out;
2744                 }
2745                 if (ipv4_is_local_multicast(fl4->daddr) ||
2746                     ipv4_is_lbcast(fl4->daddr)) {
2747                         if (!fl4->saddr)
2748                                 fl4->saddr = inet_select_addr(dev_out, 0,
2749                                                               RT_SCOPE_LINK);
2750                         goto make_route;
2751                 }
2752                 if (!fl4->saddr) {
2753                         if (ipv4_is_multicast(fl4->daddr))
2754                                 fl4->saddr = inet_select_addr(dev_out, 0,
2755                                                               fl4->flowi4_scope);
2756                         else if (!fl4->daddr)
2757                                 fl4->saddr = inet_select_addr(dev_out, 0,
2758                                                               RT_SCOPE_HOST);
2759                 }
2760         }
2761
2762         if (!fl4->daddr) {
2763                 fl4->daddr = fl4->saddr;
2764                 if (!fl4->daddr)
2765                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2766                 dev_out = net->loopback_dev;
2767                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2768                 res.type = RTN_LOCAL;
2769                 flags |= RTCF_LOCAL;
2770                 goto make_route;
2771         }
2772
2773         if (fib_lookup(net, fl4, &res)) {
2774                 res.fi = NULL;
2775                 if (fl4->flowi4_oif) {
2776                         /* Apparently, the routing tables are wrong. Assume
2777                            that the destination is on-link.
2778
2779                            WHY? DW.
2780                            Because we are allowed to send to an iface
2781                            even if it has NO routes and NO assigned
2782                            addresses. When oif is specified, the routing
2783                            tables are looked up with only one purpose:
2784                            to catch whether the destination is gatewayed rather than
2785                            direct. Moreover, if MSG_DONTROUTE is set,
2786                            we send the packet, ignoring both the routing tables
2787                            and the ifaddr state. --ANK
2788
2789
2790                            We could do this even if oif is unknown,
2791                            as IPv6 likely does, but we do not.
2792                          */
2793
2794                         if (fl4->saddr == 0)
2795                                 fl4->saddr = inet_select_addr(dev_out, 0,
2796                                                               RT_SCOPE_LINK);
2797                         res.type = RTN_UNICAST;
2798                         goto make_route;
2799                 }
2800                 rth = ERR_PTR(-ENETUNREACH);
2801                 goto out;
2802         }
2803
2804         if (res.type == RTN_LOCAL) {
2805                 if (!fl4->saddr) {
2806                         if (res.fi->fib_prefsrc)
2807                                 fl4->saddr = res.fi->fib_prefsrc;
2808                         else
2809                                 fl4->saddr = fl4->daddr;
2810                 }
2811                 dev_out = net->loopback_dev;
2812                 fl4->flowi4_oif = dev_out->ifindex;
2813                 res.fi = NULL;
2814                 flags |= RTCF_LOCAL;
2815                 goto make_route;
2816         }
2817
2818 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2819         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2820                 fib_select_multipath(&res);
2821         else
2822 #endif
2823         if (!res.prefixlen &&
2824             res.table->tb_num_default > 1 &&
2825             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2826                 fib_select_default(&res);
2827
2828         if (!fl4->saddr)
2829                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2830
2831         dev_out = FIB_RES_DEV(res);
2832         fl4->flowi4_oif = dev_out->ifindex;
2833
2834
2835 make_route:
2836         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2837                                tos, dev_out, flags);
2838         if (!IS_ERR(rth)) {
2839                 unsigned int hash;
2840
2841                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2842                                rt_genid(dev_net(dev_out)));
2843                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2844         }
2845
2846 out:
2847         rcu_read_unlock();
2848         return rth;
2849 }
2850
2851 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2852 {
2853         struct rtable *rth;
2854         unsigned int hash;
2855
2856         if (!rt_caching(net))
2857                 goto slow_output;
2858
2859         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2860
2861         rcu_read_lock_bh();
2862         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2863                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2864                 if (rth->rt_key_dst == flp4->daddr &&
2865                     rth->rt_key_src == flp4->saddr &&
2866                     rt_is_output_route(rth) &&
2867                     rth->rt_oif == flp4->flowi4_oif &&
2868                     rth->rt_mark == flp4->flowi4_mark &&
2869                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2870                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2871                     net_eq(dev_net(rth->dst.dev), net) &&
2872                     !rt_is_expired(rth)) {
2873                         ipv4_validate_peer(rth);
2874                         dst_use(&rth->dst, jiffies);
2875                         RT_CACHE_STAT_INC(out_hit);
2876                         rcu_read_unlock_bh();
2877                         if (!flp4->saddr)
2878                                 flp4->saddr = rth->rt_src;
2879                         if (!flp4->daddr)
2880                                 flp4->daddr = rth->rt_dst;
2881                         return rth;
2882                 }
2883                 RT_CACHE_STAT_INC(out_hlist_search);
2884         }
2885         rcu_read_unlock_bh();
2886
2887 slow_output:
2888         return ip_route_output_slow(net, flp4);
2889 }
2890 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2891
2892 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2893 {
2894         return NULL;
2895 }
2896
2897 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2898 {
2899         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2900
2901         return mtu ? : dst->dev->mtu;
2902 }
2903
2904 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2905 {
2906 }
2907
2908 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2909                                           unsigned long old)
2910 {
2911         return NULL;
2912 }
2913
2914 static struct dst_ops ipv4_dst_blackhole_ops = {
2915         .family                 =       AF_INET,
2916         .protocol               =       cpu_to_be16(ETH_P_IP),
2917         .destroy                =       ipv4_dst_destroy,
2918         .check                  =       ipv4_blackhole_dst_check,
2919         .mtu                    =       ipv4_blackhole_mtu,
2920         .default_advmss         =       ipv4_default_advmss,
2921         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2922         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2923         .neigh_lookup           =       ipv4_neigh_lookup,
2924 };
2925
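/*
 * Clone an existing route into a dst backed by ipv4_dst_blackhole_ops:
 * the copy keeps the original's keys, flags and metrics but discards
 * every packet via dst_discard().  The reference on the original dst
 * is dropped before returning.
 */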
2926 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2927 {
2928         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2929         struct rtable *ort = (struct rtable *) dst_orig;
2930
2931         if (rt) {
2932                 struct dst_entry *new = &rt->dst;
2933
2934                 new->__use = 1;
2935                 new->input = dst_discard;
2936                 new->output = dst_discard;
2937                 dst_copy_metrics(new, &ort->dst);
2938
2939                 new->dev = ort->dst.dev;
2940                 if (new->dev)
2941                         dev_hold(new->dev);
2942
2943                 rt->rt_key_dst = ort->rt_key_dst;
2944                 rt->rt_key_src = ort->rt_key_src;
2945                 rt->rt_key_tos = ort->rt_key_tos;
2946                 rt->rt_route_iif = ort->rt_route_iif;
2947                 rt->rt_iif = ort->rt_iif;
2948                 rt->rt_oif = ort->rt_oif;
2949                 rt->rt_mark = ort->rt_mark;
2950
2951                 rt->rt_genid = rt_genid(net);
2952                 rt->rt_flags = ort->rt_flags;
2953                 rt->rt_type = ort->rt_type;
2954                 rt->rt_dst = ort->rt_dst;
2955                 rt->rt_src = ort->rt_src;
2956                 rt->rt_gateway = ort->rt_gateway;
2957                 rt->rt_spec_dst = ort->rt_spec_dst;
2958                 rt->peer = ort->peer;
2959                 if (rt->peer)
2960                         atomic_inc(&rt->peer->refcnt);
2961                 rt->fi = ort->fi;
2962                 if (rt->fi)
2963                         atomic_inc(&rt->fi->fib_clntref);
2964
2965                 dst_free(new);
2966         }
2967
2968         dst_release(dst_orig);
2969
2970         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2971 }
2972
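/*
 * Resolve an output route for @flp4; when a transport protocol is set,
 * pass the result through xfrm_lookup() so any IPsec policy is applied
 * to the returned dst.
 */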
2973 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2974                                     struct sock *sk)
2975 {
2976         struct rtable *rt = __ip_route_output_key(net, flp4);
2977
2978         if (IS_ERR(rt))
2979                 return rt;
2980
2981         if (flp4->flowi4_proto)
2982                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2983                                                    flowi4_to_flowi(flp4),
2984                                                    sk, 0);
2985
2986         return rt;
2987 }
2988 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2989
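/*
 * Fill an RTM_NEWROUTE netlink message describing @rt: route keys,
 * output device, preferred source, gateway, metrics, mark and cache
 * information (expiry and TCP timestamp data from the inet_peer).
 */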
2990 static int rt_fill_info(struct net *net,
2991                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2992                         int nowait, unsigned int flags)
2993 {
2994         struct rtable *rt = skb_rtable(skb);
2995         struct rtmsg *r;
2996         struct nlmsghdr *nlh;
2997         unsigned long expires = 0;
2998         const struct inet_peer *peer = rt->peer;
2999         u32 id = 0, ts = 0, tsage = 0, error;
3000
3001         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
3002         if (nlh == NULL)
3003                 return -EMSGSIZE;
3004
3005         r = nlmsg_data(nlh);
3006         r->rtm_family    = AF_INET;
3007         r->rtm_dst_len  = 32;
3008         r->rtm_src_len  = 0;
3009         r->rtm_tos      = rt->rt_key_tos;
3010         r->rtm_table    = RT_TABLE_MAIN;
3011         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
3012         r->rtm_type     = rt->rt_type;
3013         r->rtm_scope    = RT_SCOPE_UNIVERSE;
3014         r->rtm_protocol = RTPROT_UNSPEC;
3015         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
3016         if (rt->rt_flags & RTCF_NOTIFY)
3017                 r->rtm_flags |= RTM_F_NOTIFY;
3018
3019         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
3020
3021         if (rt->rt_key_src) {
3022                 r->rtm_src_len = 32;
3023                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
3024         }
3025         if (rt->dst.dev)
3026                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
3027 #ifdef CONFIG_IP_ROUTE_CLASSID
3028         if (rt->dst.tclassid)
3029                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3030 #endif
3031         if (rt_is_input_route(rt))
3032                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3033         else if (rt->rt_src != rt->rt_key_src)
3034                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3035
3036         if (rt->rt_dst != rt->rt_gateway)
3037                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3038
3039         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3040                 goto nla_put_failure;
3041
3042         if (rt->rt_mark)
3043                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3044
3045         error = rt->dst.error;
3046         if (peer) {
3047                 inet_peer_refcheck(rt->peer);
3048                 if (peer->tcp_ts_stamp) {
3049                         ts = peer->tcp_ts;
3050                         tsage = get_seconds() - peer->tcp_ts_stamp;
3051                 }
3052                 expires = ACCESS_ONCE(peer->pmtu_expires);
3053                 if (expires) {
3054                         if (time_before(jiffies, expires))
3055                                 expires -= jiffies;
3056                         else
3057                                 expires = 0;
3058                 }
3059         }
3060
3061         if (rt_is_input_route(rt)) {
3062 #ifdef CONFIG_IP_MROUTE
3063                 __be32 dst = rt->rt_dst;
3064
3065                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3066                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3067                         int err = ipmr_get_route(net, skb,
3068                                                  rt->rt_src, rt->rt_dst,
3069                                                  r, nowait);
3070                         if (err <= 0) {
3071                                 if (!nowait) {
3072                                         if (err == 0)
3073                                                 return 0;
3074                                         goto nla_put_failure;
3075                                 } else {
3076                                         if (err == -EMSGSIZE)
3077                                                 goto nla_put_failure;
3078                                         error = err;
3079                                 }
3080                         }
3081                 } else
3082 #endif
3083                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3084         }
3085
3086         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3087                                expires, error) < 0)
3088                 goto nla_put_failure;
3089
3090         return nlmsg_end(skb, nlh);
3091
3092 nla_put_failure:
3093         nlmsg_cancel(skb, nlh);
3094         return -EMSGSIZE;
3095 }
3096
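/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the route either
 * through ip_route_input() (when RTA_IIF is given) or
 * ip_route_output_key(), describe it with rt_fill_info() and unicast
 * the answer back to the requester.
 */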
3097 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3098 {
3099         struct net *net = sock_net(in_skb->sk);
3100         struct rtmsg *rtm;
3101         struct nlattr *tb[RTA_MAX+1];
3102         struct rtable *rt = NULL;
3103         __be32 dst = 0;
3104         __be32 src = 0;
3105         u32 iif;
3106         int err;
3107         int mark;
3108         struct sk_buff *skb;
3109
3110         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3111         if (err < 0)
3112                 goto errout;
3113
3114         rtm = nlmsg_data(nlh);
3115
3116         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3117         if (skb == NULL) {
3118                 err = -ENOBUFS;
3119                 goto errout;
3120         }
3121
3122         /* Reserve room for dummy headers; this skb can pass
3123          * through a good chunk of the routing engine.
3124          */
3125         skb_reset_mac_header(skb);
3126         skb_reset_network_header(skb);
3127
3128         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3129         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3130         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3131
3132         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3133         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3134         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3135         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3136
3137         if (iif) {
3138                 struct net_device *dev;
3139
3140                 dev = __dev_get_by_index(net, iif);
3141                 if (dev == NULL) {
3142                         err = -ENODEV;
3143                         goto errout_free;
3144                 }
3145
3146                 skb->protocol   = htons(ETH_P_IP);
3147                 skb->dev        = dev;
3148                 skb->mark       = mark;
3149                 local_bh_disable();
3150                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3151                 local_bh_enable();
3152
3153                 rt = skb_rtable(skb);
3154                 if (err == 0 && rt->dst.error)
3155                         err = -rt->dst.error;
3156         } else {
3157                 struct flowi4 fl4 = {
3158                         .daddr = dst,
3159                         .saddr = src,
3160                         .flowi4_tos = rtm->rtm_tos,
3161                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3162                         .flowi4_mark = mark,
3163                 };
3164                 rt = ip_route_output_key(net, &fl4);
3165
3166                 err = 0;
3167                 if (IS_ERR(rt))
3168                         err = PTR_ERR(rt);
3169         }
3170
3171         if (err)
3172                 goto errout_free;
3173
3174         skb_dst_set(skb, &rt->dst);
3175         if (rtm->rtm_flags & RTM_F_NOTIFY)
3176                 rt->rt_flags |= RTCF_NOTIFY;
3177
3178         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3179                            RTM_NEWROUTE, 0, 0);
3180         if (err <= 0)
3181                 goto errout_free;
3182
3183         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3184 errout:
3185         return err;
3186
3187 errout_free:
3188         kfree_skb(skb);
3189         goto errout;
3190 }
3191
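/*
 * Netlink dump callback: walk the route cache hash chains under
 * rcu_read_lock_bh() and emit one RTM_NEWROUTE message per valid,
 * non-expired entry, resuming from the bucket/index saved in cb->args.
 */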
3192 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3193 {
3194         struct rtable *rt;
3195         int h, s_h;
3196         int idx, s_idx;
3197         struct net *net;
3198
3199         net = sock_net(skb->sk);
3200
3201         s_h = cb->args[0];
3202         if (s_h < 0)
3203                 s_h = 0;
3204         s_idx = idx = cb->args[1];
3205         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3206                 if (!rt_hash_table[h].chain)
3207                         continue;
3208                 rcu_read_lock_bh();
3209                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3210                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3211                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3212                                 continue;
3213                         if (rt_is_expired(rt))
3214                                 continue;
3215                         skb_dst_set_noref(skb, &rt->dst);
3216                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3217                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3218                                          1, NLM_F_MULTI) <= 0) {
3219                                 skb_dst_drop(skb);
3220                                 rcu_read_unlock_bh();
3221                                 goto done;
3222                         }
3223                         skb_dst_drop(skb);
3224                 }
3225                 rcu_read_unlock_bh();
3226         }
3227
3228 done:
3229         cb->args[0] = h;
3230         cb->args[1] = idx;
3231         return skb->len;
3232 }
3233
3234 void ip_rt_multicast_event(struct in_device *in_dev)
3235 {
3236         rt_cache_flush(dev_net(in_dev->dev), 0);
3237 }
3238
3239 #ifdef CONFIG_SYSCTL
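/*
 * Writing an integer delay to /proc/sys/net/ipv4/route/flush triggers
 * rt_cache_flush() for the owning namespace; reads are rejected with
 * -EINVAL.
 */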
3240 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3241                                         void __user *buffer,
3242                                         size_t *lenp, loff_t *ppos)
3243 {
3244         if (write) {
3245                 int flush_delay;
3246                 ctl_table ctl;
3247                 struct net *net;
3248
3249                 memcpy(&ctl, __ctl, sizeof(ctl));
3250                 ctl.data = &flush_delay;
3251                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3252
3253                 net = (struct net *)__ctl->extra1;
3254                 rt_cache_flush(net, flush_delay);
3255                 return 0;
3256         }
3257
3258         return -EINVAL;
3259 }
3260
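/*
 * Tunables exposed under /proc/sys/net/ipv4/route/: route cache
 * garbage-collection behaviour, ICMP redirect and error rate limits,
 * and PMTU bounds.
 */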
3261 static ctl_table ipv4_route_table[] = {
3262         {
3263                 .procname       = "gc_thresh",
3264                 .data           = &ipv4_dst_ops.gc_thresh,
3265                 .maxlen         = sizeof(int),
3266                 .mode           = 0644,
3267                 .proc_handler   = proc_dointvec,
3268         },
3269         {
3270                 .procname       = "max_size",
3271                 .data           = &ip_rt_max_size,
3272                 .maxlen         = sizeof(int),
3273                 .mode           = 0644,
3274                 .proc_handler   = proc_dointvec,
3275         },
3276         {
3277                 /*  Deprecated. Use gc_min_interval_ms */
3278
3279                 .procname       = "gc_min_interval",
3280                 .data           = &ip_rt_gc_min_interval,
3281                 .maxlen         = sizeof(int),
3282                 .mode           = 0644,
3283                 .proc_handler   = proc_dointvec_jiffies,
3284         },
3285         {
3286                 .procname       = "gc_min_interval_ms",
3287                 .data           = &ip_rt_gc_min_interval,
3288                 .maxlen         = sizeof(int),
3289                 .mode           = 0644,
3290                 .proc_handler   = proc_dointvec_ms_jiffies,
3291         },
3292         {
3293                 .procname       = "gc_timeout",
3294                 .data           = &ip_rt_gc_timeout,
3295                 .maxlen         = sizeof(int),
3296                 .mode           = 0644,
3297                 .proc_handler   = proc_dointvec_jiffies,
3298         },
3299         {
3300                 .procname       = "gc_interval",
3301                 .data           = &ip_rt_gc_interval,
3302                 .maxlen         = sizeof(int),
3303                 .mode           = 0644,
3304                 .proc_handler   = proc_dointvec_jiffies,
3305         },
3306         {
3307                 .procname       = "redirect_load",
3308                 .data           = &ip_rt_redirect_load,
3309                 .maxlen         = sizeof(int),
3310                 .mode           = 0644,
3311                 .proc_handler   = proc_dointvec,
3312         },
3313         {
3314                 .procname       = "redirect_number",
3315                 .data           = &ip_rt_redirect_number,
3316                 .maxlen         = sizeof(int),
3317                 .mode           = 0644,
3318                 .proc_handler   = proc_dointvec,
3319         },
3320         {
3321                 .procname       = "redirect_silence",
3322                 .data           = &ip_rt_redirect_silence,
3323                 .maxlen         = sizeof(int),
3324                 .mode           = 0644,
3325                 .proc_handler   = proc_dointvec,
3326         },
3327         {
3328                 .procname       = "error_cost",
3329                 .data           = &ip_rt_error_cost,
3330                 .maxlen         = sizeof(int),
3331                 .mode           = 0644,
3332                 .proc_handler   = proc_dointvec,
3333         },
3334         {
3335                 .procname       = "error_burst",
3336                 .data           = &ip_rt_error_burst,
3337                 .maxlen         = sizeof(int),
3338                 .mode           = 0644,
3339                 .proc_handler   = proc_dointvec,
3340         },
3341         {
3342                 .procname       = "gc_elasticity",
3343                 .data           = &ip_rt_gc_elasticity,
3344                 .maxlen         = sizeof(int),
3345                 .mode           = 0644,
3346                 .proc_handler   = proc_dointvec,
3347         },
3348         {
3349                 .procname       = "mtu_expires",
3350                 .data           = &ip_rt_mtu_expires,
3351                 .maxlen         = sizeof(int),
3352                 .mode           = 0644,
3353                 .proc_handler   = proc_dointvec_jiffies,
3354         },
3355         {
3356                 .procname       = "min_pmtu",
3357                 .data           = &ip_rt_min_pmtu,
3358                 .maxlen         = sizeof(int),
3359                 .mode           = 0644,
3360                 .proc_handler   = proc_dointvec,
3361         },
3362         {
3363                 .procname       = "min_adv_mss",
3364                 .data           = &ip_rt_min_advmss,
3365                 .maxlen         = sizeof(int),
3366                 .mode           = 0644,
3367                 .proc_handler   = proc_dointvec,
3368         },
3369         { }
3370 };
3371
3372 static struct ctl_table empty[1];
3373
3374 static struct ctl_table ipv4_skeleton[] =
3375 {
3376         { .procname = "route",
3377           .mode = 0555, .child = ipv4_route_table},
3378         { .procname = "neigh",
3379           .mode = 0555, .child = empty},
3380         { }
3381 };
3382
3383 static __net_initdata struct ctl_path ipv4_path[] = {
3384         { .procname = "net", },
3385         { .procname = "ipv4", },
3386         { },
3387 };
3388
3389 static struct ctl_table ipv4_route_flush_table[] = {
3390         {
3391                 .procname       = "flush",
3392                 .maxlen         = sizeof(int),
3393                 .mode           = 0200,
3394                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3395         },
3396         { },
3397 };
3398
3399 static __net_initdata struct ctl_path ipv4_route_path[] = {
3400         { .procname = "net", },
3401         { .procname = "ipv4", },
3402         { .procname = "route", },
3403         { },
3404 };
3405
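/*
 * Per-namespace registration of the "flush" sysctl.  init_net uses the
 * static table directly; other namespaces get a kmemdup()ed copy, with
 * extra1 pointing at the owning struct net for the handler.
 */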
3406 static __net_init int sysctl_route_net_init(struct net *net)
3407 {
3408         struct ctl_table *tbl;
3409
3410         tbl = ipv4_route_flush_table;
3411         if (!net_eq(net, &init_net)) {
3412                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3413                 if (tbl == NULL)
3414                         goto err_dup;
3415         }
3416         tbl[0].extra1 = net;
3417
3418         net->ipv4.route_hdr =
3419                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3420         if (net->ipv4.route_hdr == NULL)
3421                 goto err_reg;
3422         return 0;
3423
3424 err_reg:
3425         if (tbl != ipv4_route_flush_table)
3426                 kfree(tbl);
3427 err_dup:
3428         return -ENOMEM;
3429 }
3430
3431 static __net_exit void sysctl_route_net_exit(struct net *net)
3432 {
3433         struct ctl_table *tbl;
3434
3435         tbl = net->ipv4.route_hdr->ctl_table_arg;
3436         unregister_net_sysctl_table(net->ipv4.route_hdr);
3437         BUG_ON(tbl == ipv4_route_flush_table);
3438         kfree(tbl);
3439 }
3440
3441 static __net_initdata struct pernet_operations sysctl_route_ops = {
3442         .init = sysctl_route_net_init,
3443         .exit = sysctl_route_net_exit,
3444 };
3445 #endif
3446
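/*
 * Seed the per-namespace route and device-address generation counters
 * with random values; bumping rt_genid later marks all cached routes
 * for that namespace as expired.
 */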
3447 static __net_init int rt_genid_init(struct net *net)
3448 {
3449         get_random_bytes(&net->ipv4.rt_genid,
3450                          sizeof(net->ipv4.rt_genid));
3451         get_random_bytes(&net->ipv4.dev_addr_genid,
3452                          sizeof(net->ipv4.dev_addr_genid));
3453         return 0;
3454 }
3455
3456 static __net_initdata struct pernet_operations rt_genid_ops = {
3457         .init = rt_genid_init,
3458 };
3459
3460
3461 #ifdef CONFIG_IP_ROUTE_CLASSID
3462 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3463 #endif /* CONFIG_IP_ROUTE_CLASSID */
3464
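/* "rhash_entries=" boot parameter: override the auto-sized route cache hash. */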
3465 static __initdata unsigned long rhash_entries;
3466 static int __init set_rhash_entries(char *str)
3467 {
3468         if (!str)
3469                 return 0;
3470         rhash_entries = simple_strtoul(str, &str, 0);
3471         return 1;
3472 }
3473 __setup("rhash_entries=", set_rhash_entries);
3474
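/*
 * Boot-time initialisation: allocate the ip_idents array and percpu
 * accounting, create the dst slab caches, size the route cache hash,
 * schedule the deferrable expiry worker and register the proc, xfrm,
 * rtnetlink and sysctl hooks.
 */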
3475 int __init ip_rt_init(void)
3476 {
3477         int rc = 0;
3478
3479         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3480         if (!ip_idents)
3481                 panic("IP: failed to allocate ip_idents\n");
3482
3483         get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3484
3485 #ifdef CONFIG_IP_ROUTE_CLASSID
3486         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3487         if (!ip_rt_acct)
3488                 panic("IP: failed to allocate ip_rt_acct\n");
3489 #endif
3490
3491         ipv4_dst_ops.kmem_cachep =
3492                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3493                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3494
3495         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3496
3497         if (dst_entries_init(&ipv4_dst_ops) < 0)
3498                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3499
3500         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3501                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3502
3503         rt_hash_table = (struct rt_hash_bucket *)
3504                 alloc_large_system_hash("IP route cache",
3505                                         sizeof(struct rt_hash_bucket),
3506                                         rhash_entries,
3507                                         (totalram_pages >= 128 * 1024) ?
3508                                         15 : 17,
3509                                         0,
3510                                         &rt_hash_log,
3511                                         &rt_hash_mask,
3512                                         rhash_entries ? 0 : 512 * 1024);
3513         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3514         rt_hash_lock_init();
3515
3516         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3517         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3518
3519         devinet_init();
3520         ip_fib_init();
3521
3522         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3523         expires_ljiffies = jiffies;
3524         schedule_delayed_work(&expires_work,
3525                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3526
3527         if (ip_rt_proc_init())
3528                 printk(KERN_ERR "Unable to create route proc files\n");
3529 #ifdef CONFIG_XFRM
3530         xfrm_init();
3531         xfrm4_init(ip_rt_max_size);
3532 #endif
3533         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3534
3535 #ifdef CONFIG_SYSCTL
3536         register_pernet_subsys(&sysctl_route_ops);
3537 #endif
3538         register_pernet_subsys(&rt_genid_ops);
3539         return rc;
3540 }
3541
3542 #ifdef CONFIG_SYSCTL
3543 /*
3544  * We really need to sanitize the damn ipv4 init order, then all
3545  * this nonsense will go away.
3546  */
3547 void __init ip_static_sysctl_init(void)
3548 {
3549         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3550 }
3551 #endif