net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD;
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/atmclip.h>
113 #include <net/secure_seq.h>
114
115 #define RT_FL_TOS(oldflp4) \
116         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117
118 #define IP_MAX_MTU      0xFFF0
119
120 #define RT_GC_TIMEOUT (300*HZ)
121
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
125 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
126 static int ip_rt_redirect_number __read_mostly  = 9;
127 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
128 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost __read_mostly       = HZ;
130 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
131 static int ip_rt_gc_elasticity __read_mostly    = 8;
132 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
134 static int ip_rt_min_advmss __read_mostly       = 256;
135 static int rt_chain_length_max __read_mostly    = 20;
136 static int redirect_genid;
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
148 static void              ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154 static void __rt_garbage_collect(struct work_struct *w);
155 static DECLARE_WORK(rt_gc_worker, __rt_garbage_collect);
156
157 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
158                             int how)
159 {
160 }
161
162 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
163 {
164         struct rtable *rt = (struct rtable *) dst;
165         struct inet_peer *peer;
166         u32 *p = NULL;
167
168         if (!rt->peer)
169                 rt_bind_peer(rt, rt->rt_dst, 1);
170
171         peer = rt->peer;
172         if (peer) {
173                 u32 *old_p = __DST_METRICS_PTR(old);
174                 unsigned long prev, new;
175
176                 p = peer->metrics;
177                 if (inet_metrics_new(peer))
178                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
179
180                 new = (unsigned long) p;
181                 prev = cmpxchg(&dst->_metrics, old, new);
182
183                 if (prev != old) {
184                         p = __DST_METRICS_PTR(prev);
185                         if (prev & DST_METRICS_READ_ONLY)
186                                 p = NULL;
187                 } else {
188                         if (rt->fi) {
189                                 fib_info_put(rt->fi);
190                                 rt->fi = NULL;
191                         }
192                 }
193         }
194         return p;
195 }
196
197 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
198
199 static struct dst_ops ipv4_dst_ops = {
200         .family =               AF_INET,
201         .protocol =             cpu_to_be16(ETH_P_IP),
202         .gc =                   rt_garbage_collect,
203         .check =                ipv4_dst_check,
204         .default_advmss =       ipv4_default_advmss,
205         .mtu =                  ipv4_mtu,
206         .cow_metrics =          ipv4_cow_metrics,
207         .destroy =              ipv4_dst_destroy,
208         .ifdown =               ipv4_dst_ifdown,
209         .negative_advice =      ipv4_negative_advice,
210         .link_failure =         ipv4_link_failure,
211         .update_pmtu =          ip_rt_update_pmtu,
212         .local_out =            __ip_local_out,
213         .neigh_lookup =         ipv4_neigh_lookup,
214 };
215
216 #define ECN_OR_COST(class)      TC_PRIO_##class
217
218 const __u8 ip_tos2prio[16] = {
219         TC_PRIO_BESTEFFORT,
220         ECN_OR_COST(BESTEFFORT),
221         TC_PRIO_BESTEFFORT,
222         ECN_OR_COST(BESTEFFORT),
223         TC_PRIO_BULK,
224         ECN_OR_COST(BULK),
225         TC_PRIO_BULK,
226         ECN_OR_COST(BULK),
227         TC_PRIO_INTERACTIVE,
228         ECN_OR_COST(INTERACTIVE),
229         TC_PRIO_INTERACTIVE,
230         ECN_OR_COST(INTERACTIVE),
231         TC_PRIO_INTERACTIVE_BULK,
232         ECN_OR_COST(INTERACTIVE_BULK),
233         TC_PRIO_INTERACTIVE_BULK,
234         ECN_OR_COST(INTERACTIVE_BULK)
235 };
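/*
 * This table is indexed with the IP TOS field shifted right by one (see
 * rt_tos2priority() in include/net/route.h), so the four TOS bits select
 * one of the 16 entries.  For example, a TOS of 0x10 (IPTOS_LOWDELAY)
 * indexes entry 8 and maps to TC_PRIO_INTERACTIVE; the low bit of the
 * index picks the ECN_OR_COST() variant of the same class.
 */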
236
237
238 /*
239  * Route cache.
240  */
241
242 /* The locking scheme is rather straightforward:
243  *
244  * 1) Read-Copy Update protects the buckets of the central route hash.
245  * 2) Only writers remove entries, and they hold the lock
246  *    as they look at rtable reference counts.
247  * 3) Only readers acquire references to rtable entries,
248  *    they do so with atomic increments and with the
249  *    lock held.
250  */
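/*
 * Concretely: lookups walk a bucket chain under rcu_read_lock_bh() and
 * take references with atomic increments (dst_use()), while insertions
 * and removals serialize on the per-bucket spinlock from
 * rt_hash_lock_addr() and publish their updates with rcu_assign_pointer().
 */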
251
252 struct rt_hash_bucket {
253         struct rtable __rcu     *chain;
254 };
255
256 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
257         defined(CONFIG_PROVE_LOCKING)
258 /*
259  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
260  * The size of this table is a power of two and depends on the number of CPUs.
261  * (With lockdep we have a quite big spinlock_t, so keep the size down there.)
262  */
263 #ifdef CONFIG_LOCKDEP
264 # define RT_HASH_LOCK_SZ        256
265 #else
266 # if NR_CPUS >= 32
267 #  define RT_HASH_LOCK_SZ       4096
268 # elif NR_CPUS >= 16
269 #  define RT_HASH_LOCK_SZ       2048
270 # elif NR_CPUS >= 8
271 #  define RT_HASH_LOCK_SZ       1024
272 # elif NR_CPUS >= 4
273 #  define RT_HASH_LOCK_SZ       512
274 # else
275 #  define RT_HASH_LOCK_SZ       256
276 # endif
277 #endif
278
279 static spinlock_t       *rt_hash_locks;
280 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
281
282 static __init void rt_hash_lock_init(void)
283 {
284         int i;
285
286         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
287                         GFP_KERNEL);
288         if (!rt_hash_locks)
289                 panic("IP: failed to allocate rt_hash_locks\n");
290
291         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
292                 spin_lock_init(&rt_hash_locks[i]);
293 }
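/*
 * A bucket index maps onto a lock slot by masking with RT_HASH_LOCK_SZ - 1,
 * so several hash buckets share one spinlock; e.g. with RT_HASH_LOCK_SZ of
 * 256, buckets 5, 261, 517, ... all serialize on rt_hash_locks[5].
 */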
294 #else
295 # define rt_hash_lock_addr(slot) NULL
296
297 static inline void rt_hash_lock_init(void)
298 {
299 }
300 #endif
301
302 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
303 static unsigned                 rt_hash_mask __read_mostly;
304 static unsigned int             rt_hash_log  __read_mostly;
305
306 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
307 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
308
309 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
310                                    int genid)
311 {
312         return jhash_3words((__force u32)daddr, (__force u32)saddr,
313                             idx, genid)
314                 & rt_hash_mask;
315 }
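/*
 * The per-namespace generation id is folded into the hash as the jhash
 * initval, so after rt_cache_invalidate() bumps rt_genid, stale entries
 * no longer match new lookups and also fail the rt_is_expired() check,
 * allowing them to be reaped lazily.
 */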
316
317 static inline int rt_genid(struct net *net)
318 {
319         return atomic_read(&net->ipv4.rt_genid);
320 }
321
322 #ifdef CONFIG_PROC_FS
323 struct rt_cache_iter_state {
324         struct seq_net_private p;
325         int bucket;
326         int genid;
327 };
328
329 static struct rtable *rt_cache_get_first(struct seq_file *seq)
330 {
331         struct rt_cache_iter_state *st = seq->private;
332         struct rtable *r = NULL;
333
334         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
335                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
336                         continue;
337                 rcu_read_lock_bh();
338                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
339                 while (r) {
340                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
341                             r->rt_genid == st->genid)
342                                 return r;
343                         r = rcu_dereference_bh(r->dst.rt_next);
344                 }
345                 rcu_read_unlock_bh();
346         }
347         return r;
348 }
349
350 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
351                                           struct rtable *r)
352 {
353         struct rt_cache_iter_state *st = seq->private;
354
355         r = rcu_dereference_bh(r->dst.rt_next);
356         while (!r) {
357                 rcu_read_unlock_bh();
358                 do {
359                         if (--st->bucket < 0)
360                                 return NULL;
361                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
362                 rcu_read_lock_bh();
363                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
364         }
365         return r;
366 }
367
368 static struct rtable *rt_cache_get_next(struct seq_file *seq,
369                                         struct rtable *r)
370 {
371         struct rt_cache_iter_state *st = seq->private;
372         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
373                 if (dev_net(r->dst.dev) != seq_file_net(seq))
374                         continue;
375                 if (r->rt_genid == st->genid)
376                         break;
377         }
378         return r;
379 }
380
381 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
382 {
383         struct rtable *r = rt_cache_get_first(seq);
384
385         if (r)
386                 while (pos && (r = rt_cache_get_next(seq, r)))
387                         --pos;
388         return pos ? NULL : r;
389 }
390
391 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
392 {
393         struct rt_cache_iter_state *st = seq->private;
394         if (*pos)
395                 return rt_cache_get_idx(seq, *pos - 1);
396         st->genid = rt_genid(seq_file_net(seq));
397         return SEQ_START_TOKEN;
398 }
399
400 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
401 {
402         struct rtable *r;
403
404         if (v == SEQ_START_TOKEN)
405                 r = rt_cache_get_first(seq);
406         else
407                 r = rt_cache_get_next(seq, v);
408         ++*pos;
409         return r;
410 }
411
412 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
413 {
414         if (v && v != SEQ_START_TOKEN)
415                 rcu_read_unlock_bh();
416 }
417
418 static int rt_cache_seq_show(struct seq_file *seq, void *v)
419 {
420         if (v == SEQ_START_TOKEN)
421                 seq_printf(seq, "%-127s\n",
422                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
423                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
424                            "HHUptod\tSpecDst");
425         else {
426                 struct rtable *r = v;
427                 struct neighbour *n;
428                 int len, HHUptod;
429
430                 rcu_read_lock();
431                 n = dst_get_neighbour(&r->dst);
432                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
433                 rcu_read_unlock();
434
435                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
436                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
437                         r->dst.dev ? r->dst.dev->name : "*",
438                         (__force u32)r->rt_dst,
439                         (__force u32)r->rt_gateway,
440                         r->rt_flags, atomic_read(&r->dst.__refcnt),
441                         r->dst.__use, 0, (__force u32)r->rt_src,
442                         dst_metric_advmss(&r->dst) + 40,
443                         dst_metric(&r->dst, RTAX_WINDOW),
444                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
445                               dst_metric(&r->dst, RTAX_RTTVAR)),
446                         r->rt_key_tos,
447                         -1,
448                         HHUptod,
449                         r->rt_spec_dst, &len);
450
451                 seq_printf(seq, "%*s\n", 127 - len, "");
452         }
453         return 0;
454 }
455
456 static const struct seq_operations rt_cache_seq_ops = {
457         .start  = rt_cache_seq_start,
458         .next   = rt_cache_seq_next,
459         .stop   = rt_cache_seq_stop,
460         .show   = rt_cache_seq_show,
461 };
462
463 static int rt_cache_seq_open(struct inode *inode, struct file *file)
464 {
465         return seq_open_net(inode, file, &rt_cache_seq_ops,
466                         sizeof(struct rt_cache_iter_state));
467 }
468
469 static const struct file_operations rt_cache_seq_fops = {
470         .owner   = THIS_MODULE,
471         .open    = rt_cache_seq_open,
472         .read    = seq_read,
473         .llseek  = seq_lseek,
474         .release = seq_release_net,
475 };
476
477
478 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
479 {
480         int cpu;
481
482         if (*pos == 0)
483                 return SEQ_START_TOKEN;
484
485         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
486                 if (!cpu_possible(cpu))
487                         continue;
488                 *pos = cpu+1;
489                 return &per_cpu(rt_cache_stat, cpu);
490         }
491         return NULL;
492 }
493
494 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
495 {
496         int cpu;
497
498         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
499                 if (!cpu_possible(cpu))
500                         continue;
501                 *pos = cpu+1;
502                 return &per_cpu(rt_cache_stat, cpu);
503         }
504         return NULL;
505
506 }
507
508 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
509 {
510
511 }
512
513 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
514 {
515         struct rt_cache_stat *st = v;
516
517         if (v == SEQ_START_TOKEN) {
518                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
519                 return 0;
520         }
521
522         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
523                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
524                    dst_entries_get_slow(&ipv4_dst_ops),
525                    st->in_hit,
526                    st->in_slow_tot,
527                    st->in_slow_mc,
528                    st->in_no_route,
529                    st->in_brd,
530                    st->in_martian_dst,
531                    st->in_martian_src,
532
533                    st->out_hit,
534                    st->out_slow_tot,
535                    st->out_slow_mc,
536
537                    st->gc_total,
538                    st->gc_ignored,
539                    st->gc_goal_miss,
540                    st->gc_dst_overflow,
541                    st->in_hlist_search,
542                    st->out_hlist_search
543                 );
544         return 0;
545 }
546
547 static const struct seq_operations rt_cpu_seq_ops = {
548         .start  = rt_cpu_seq_start,
549         .next   = rt_cpu_seq_next,
550         .stop   = rt_cpu_seq_stop,
551         .show   = rt_cpu_seq_show,
552 };
553
554
555 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
556 {
557         return seq_open(file, &rt_cpu_seq_ops);
558 }
559
560 static const struct file_operations rt_cpu_seq_fops = {
561         .owner   = THIS_MODULE,
562         .open    = rt_cpu_seq_open,
563         .read    = seq_read,
564         .llseek  = seq_lseek,
565         .release = seq_release,
566 };
567
568 #ifdef CONFIG_IP_ROUTE_CLASSID
569 static int rt_acct_proc_show(struct seq_file *m, void *v)
570 {
571         struct ip_rt_acct *dst, *src;
572         unsigned int i, j;
573
574         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
575         if (!dst)
576                 return -ENOMEM;
577
578         for_each_possible_cpu(i) {
579                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
580                 for (j = 0; j < 256; j++) {
581                         dst[j].o_bytes   += src[j].o_bytes;
582                         dst[j].o_packets += src[j].o_packets;
583                         dst[j].i_bytes   += src[j].i_bytes;
584                         dst[j].i_packets += src[j].i_packets;
585                 }
586         }
587
588         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
589         kfree(dst);
590         return 0;
591 }
592
593 static int rt_acct_proc_open(struct inode *inode, struct file *file)
594 {
595         return single_open(file, rt_acct_proc_show, NULL);
596 }
597
598 static const struct file_operations rt_acct_proc_fops = {
599         .owner          = THIS_MODULE,
600         .open           = rt_acct_proc_open,
601         .read           = seq_read,
602         .llseek         = seq_lseek,
603         .release        = single_release,
604 };
605 #endif
606
607 static int __net_init ip_rt_do_proc_init(struct net *net)
608 {
609         struct proc_dir_entry *pde;
610
611         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
612                         &rt_cache_seq_fops);
613         if (!pde)
614                 goto err1;
615
616         pde = proc_create("rt_cache", S_IRUGO,
617                           net->proc_net_stat, &rt_cpu_seq_fops);
618         if (!pde)
619                 goto err2;
620
621 #ifdef CONFIG_IP_ROUTE_CLASSID
622         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
623         if (!pde)
624                 goto err3;
625 #endif
626         return 0;
627
628 #ifdef CONFIG_IP_ROUTE_CLASSID
629 err3:
630         remove_proc_entry("rt_cache", net->proc_net_stat);
631 #endif
632 err2:
633         remove_proc_entry("rt_cache", net->proc_net);
634 err1:
635         return -ENOMEM;
636 }
637
638 static void __net_exit ip_rt_do_proc_exit(struct net *net)
639 {
640         remove_proc_entry("rt_cache", net->proc_net_stat);
641         remove_proc_entry("rt_cache", net->proc_net);
642 #ifdef CONFIG_IP_ROUTE_CLASSID
643         remove_proc_entry("rt_acct", net->proc_net);
644 #endif
645 }
646
647 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
648         .init = ip_rt_do_proc_init,
649         .exit = ip_rt_do_proc_exit,
650 };
651
652 static int __init ip_rt_proc_init(void)
653 {
654         return register_pernet_subsys(&ip_rt_proc_ops);
655 }
656
657 #else
658 static inline int ip_rt_proc_init(void)
659 {
660         return 0;
661 }
662 #endif /* CONFIG_PROC_FS */
663
664 static inline void rt_free(struct rtable *rt)
665 {
666         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
667 }
668
669 static inline void rt_drop(struct rtable *rt)
670 {
671         ip_rt_put(rt);
672         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
673 }
674
675 static inline int rt_fast_clean(struct rtable *rth)
676 {
677         /* Kill broadcast/multicast entries very aggressively, if they
678            collide in the hash table with more useful entries */
679         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
680                 rt_is_input_route(rth) && rth->dst.rt_next;
681 }
682
683 static inline int rt_valuable(struct rtable *rth)
684 {
685         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
686                 (rth->peer && rth->peer->pmtu_expires);
687 }
688
689 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
690 {
691         unsigned long age;
692         int ret = 0;
693
694         if (atomic_read(&rth->dst.__refcnt))
695                 goto out;
696
697         age = jiffies - rth->dst.lastuse;
698         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
699             (age <= tmo2 && rt_valuable(rth)))
700                 goto out;
701         ret = 1;
702 out:    return ret;
703 }
704
705 /* Bits of score are:
706  * 31: very valuable
707  * 30: not quite useless
708  * 29..0: usage counter
709  */
710 static inline u32 rt_score(struct rtable *rt)
711 {
712         u32 score = jiffies - rt->dst.lastuse;
713
714         score = ~score & ~(3<<30);
715
716         if (rt_valuable(rt))
717                 score |= (1<<31);
718
719         if (rt_is_output_route(rt) ||
720             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
721                 score |= (1<<30);
722
723         return score;
724 }
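/*
 * Example: an unreferenced broadcast entry on the input path that has been
 * idle for a long time gets neither bit 31 nor bit 30 and a small usage
 * counter, i.e. the lowest score; rt_intern_hash() evicts the entry with
 * the minimum score first when a chain grows past the elasticity limit.
 */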
725
726 static inline bool rt_caching(const struct net *net)
727 {
728         return net->ipv4.current_rt_cache_rebuild_count <=
729                 net->ipv4.sysctl_rt_cache_rebuild_count;
730 }
731
732 static inline bool compare_hash_inputs(const struct rtable *rt1,
733                                        const struct rtable *rt2)
734 {
735         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
736                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
737                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
738 }
739
740 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
741 {
742         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
743                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
744                 (rt1->rt_mark ^ rt2->rt_mark) |
745                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
746                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
747                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
748 }
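/*
 * Both helpers are branch-free: each pair of fields is XORed and the
 * results ORed together, so the value is zero only when every field
 * matches.  compare_hash_inputs() looks only at source, destination and
 * input interface; compare_keys() is the full key match and also covers
 * the mark, TOS and output interface.
 */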
749
750 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
751 {
752         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
753 }
754
755 static inline int rt_is_expired(struct rtable *rth)
756 {
757         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
758 }
759
760 /*
761  * Perform a full scan of the hash table and free all entries.
762  * Can be called by a softirq or a process.
763  * In the latter case, we want to be rescheduled if necessary.
764  */
765 static void rt_do_flush(struct net *net, int process_context)
766 {
767         unsigned int i;
768         struct rtable *rth, *next;
769
770         for (i = 0; i <= rt_hash_mask; i++) {
771                 struct rtable __rcu **pprev;
772                 struct rtable *list;
773
774                 if (process_context && need_resched())
775                         cond_resched();
776                 rth = rcu_access_pointer(rt_hash_table[i].chain);
777                 if (!rth)
778                         continue;
779
780                 spin_lock_bh(rt_hash_lock_addr(i));
781
782                 list = NULL;
783                 pprev = &rt_hash_table[i].chain;
784                 rth = rcu_dereference_protected(*pprev,
785                         lockdep_is_held(rt_hash_lock_addr(i)));
786
787                 while (rth) {
788                         next = rcu_dereference_protected(rth->dst.rt_next,
789                                 lockdep_is_held(rt_hash_lock_addr(i)));
790
791                         if (!net ||
792                             net_eq(dev_net(rth->dst.dev), net)) {
793                                 rcu_assign_pointer(*pprev, next);
794                                 rcu_assign_pointer(rth->dst.rt_next, list);
795                                 list = rth;
796                         } else {
797                                 pprev = &rth->dst.rt_next;
798                         }
799                         rth = next;
800                 }
801
802                 spin_unlock_bh(rt_hash_lock_addr(i));
803
804                 for (; list; list = next) {
805                         next = rcu_dereference_protected(list->dst.rt_next, 1);
806                         rt_free(list);
807                 }
808         }
809 }
810
811 /*
812  * While freeing expired entries, we compute average chain length
813  * and standard deviation, using fixed-point arithmetic.
814  * This is to have an estimation of rt_chain_length_max:
815  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
816  * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
817  */
818
819 #define FRACT_BITS 3
820 #define ONE (1UL << FRACT_BITS)
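/*
 * Worked example: with FRACT_BITS == 3, lengths are accumulated in units
 * of 1/8.  An average chain length of 3.0 (avg == 24) with a standard
 * deviation of 1.0 (sd == 8) gives (24 + 4 * 8) >> 3 == 7, so
 * rt_chain_length_max becomes max(ip_rt_gc_elasticity, 7).
 */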
821
822 /*
823  * Given a hash chain and an item in this hash chain,
824  * find if a previous entry has the same hash_inputs
825  * (but differs on tos, mark or oif)
826  * Returns 0 if an alias is found.
827  * Returns ONE if rth has no alias before itself.
828  */
829 static int has_noalias(const struct rtable *head, const struct rtable *rth)
830 {
831         const struct rtable *aux = head;
832
833         while (aux != rth) {
834                 if (compare_hash_inputs(aux, rth))
835                         return 0;
836                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
837         }
838         return ONE;
839 }
840
841 static void rt_check_expire(void)
842 {
843         static unsigned int rover;
844         unsigned int i = rover, goal;
845         struct rtable *rth;
846         struct rtable __rcu **rthp;
847         unsigned long samples = 0;
848         unsigned long sum = 0, sum2 = 0;
849         unsigned long delta;
850         u64 mult;
851
852         delta = jiffies - expires_ljiffies;
853         expires_ljiffies = jiffies;
854         mult = ((u64)delta) << rt_hash_log;
855         if (ip_rt_gc_timeout > 1)
856                 do_div(mult, ip_rt_gc_timeout);
857         goal = (unsigned int)mult;
858         if (goal > rt_hash_mask)
859                 goal = rt_hash_mask + 1;
860         for (; goal > 0; goal--) {
861                 unsigned long tmo = ip_rt_gc_timeout;
862                 unsigned long length;
863
864                 i = (i + 1) & rt_hash_mask;
865                 rthp = &rt_hash_table[i].chain;
866
867                 if (need_resched())
868                         cond_resched();
869
870                 samples++;
871
872                 if (rcu_dereference_raw(*rthp) == NULL)
873                         continue;
874                 length = 0;
875                 spin_lock_bh(rt_hash_lock_addr(i));
876                 while ((rth = rcu_dereference_protected(*rthp,
877                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
878                         prefetch(rth->dst.rt_next);
879                         if (rt_is_expired(rth)) {
880                                 *rthp = rth->dst.rt_next;
881                                 rt_free(rth);
882                                 continue;
883                         }
884                         if (rth->dst.expires) {
885                                 /* Entry is expired even if it is in use */
886                                 if (time_before_eq(jiffies, rth->dst.expires)) {
887 nofree:
888                                         tmo >>= 1;
889                                         rthp = &rth->dst.rt_next;
890                                         /*
891                                          * We only count entries on
892                                           * a chain with equal hash inputs once,
893                                           * so that entries for different QOS
894                                           * levels and other non-hash-input
895                                           * attributes don't unfairly skew
896                                           * the length computation.
897                                          */
898                                         length += has_noalias(rt_hash_table[i].chain, rth);
899                                         continue;
900                                 }
901                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
902                                 goto nofree;
903
904                         /* Cleanup aged off entries. */
905                         *rthp = rth->dst.rt_next;
906                         rt_free(rth);
907                 }
908                 spin_unlock_bh(rt_hash_lock_addr(i));
909                 sum += length;
910                 sum2 += length*length;
911         }
912         if (samples) {
913                 unsigned long avg = sum / samples;
914                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
915                 rt_chain_length_max = max_t(unsigned long,
916                                         ip_rt_gc_elasticity,
917                                         (avg + 4*sd) >> FRACT_BITS);
918         }
919         rover = i;
920 }
921
922 /*
923  * rt_worker_func() is run in process context.
924  * We call rt_check_expire() to scan part of the hash table.
925  */
926 static void rt_worker_func(struct work_struct *work)
927 {
928         rt_check_expire();
929         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
930 }
931
932 /*
933  * Perturbation of rt_genid by a small quantity [1..256].
934  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
935  * many times (2^24) without reusing a recent rt_genid.
936  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
937  */
938 static void rt_cache_invalidate(struct net *net)
939 {
940         unsigned char shuffle;
941
942         get_random_bytes(&shuffle, sizeof(shuffle));
943         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
944         redirect_genid++;
945         inetpeer_invalidate_tree(AF_INET);
946 }
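/*
 * Each invalidation adds at most 256 to the 32-bit rt_genid, so the
 * counter cannot wrap back onto a recent value in fewer than
 * 2^32 / 256 == 2^24 flushes, which is the bound mentioned above.
 */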
947
948 /*
949  * delay < 0  : invalidate cache (fast : entries will be deleted later)
950  * delay >= 0 : invalidate & flush cache (can be long)
951  */
952 void rt_cache_flush(struct net *net, int delay)
953 {
954         rt_cache_invalidate(net);
955         if (delay >= 0)
956                 rt_do_flush(net, !in_softirq());
957 }
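/*
 * E.g. rt_cache_flush(net, -1) only bumps the generation id, leaving stale
 * entries to be reaped lazily, while rt_cache_flush(net, 0) additionally
 * walks the whole hash table and frees matching entries right away.
 */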
958
959 /* Flush previous cache invalidated entries from the cache */
960 void rt_cache_flush_batch(struct net *net)
961 {
962         rt_do_flush(net, !in_softirq());
963 }
964
965 static void rt_emergency_hash_rebuild(struct net *net)
966 {
967         if (net_ratelimit())
968                 printk(KERN_WARNING "Route hash chain too long!\n");
969         rt_cache_invalidate(net);
970 }
971
972 /*
973    Short description of GC goals.
974
975    We want to build an algorithm which will keep the routing cache
976    at some equilibrium point, where the number of aged-off entries
977    is kept approximately equal to the number of newly generated ones.
978
979    The current expiration strength is the variable "expire".
980    We try to adjust it dynamically, so that when networking
981    is idle, expire is large enough to keep enough warm entries,
982    and when load increases, it is reduced to limit the cache size.
983  */
984
985 static void __do_rt_garbage_collect(int elasticity, int min_interval)
986 {
987         static unsigned long expire = RT_GC_TIMEOUT;
988         static unsigned long last_gc;
989         static int rover;
990         static int equilibrium;
991         static DEFINE_SPINLOCK(rt_gc_lock);
992         struct rtable *rth;
993         struct rtable __rcu **rthp;
994         unsigned long now = jiffies;
995         int goal;
996         int entries = dst_entries_get_fast(&ipv4_dst_ops);
997
998         /*
999          * Garbage collection is pretty expensive,
1000          * do not run it too frequently.
1001          */
1002
1003         spin_lock_bh(&rt_gc_lock);
1004
1005         RT_CACHE_STAT_INC(gc_total);
1006
1007         if (now - last_gc < min_interval &&
1008             entries < ip_rt_max_size) {
1009                 RT_CACHE_STAT_INC(gc_ignored);
1010                 goto out;
1011         }
1012
1013         entries = dst_entries_get_slow(&ipv4_dst_ops);
1014         /* Calculate the number of entries that we want to expire now. */
1015         goal = entries - (elasticity << rt_hash_log);
1016         if (goal <= 0) {
1017                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1018                         equilibrium = ipv4_dst_ops.gc_thresh;
1019                 goal = entries - equilibrium;
1020                 if (goal > 0) {
1021                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1022                         goal = entries - equilibrium;
1023                 }
1024         } else {
1025                 /* We are in a dangerous area. Try to reduce the cache really
1026                  * aggressively.
1027                  */
1028                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1029                 equilibrium = entries - goal;
1030         }
1031
1032         if (now - last_gc >= min_interval)
1033                 last_gc = now;
1034
1035         if (goal <= 0) {
1036                 equilibrium += goal;
1037                 goto work_done;
1038         }
1039
1040         do {
1041                 int i, k;
1042
1043                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1044                         unsigned long tmo = expire;
1045
1046                         k = (k + 1) & rt_hash_mask;
1047                         rthp = &rt_hash_table[k].chain;
1048                         spin_lock_bh(rt_hash_lock_addr(k));
1049                         while ((rth = rcu_dereference_protected(*rthp,
1050                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1051                                 if (!rt_is_expired(rth) &&
1052                                         !rt_may_expire(rth, tmo, expire)) {
1053                                         tmo >>= 1;
1054                                         rthp = &rth->dst.rt_next;
1055                                         continue;
1056                                 }
1057                                 *rthp = rth->dst.rt_next;
1058                                 rt_free(rth);
1059                                 goal--;
1060                         }
1061                         spin_unlock_bh(rt_hash_lock_addr(k));
1062                         if (goal <= 0)
1063                                 break;
1064                 }
1065                 rover = k;
1066
1067                 if (goal <= 0)
1068                         goto work_done;
1069
1070                 /* The goal was not achieved. We stop the process if:
1071
1072                    - expire has been reduced to zero; otherwise, expire is halved.
1073                    - the table is not full.
1074                    - we are called from interrupt context.
1075                    - the jiffies check is just a fallback/debug loop breaker;
1076                      we will not spin here for a long time in any case.
1077                  */
1078
1079                 RT_CACHE_STAT_INC(gc_goal_miss);
1080
1081                 if (expire == 0)
1082                         break;
1083
1084                 expire >>= 1;
1085
1086                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1087                         goto out;
1088         } while (!in_softirq() && time_before_eq(jiffies, now));
1089
1090         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1091                 goto out;
1092         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1093                 goto out;
1094         if (net_ratelimit())
1095                 printk(KERN_WARNING "dst cache overflow\n");
1096         RT_CACHE_STAT_INC(gc_dst_overflow);
1097         goto out;
1098
1099 work_done:
1100         expire += min_interval;
1101         if (expire > ip_rt_gc_timeout ||
1102             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1103             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1104                 expire = ip_rt_gc_timeout;
1105 out:
1106         spin_unlock_bh(&rt_gc_lock);
1107 }
1108
1109 static void __rt_garbage_collect(struct work_struct *w)
1110 {
1111         __do_rt_garbage_collect(ip_rt_gc_elasticity, ip_rt_gc_min_interval);
1112 }
1113
1114 static int rt_garbage_collect(struct dst_ops *ops)
1115 {
1116         if (!work_pending(&rt_gc_worker))
1117                 schedule_work(&rt_gc_worker);
1118
1119         if (dst_entries_get_fast(&ipv4_dst_ops) >= ip_rt_max_size ||
1120             dst_entries_get_slow(&ipv4_dst_ops) >= ip_rt_max_size) {
1121                 RT_CACHE_STAT_INC(gc_dst_overflow);
1122                 return 1;
1123         }
1124         return 0;
1125 }
1126
1127 /*
1128  * Returns number of entries in a hash chain that have different hash_inputs
1129  */
1130 static int slow_chain_length(const struct rtable *head)
1131 {
1132         int length = 0;
1133         const struct rtable *rth = head;
1134
1135         while (rth) {
1136                 length += has_noalias(head, rth);
1137                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1138         }
1139         return length >> FRACT_BITS;
1140 }
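/*
 * The sum is accumulated in ONE (1/8) units by has_noalias(), so the final
 * shift by FRACT_BITS converts it back to a whole-entry count.
 */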
1141
1142 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1143 {
1144         struct neigh_table *tbl = &arp_tbl;
1145         static const __be32 inaddr_any = 0;
1146         struct net_device *dev = dst->dev;
1147         const __be32 *pkey = daddr;
1148         struct neighbour *n;
1149
1150 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1151         if (dev->type == ARPHRD_ATM)
1152                 tbl = clip_tbl_hook;
1153 #endif
1154         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1155                 pkey = &inaddr_any;
1156
1157         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1158         if (n)
1159                 return n;
1160         return neigh_create(tbl, pkey, dev);
1161 }
1162
1163 static int rt_bind_neighbour(struct rtable *rt)
1164 {
1165         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1166         if (IS_ERR(n))
1167                 return PTR_ERR(n);
1168         dst_set_neighbour(&rt->dst, n);
1169
1170         return 0;
1171 }
1172
1173 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1174                                      struct sk_buff *skb, int ifindex)
1175 {
1176         struct rtable   *rth, *cand;
1177         struct rtable __rcu **rthp, **candp;
1178         unsigned long   now;
1179         u32             min_score;
1180         int             chain_length;
1181         int attempts = 1;
1182
1183 restart:
1184         chain_length = 0;
1185         min_score = ~(u32)0;
1186         cand = NULL;
1187         candp = NULL;
1188         now = jiffies;
1189
1190         if (!rt_caching(dev_net(rt->dst.dev))) {
1191                 /*
1192                  * If we're not caching, just tell the caller we
1193                  * were successful and don't touch the route.  The
1194                  * caller holds the sole reference to the cache entry, and
1195                  * it will be released when the caller is done with it.
1196                  * If we drop it here, the callers have no way to resolve routes
1197                  * when we're not caching.  Instead, just point *rp at rt, so
1198                  * the caller gets a single use out of the route.
1199                  * Note that we do rt_free on this new route entry, so that
1200                  * once its refcount hits zero, we are still able to reap it
1201                  * (Thanks Alexey)
1202                  * Note: To avoid expensive rcu stuff for this uncached dst,
1203                  * we set DST_NOCACHE so that dst_release() can free dst without
1204                  * waiting for a grace period.
1205                  */
1206
1207                 rt->dst.flags |= DST_NOCACHE;
1208                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1209                         int err = rt_bind_neighbour(rt);
1210                         if (err) {
1211                                 if (net_ratelimit())
1212                                         printk(KERN_WARNING
1213                                             "Neighbour table failure & not caching routes.\n");
1214                                 ip_rt_put(rt);
1215                                 return ERR_PTR(err);
1216                         }
1217                 }
1218
1219                 goto skip_hashing;
1220         }
1221
1222         rthp = &rt_hash_table[hash].chain;
1223
1224         spin_lock_bh(rt_hash_lock_addr(hash));
1225         while ((rth = rcu_dereference_protected(*rthp,
1226                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1227                 if (rt_is_expired(rth)) {
1228                         *rthp = rth->dst.rt_next;
1229                         rt_free(rth);
1230                         continue;
1231                 }
1232                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1233                         /* Put it first */
1234                         *rthp = rth->dst.rt_next;
1235                         /*
1236                          * Since lookup is lockfree, the deletion
1237                          * must be visible to another weakly ordered CPU before
1238                          * the insertion at the start of the hash chain.
1239                          */
1240                         rcu_assign_pointer(rth->dst.rt_next,
1241                                            rt_hash_table[hash].chain);
1242                         /*
1243                          * Since lookup is lockfree, the update writes
1244                          * must be ordered for consistency on SMP.
1245                          */
1246                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1247
1248                         dst_use(&rth->dst, now);
1249                         spin_unlock_bh(rt_hash_lock_addr(hash));
1250
1251                         rt_drop(rt);
1252                         if (skb)
1253                                 skb_dst_set(skb, &rth->dst);
1254                         return rth;
1255                 }
1256
1257                 if (!atomic_read(&rth->dst.__refcnt)) {
1258                         u32 score = rt_score(rth);
1259
1260                         if (score <= min_score) {
1261                                 cand = rth;
1262                                 candp = rthp;
1263                                 min_score = score;
1264                         }
1265                 }
1266
1267                 chain_length++;
1268
1269                 rthp = &rth->dst.rt_next;
1270         }
1271
1272         if (cand) {
1273                 /* ip_rt_gc_elasticity used to be the average chain length;
1274                  * when exceeded, GC becomes really aggressive.
1275                  *
1276                  * The second limit is less certain. At the moment it allows
1277                  * only 2 entries per bucket. We will see.
1278                  */
1279                 if (chain_length > ip_rt_gc_elasticity) {
1280                         *candp = cand->dst.rt_next;
1281                         rt_free(cand);
1282                 }
1283         } else {
1284                 if (chain_length > rt_chain_length_max &&
1285                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1286                         struct net *net = dev_net(rt->dst.dev);
1287                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1288                         if (!rt_caching(net)) {
1289                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1290                                         rt->dst.dev->name, num);
1291                         }
1292                         rt_emergency_hash_rebuild(net);
1293                         spin_unlock_bh(rt_hash_lock_addr(hash));
1294
1295                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1296                                         ifindex, rt_genid(net));
1297                         goto restart;
1298                 }
1299         }
1300
1301         /* Try to bind the route to ARP only if it is an output
1302            route or on the unicast forwarding path.
1303          */
1304         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1305                 int err = rt_bind_neighbour(rt);
1306                 if (err) {
1307                         spin_unlock_bh(rt_hash_lock_addr(hash));
1308
1309                         if (err != -ENOBUFS) {
1310                                 rt_drop(rt);
1311                                 return ERR_PTR(err);
1312                         }
1313
1314                         /* Neighbour tables are full and nothing
1315                            can be released. Try to shrink the route cache;
1316                            it most likely holds some neighbour records.
1317                          */
1318                         if (!in_softirq() && attempts-- > 0) {
1319                                 static DEFINE_SPINLOCK(lock);
1320
1321                                 if (spin_trylock(&lock)) {
1322                                         __do_rt_garbage_collect(1, 0);
1323                                         spin_unlock(&lock);
1324                                 } else {
1325                                         spin_unlock_wait(&lock);
1326                                 }
1327                                 goto restart;
1328                         }
1329
1330                         if (net_ratelimit())
1331                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1332                         rt_drop(rt);
1333                         return ERR_PTR(-ENOBUFS);
1334                 }
1335         }
1336
1337         rt->dst.rt_next = rt_hash_table[hash].chain;
1338
1339         /*
1340          * Since lookup is lockfree, we must make sure
1341          * previous writes to rt are committed to memory
1342          * before making rt visible to other CPUs.
1343          */
1344         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1345
1346         spin_unlock_bh(rt_hash_lock_addr(hash));
1347
1348 skip_hashing:
1349         if (skb)
1350                 skb_dst_set(skb, &rt->dst);
1351         return rt;
1352 }
1353
1354 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1355
1356 static u32 rt_peer_genid(void)
1357 {
1358         return atomic_read(&__rt_peer_genid);
1359 }
1360
1361 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1362 {
1363         struct inet_peer *peer;
1364
1365         peer = inet_getpeer_v4(daddr, create);
1366
1367         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1368                 inet_putpeer(peer);
1369         else
1370                 rt->rt_peer_genid = rt_peer_genid();
1371 }
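/*
 * If another CPU installed a peer first, the cmpxchg() above fails and we
 * drop our freshly obtained reference; otherwise the current peer
 * generation is recorded in the route.
 */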
1372
1373 #define IP_IDENTS_SZ 2048u
1374 struct ip_ident_bucket {
1375         atomic_t        id;
1376         u32             stamp32;
1377 };
1378
1379 static struct ip_ident_bucket *ip_idents __read_mostly;
1380
1381 /* In order to protect privacy, we add a perturbation to identifiers
1382  * if one generator is seldom used. This makes it hard for an attacker
1383  * to infer how many packets were sent between two points in time.
1384  */
1385 u32 ip_idents_reserve(u32 hash, int segs)
1386 {
1387         struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
1388         u32 old = ACCESS_ONCE(bucket->stamp32);
1389         u32 now = (u32)jiffies;
1390         u32 delta = 0;
1391
1392         if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
1393                 u64 x = random32();
1394
1395                 x *= (now - old);
1396                 delta = (u32)(x >> 32);
1397         }
1398
1399         return atomic_add_return(segs + delta, &bucket->id) - segs;
1400 }
1401 EXPORT_SYMBOL(ip_idents_reserve);
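/*
 * Example: if this bucket was last used 1000 jiffies ago, a random value
 * in [0, 1000) is added on top of the requested segment count, so the gap
 * between two IDs observed by a remote party no longer reveals how many
 * packets were sent in between.
 */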
1402
1403 void __ip_select_ident(struct iphdr *iph, int segs)
1404 {
1405         static u32 ip_idents_hashrnd __read_mostly;
1406         static bool hashrnd_initialized = false;
1407         u32 hash, id;
1408
1409         if (unlikely(!hashrnd_initialized)) {
1410                 hashrnd_initialized = true;
1411                 get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
1412         }
1413
1414         hash = jhash_3words((__force u32)iph->daddr,
1415                             (__force u32)iph->saddr,
1416                             iph->protocol,
1417                             ip_idents_hashrnd);
1418         id = ip_idents_reserve(hash, segs);
1419         iph->id = htons(id);
1420 }
1421 EXPORT_SYMBOL(__ip_select_ident);
1422
1423 static void rt_del(unsigned hash, struct rtable *rt)
1424 {
1425         struct rtable __rcu **rthp;
1426         struct rtable *aux;
1427
1428         rthp = &rt_hash_table[hash].chain;
1429         spin_lock_bh(rt_hash_lock_addr(hash));
1430         ip_rt_put(rt);
1431         while ((aux = rcu_dereference_protected(*rthp,
1432                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1433                 if (aux == rt || rt_is_expired(aux)) {
1434                         *rthp = aux->dst.rt_next;
1435                         rt_free(aux);
1436                         continue;
1437                 }
1438                 rthp = &aux->dst.rt_next;
1439         }
1440         spin_unlock_bh(rt_hash_lock_addr(hash));
1441 }
1442
1443 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1444 {
1445         struct rtable *rt = (struct rtable *) dst;
1446         __be32 orig_gw = rt->rt_gateway;
1447         struct neighbour *n, *old_n;
1448
1449         dst_confirm(&rt->dst);
1450
1451         rt->rt_gateway = peer->redirect_learned.a4;
1452
1453         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1454         if (IS_ERR(n)) {
1455                 rt->rt_gateway = orig_gw;
1456                 return;
1457         }
1458         old_n = xchg(&rt->dst._neighbour, n);
1459         if (old_n)
1460                 neigh_release(old_n);
1461         if (!(n->nud_state & NUD_VALID)) {
1462                 neigh_event_send(n, NULL);
1463         } else {
1464                 rt->rt_flags |= RTCF_REDIRECTED;
1465                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1466         }
1467 }
1468
1469 /* called in rcu_read_lock() section */
1470 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1471                     __be32 saddr, struct net_device *dev)
1472 {
1473         int s, i;
1474         struct in_device *in_dev = __in_dev_get_rcu(dev);
1475         __be32 skeys[2] = { saddr, 0 };
1476         int    ikeys[2] = { dev->ifindex, 0 };
1477         struct inet_peer *peer;
1478         struct net *net;
1479
1480         if (!in_dev)
1481                 return;
1482
1483         net = dev_net(dev);
1484         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1485             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1486             ipv4_is_zeronet(new_gw))
1487                 goto reject_redirect;
1488
1489         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1490                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1491                         goto reject_redirect;
1492                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1493                         goto reject_redirect;
1494         } else {
1495                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1496                         goto reject_redirect;
1497         }
1498
1499         for (s = 0; s < 2; s++) {
1500                 for (i = 0; i < 2; i++) {
1501                         unsigned int hash;
1502                         struct rtable __rcu **rthp;
1503                         struct rtable *rt;
1504
1505                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1506
1507                         rthp = &rt_hash_table[hash].chain;
1508
1509                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1510                                 rthp = &rt->dst.rt_next;
1511
1512                                 if (rt->rt_key_dst != daddr ||
1513                                     rt->rt_key_src != skeys[s] ||
1514                                     rt->rt_oif != ikeys[i] ||
1515                                     rt_is_input_route(rt) ||
1516                                     rt_is_expired(rt) ||
1517                                     !net_eq(dev_net(rt->dst.dev), net) ||
1518                                     rt->dst.error ||
1519                                     rt->dst.dev != dev ||
1520                                     rt->rt_gateway != old_gw)
1521                                         continue;
1522
1523                                 if (!rt->peer)
1524                                         rt_bind_peer(rt, rt->rt_dst, 1);
1525
1526                                 peer = rt->peer;
1527                                 if (peer) {
1528                                         if (peer->redirect_learned.a4 != new_gw ||
1529                                             peer->redirect_genid != redirect_genid) {
1530                                                 peer->redirect_learned.a4 = new_gw;
1531                                                 peer->redirect_genid = redirect_genid;
1532                                                 atomic_inc(&__rt_peer_genid);
1533                                         }
1534                                         check_peer_redir(&rt->dst, peer);
1535                                 }
1536                         }
1537                 }
1538         }
1539         return;
1540
1541 reject_redirect:
1542 #ifdef CONFIG_IP_ROUTE_VERBOSE
1543         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1544                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1545                         "  Advised path = %pI4 -> %pI4\n",
1546                        &old_gw, dev->name, &new_gw,
1547                        &saddr, &daddr);
1548 #endif
1549         ;
1550 }
1551
1552 static bool peer_pmtu_expired(struct inet_peer *peer)
1553 {
1554         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1555
1556         return orig &&
1557                time_after_eq(jiffies, orig) &&
1558                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1559 }
1560
1561 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1562 {
1563         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1564
1565         return orig &&
1566                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1567 }
1568
1569 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1570 {
1571         struct rtable *rt = (struct rtable *)dst;
1572         struct dst_entry *ret = dst;
1573
1574         if (rt) {
1575                 if (dst->obsolete > 0) {
1576                         ip_rt_put(rt);
1577                         ret = NULL;
1578                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1579                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1580                                                 rt->rt_oif,
1581                                                 rt_genid(dev_net(dst->dev)));
1582                         rt_del(hash, rt);
1583                         ret = NULL;
1584                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1585                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1586                 }
1587         }
1588         return ret;
1589 }
1590
1591 /*
1592  * Algorithm:
1593  *      1. The first ip_rt_redirect_number redirects are sent
1594  *         with exponential backoff, then we stop sending them at all,
1595  *         assuming that the host ignores our redirects.
1596  *      2. If we did not see packets requiring redirects
1597  *         during ip_rt_redirect_silence, we assume that the host
1598  *         forgot the redirected route and start to send redirects again.
1599  *
1600  * This algorithm is much cheaper and more intelligent than dumb load limiting
1601  * in icmp.c.
1602  *
1603  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1604  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1605  */
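/*
 * Editor's note (worked example under the usual route.c defaults, not part
 * of the original file): with ip_rt_redirect_load = HZ/50 and
 * ip_rt_redirect_number = 9, the first redirect is sent immediately
 * (rate_tokens == 0); after n redirects the next one goes out only once
 * rate_last + ((HZ/50) << n) has elapsed, so the gap doubles each time, up
 * to (HZ/50) << 8.  Once rate_tokens reaches 9, nothing more is sent until
 * the peer has been quiet for ip_rt_redirect_silence, which resets
 * rate_tokens in ip_rt_send_redirect() below.
 */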
1606
1607 void ip_rt_send_redirect(struct sk_buff *skb)
1608 {
1609         struct rtable *rt = skb_rtable(skb);
1610         struct in_device *in_dev;
1611         struct inet_peer *peer;
1612         int log_martians;
1613
1614         rcu_read_lock();
1615         in_dev = __in_dev_get_rcu(rt->dst.dev);
1616         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1617                 rcu_read_unlock();
1618                 return;
1619         }
1620         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1621         rcu_read_unlock();
1622
1623         if (!rt->peer)
1624                 rt_bind_peer(rt, rt->rt_dst, 1);
1625         peer = rt->peer;
1626         if (!peer) {
1627                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1628                 return;
1629         }
1630
1631         /* No redirected packets during ip_rt_redirect_silence;
1632          * reset the algorithm.
1633          */
1634         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1635                 peer->rate_tokens = 0;
1636
1637         /* Too many ignored redirects; do not send anything;
1638          * set dst.rate_last to the last seen redirected packet.
1639          */
1640         if (peer->rate_tokens >= ip_rt_redirect_number) {
1641                 peer->rate_last = jiffies;
1642                 return;
1643         }
1644
1645         /* Check for load limit; set rate_last to the latest sent
1646          * redirect.
1647          */
1648         if (peer->rate_tokens == 0 ||
1649             time_after(jiffies,
1650                        (peer->rate_last +
1651                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1652                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1653                 peer->rate_last = jiffies;
1654                 ++peer->rate_tokens;
1655 #ifdef CONFIG_IP_ROUTE_VERBOSE
1656                 if (log_martians &&
1657                     peer->rate_tokens == ip_rt_redirect_number &&
1658                     net_ratelimit())
1659                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1660                                &ip_hdr(skb)->saddr, rt->rt_iif,
1661                                &rt->rt_dst, &rt->rt_gateway);
1662 #endif
1663         }
1664 }
1665
1666 static int ip_error(struct sk_buff *skb)
1667 {
1668         struct rtable *rt = skb_rtable(skb);
1669         struct inet_peer *peer;
1670         unsigned long now;
1671         bool send;
1672         int code;
1673
1674         switch (rt->dst.error) {
1675         case EINVAL:
1676         default:
1677                 goto out;
1678         case EHOSTUNREACH:
1679                 code = ICMP_HOST_UNREACH;
1680                 break;
1681         case ENETUNREACH:
1682                 code = ICMP_NET_UNREACH;
1683                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1684                                 IPSTATS_MIB_INNOROUTES);
1685                 break;
1686         case EACCES:
1687                 code = ICMP_PKT_FILTERED;
1688                 break;
1689         }
1690
1691         if (!rt->peer)
1692                 rt_bind_peer(rt, rt->rt_dst, 1);
1693         peer = rt->peer;
1694
1695         send = true;
1696         if (peer) {
1697                 now = jiffies;
1698                 peer->rate_tokens += now - peer->rate_last;
1699                 if (peer->rate_tokens > ip_rt_error_burst)
1700                         peer->rate_tokens = ip_rt_error_burst;
1701                 peer->rate_last = now;
1702                 if (peer->rate_tokens >= ip_rt_error_cost)
1703                         peer->rate_tokens -= ip_rt_error_cost;
1704                 else
1705                         send = false;
1706         }
1707         if (send)
1708                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1709
1710 out:    kfree_skb(skb);
1711         return 0;
1712 }
1713
1714 /*
1715  *      The last two values are not from the RFC but
1716  *      are needed for AMPRnet AX.25 paths.
1717  */
1718
1719 static const unsigned short mtu_plateau[] =
1720 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1721
1722 static inline unsigned short guess_mtu(unsigned short old_mtu)
1723 {
1724         int i;
1725
1726         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1727                 if (old_mtu > mtu_plateau[i])
1728                         return mtu_plateau[i];
1729         return 68;
1730 }
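/*
 * Editor's note (worked examples, not part of the original file): guess_mtu()
 * returns the largest plateau strictly below the old MTU, e.g.
 *
 *	guess_mtu(1500) -> 1492
 *	guess_mtu(576)  -> 296
 *	guess_mtu(68)   -> 68	(at the IPv4 minimum already, falls through)
 *
 * so repeated "fragmentation needed" errors that carry no usable next-hop MTU
 * walk the path MTU estimate down the plateau table one step at a time.
 */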
1731
1732 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1733                                  unsigned short new_mtu,
1734                                  struct net_device *dev)
1735 {
1736         unsigned short old_mtu = ntohs(iph->tot_len);
1737         unsigned short est_mtu = 0;
1738         struct inet_peer *peer;
1739
1740         peer = inet_getpeer_v4(iph->daddr, 1);
1741         if (peer) {
1742                 unsigned short mtu = new_mtu;
1743
1744                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1745                         /* BSD 4.2 derived systems incorrectly adjust
1746                          * tot_len by the IP header length, and report
1747                          * a zero MTU in the ICMP message.
1748                          */
1749                         if (mtu == 0 &&
1750                             old_mtu >= 68 + (iph->ihl << 2))
1751                                 old_mtu -= iph->ihl << 2;
1752                         mtu = guess_mtu(old_mtu);
1753                 }
1754
1755                 if (mtu < ip_rt_min_pmtu)
1756                         mtu = ip_rt_min_pmtu;
1757                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1758                         unsigned long pmtu_expires;
1759
1760                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1761                         if (!pmtu_expires)
1762                                 pmtu_expires = 1UL;
1763
1764                         est_mtu = mtu;
1765                         peer->pmtu_learned = mtu;
1766                         peer->pmtu_expires = pmtu_expires;
1767                         atomic_inc(&__rt_peer_genid);
1768                 }
1769
1770                 inet_putpeer(peer);
1771         }
1772         return est_mtu ? : new_mtu;
1773 }
1774
1775 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1776 {
1777         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1778
1779         if (!expires)
1780                 return;
1781         if (time_before(jiffies, expires)) {
1782                 u32 orig_dst_mtu = dst_mtu(dst);
1783                 if (peer->pmtu_learned < orig_dst_mtu) {
1784                         if (!peer->pmtu_orig)
1785                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1786                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1787                 }
1788         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1789                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1790 }
1791
1792 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1793 {
1794         struct rtable *rt = (struct rtable *) dst;
1795         struct inet_peer *peer;
1796
1797         dst_confirm(dst);
1798
1799         if (!rt->peer)
1800                 rt_bind_peer(rt, rt->rt_dst, 1);
1801         peer = rt->peer;
1802         if (peer) {
1803                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1804
1805                 if (mtu < ip_rt_min_pmtu)
1806                         mtu = ip_rt_min_pmtu;
1807                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1808
1809                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1810                         if (!pmtu_expires)
1811                                 pmtu_expires = 1UL;
1812
1813                         peer->pmtu_learned = mtu;
1814                         peer->pmtu_expires = pmtu_expires;
1815
1816                         atomic_inc(&__rt_peer_genid);
1817                         rt->rt_peer_genid = rt_peer_genid();
1818                 }
1819                 check_peer_pmtu(dst, peer);
1820         }
1821 }
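/*
 * Editor's note (not part of the original file): in both ip_rt_frag_needed()
 * and ip_rt_update_pmtu() above, pmtu_expires == 0 means "no PMTU learned",
 * which is why a computed expiry that happens to land exactly on 0 (possible
 * when jiffies wraps) is bumped to 1UL before being stored.
 */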
1822
1823
1824 static void ipv4_validate_peer(struct rtable *rt)
1825 {
1826         if (rt->rt_peer_genid != rt_peer_genid()) {
1827                 struct inet_peer *peer;
1828
1829                 if (!rt->peer)
1830                         rt_bind_peer(rt, rt->rt_dst, 0);
1831
1832                 peer = rt->peer;
1833                 if (peer) {
1834                         check_peer_pmtu(&rt->dst, peer);
1835
1836                         if (peer->redirect_genid != redirect_genid)
1837                                 peer->redirect_learned.a4 = 0;
1838                         if (peer->redirect_learned.a4 &&
1839                             peer->redirect_learned.a4 != rt->rt_gateway)
1840                                 check_peer_redir(&rt->dst, peer);
1841                 }
1842
1843                 rt->rt_peer_genid = rt_peer_genid();
1844         }
1845 }
1846
1847 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1848 {
1849         struct rtable *rt = (struct rtable *) dst;
1850
1851         if (rt_is_expired(rt))
1852                 return NULL;
1853         ipv4_validate_peer(rt);
1854         return dst;
1855 }
1856
1857 static void ipv4_dst_destroy(struct dst_entry *dst)
1858 {
1859         struct rtable *rt = (struct rtable *) dst;
1860         struct inet_peer *peer = rt->peer;
1861
1862         if (rt->fi) {
1863                 fib_info_put(rt->fi);
1864                 rt->fi = NULL;
1865         }
1866         if (peer) {
1867                 rt->peer = NULL;
1868                 inet_putpeer(peer);
1869         }
1870 }
1871
1872
1873 static void ipv4_link_failure(struct sk_buff *skb)
1874 {
1875         struct rtable *rt;
1876
1877         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1878
1879         rt = skb_rtable(skb);
1880         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1881                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1882 }
1883
1884 static int ip_rt_bug(struct sk_buff *skb)
1885 {
1886         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1887                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1888                 skb->dev ? skb->dev->name : "?");
1889         kfree_skb(skb);
1890         WARN_ON(1);
1891         return 0;
1892 }
1893
1894 /*
1895    We do not cache the source address of the outgoing interface,
1896    because it is used only by the IP RR, TS and SRR options,
1897    so it is out of the fast path.
1898
1899    BTW remember: "addr" may be unaligned
1900    in IP options!
1901  */
1902
1903 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1904 {
1905         __be32 src;
1906
1907         if (rt_is_output_route(rt))
1908                 src = ip_hdr(skb)->saddr;
1909         else {
1910                 struct fib_result res;
1911                 struct flowi4 fl4;
1912                 struct iphdr *iph;
1913
1914                 iph = ip_hdr(skb);
1915
1916                 memset(&fl4, 0, sizeof(fl4));
1917                 fl4.daddr = iph->daddr;
1918                 fl4.saddr = iph->saddr;
1919                 fl4.flowi4_tos = RT_TOS(iph->tos);
1920                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1921                 fl4.flowi4_iif = skb->dev->ifindex;
1922                 fl4.flowi4_mark = skb->mark;
1923
1924                 rcu_read_lock();
1925                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1926                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1927                 else
1928                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1929                                         RT_SCOPE_UNIVERSE);
1930                 rcu_read_unlock();
1931         }
1932         memcpy(addr, &src, 4);
1933 }
1934
1935 #ifdef CONFIG_IP_ROUTE_CLASSID
1936 static void set_class_tag(struct rtable *rt, u32 tag)
1937 {
1938         if (!(rt->dst.tclassid & 0xFFFF))
1939                 rt->dst.tclassid |= tag & 0xFFFF;
1940         if (!(rt->dst.tclassid & 0xFFFF0000))
1941                 rt->dst.tclassid |= tag & 0xFFFF0000;
1942 }
1943 #endif
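/*
 * Editor's note (not part of the original file): the two 16-bit halves of
 * dst.tclassid carry the routing realms; set_class_tag() fills a half only
 * while it is still zero, so whichever tag is applied first (nexthop, then
 * fib rule, then itag in rt_set_nexthop() below) is never overwritten.
 */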
1944
1945 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1946 {
1947         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1948
1949         if (advmss == 0) {
1950                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1951                                ip_rt_min_advmss);
1952                 if (advmss > 65535 - 40)
1953                         advmss = 65535 - 40;
1954         }
1955         return advmss;
1956 }
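/*
 * Editor's note (worked example, not part of the original file): with no
 * RTAX_ADVMSS metric configured, a 1500-byte device MTU advertises an MSS of
 * 1500 - 40 = 1460 (space for the 20-byte IPv4 and 20-byte TCP headers),
 * clamped between ip_rt_min_advmss and 65535 - 40.
 */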
1957
1958 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1959 {
1960         const struct rtable *rt = (const struct rtable *) dst;
1961         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1962
1963         if (mtu && rt_is_output_route(rt))
1964                 return mtu;
1965
1966         mtu = dst->dev->mtu;
1967
1968         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1969
1970                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1971                         mtu = 576;
1972         }
1973
1974         if (mtu > IP_MAX_MTU)
1975                 mtu = IP_MAX_MTU;
1976
1977         return mtu;
1978 }
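/*
 * Editor's note (not part of the original file): the 576-byte clamp above
 * applies only when the MTU metric is administratively locked and the route
 * goes via a gateway; 576 is the classic minimum datagram size every IPv4
 * host must be able to accept (RFC 1122), so a locked-MTU gatewayed route
 * never advertises more than that.
 */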
1979
1980 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1981                             struct fib_info *fi)
1982 {
1983         struct inet_peer *peer;
1984         int create = 0;
1985
1986         /* If a peer entry exists for this destination, we must hook
1987          * it up in order to get at cached metrics.
1988          */
1989         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1990                 create = 1;
1991
1992         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1993         if (peer) {
1994                 rt->rt_peer_genid = rt_peer_genid();
1995                 if (inet_metrics_new(peer))
1996                         memcpy(peer->metrics, fi->fib_metrics,
1997                                sizeof(u32) * RTAX_MAX);
1998                 dst_init_metrics(&rt->dst, peer->metrics, false);
1999
2000                 check_peer_pmtu(&rt->dst, peer);
2001                 if (peer->redirect_genid != redirect_genid)
2002                         peer->redirect_learned.a4 = 0;
2003                 if (peer->redirect_learned.a4 &&
2004                     peer->redirect_learned.a4 != rt->rt_gateway) {
2005                         rt->rt_gateway = peer->redirect_learned.a4;
2006                         rt->rt_flags |= RTCF_REDIRECTED;
2007                 }
2008         } else {
2009                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
2010                         rt->fi = fi;
2011                         atomic_inc(&fi->fib_clntref);
2012                 }
2013                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
2014         }
2015 }
2016
2017 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
2018                            const struct fib_result *res,
2019                            struct fib_info *fi, u16 type, u32 itag)
2020 {
2021         struct dst_entry *dst = &rt->dst;
2022
2023         if (fi) {
2024                 if (FIB_RES_GW(*res) &&
2025                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
2026                         rt->rt_gateway = FIB_RES_GW(*res);
2027                 rt_init_metrics(rt, fl4, fi);
2028 #ifdef CONFIG_IP_ROUTE_CLASSID
2029                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
2030 #endif
2031         }
2032
2033         if (dst_mtu(dst) > IP_MAX_MTU)
2034                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
2035         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
2036                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
2037
2038 #ifdef CONFIG_IP_ROUTE_CLASSID
2039 #ifdef CONFIG_IP_MULTIPLE_TABLES
2040         set_class_tag(rt, fib_rules_tclass(res));
2041 #endif
2042         set_class_tag(rt, itag);
2043 #endif
2044 }
2045
2046 static struct rtable *rt_dst_alloc(struct net_device *dev,
2047                                    bool nopolicy, bool noxfrm)
2048 {
2049         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2050                          DST_HOST |
2051                          (nopolicy ? DST_NOPOLICY : 0) |
2052                          (noxfrm ? DST_NOXFRM : 0));
2053 }
2054
2055 /* called in rcu_read_lock() section */
2056 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2057                                 u8 tos, struct net_device *dev, int our)
2058 {
2059         unsigned int hash;
2060         struct rtable *rth;
2061         __be32 spec_dst;
2062         struct in_device *in_dev = __in_dev_get_rcu(dev);
2063         u32 itag = 0;
2064         int err;
2065
2066         /* Primary sanity checks. */
2067
2068         if (in_dev == NULL)
2069                 return -EINVAL;
2070
2071         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2072             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2073                 goto e_inval;
2074
2075         if (ipv4_is_zeronet(saddr)) {
2076                 if (!ipv4_is_local_multicast(daddr))
2077                         goto e_inval;
2078                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2079         } else {
2080                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2081                                           &itag);
2082                 if (err < 0)
2083                         goto e_err;
2084         }
2085         rth = rt_dst_alloc(init_net.loopback_dev,
2086                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2087         if (!rth)
2088                 goto e_nobufs;
2089
2090 #ifdef CONFIG_IP_ROUTE_CLASSID
2091         rth->dst.tclassid = itag;
2092 #endif
2093         rth->dst.output = ip_rt_bug;
2094
2095         rth->rt_key_dst = daddr;
2096         rth->rt_key_src = saddr;
2097         rth->rt_genid   = rt_genid(dev_net(dev));
2098         rth->rt_flags   = RTCF_MULTICAST;
2099         rth->rt_type    = RTN_MULTICAST;
2100         rth->rt_key_tos = tos;
2101         rth->rt_dst     = daddr;
2102         rth->rt_src     = saddr;
2103         rth->rt_route_iif = dev->ifindex;
2104         rth->rt_iif     = dev->ifindex;
2105         rth->rt_oif     = 0;
2106         rth->rt_mark    = skb->mark;
2107         rth->rt_gateway = daddr;
2108         rth->rt_spec_dst= spec_dst;
2109         rth->rt_peer_genid = 0;
2110         rth->peer = NULL;
2111         rth->fi = NULL;
2112         if (our) {
2113                 rth->dst.input= ip_local_deliver;
2114                 rth->rt_flags |= RTCF_LOCAL;
2115         }
2116
2117 #ifdef CONFIG_IP_MROUTE
2118         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2119                 rth->dst.input = ip_mr_input;
2120 #endif
2121         RT_CACHE_STAT_INC(in_slow_mc);
2122
2123         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2124         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2125         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2126
2127 e_nobufs:
2128         return -ENOBUFS;
2129 e_inval:
2130         return -EINVAL;
2131 e_err:
2132         return err;
2133 }
2134
2135
2136 static void ip_handle_martian_source(struct net_device *dev,
2137                                      struct in_device *in_dev,
2138                                      struct sk_buff *skb,
2139                                      __be32 daddr,
2140                                      __be32 saddr)
2141 {
2142         RT_CACHE_STAT_INC(in_martian_src);
2143 #ifdef CONFIG_IP_ROUTE_VERBOSE
2144         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2145                 /*
2146                  *      RFC1812 recommendation: if the source is martian,
2147                  *      the only hint is the MAC header.
2148                  */
2149                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2150                         &daddr, &saddr, dev->name);
2151                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2152                         int i;
2153                         const unsigned char *p = skb_mac_header(skb);
2154                         printk(KERN_WARNING "ll header: ");
2155                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2156                                 printk("%02x", *p);
2157                                 if (i < (dev->hard_header_len - 1))
2158                                         printk(":");
2159                         }
2160                         printk("\n");
2161                 }
2162         }
2163 #endif
2164 }
2165
2166 /* called in rcu_read_lock() section */
2167 static int __mkroute_input(struct sk_buff *skb,
2168                            const struct fib_result *res,
2169                            struct in_device *in_dev,
2170                            __be32 daddr, __be32 saddr, u32 tos,
2171                            struct rtable **result)
2172 {
2173         struct rtable *rth;
2174         int err;
2175         struct in_device *out_dev;
2176         unsigned int flags = 0;
2177         __be32 spec_dst;
2178         u32 itag = 0;
2179
2180         /* get a working reference to the output device */
2181         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2182         if (out_dev == NULL) {
2183                 if (net_ratelimit())
2184                         printk(KERN_CRIT "Bug in ip_route_input" \
2185                                "_slow(). Please, report\n");
2186                 return -EINVAL;
2187         }
2188
2189
2190         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2191                                   in_dev->dev, &spec_dst, &itag);
2192         if (err < 0) {
2193                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2194                                          saddr);
2195
2196                 goto cleanup;
2197         }
2198
2199         if (err)
2200                 flags |= RTCF_DIRECTSRC;
2201
2202         if (out_dev == in_dev && err &&
2203             (IN_DEV_SHARED_MEDIA(out_dev) ||
2204              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2205                 flags |= RTCF_DOREDIRECT;
2206
2207         if (skb->protocol != htons(ETH_P_IP)) {
2208                 /* Not IP (i.e. ARP). Do not create a route if it is
2209                  * invalid for proxy arp. DNAT routes are always valid.
2210                  *
2211                  * The proxy arp feature has been extended to allow ARP
2212                  * replies back on the same interface, to support
2213                  * private VLAN switch technologies. See arp.c.
2214                  */
2215                 if (out_dev == in_dev &&
2216                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2217                         err = -EINVAL;
2218                         goto cleanup;
2219                 }
2220         }
2221
2222         rth = rt_dst_alloc(out_dev->dev,
2223                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2224                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2225         if (!rth) {
2226                 err = -ENOBUFS;
2227                 goto cleanup;
2228         }
2229
2230         rth->rt_key_dst = daddr;
2231         rth->rt_key_src = saddr;
2232         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2233         rth->rt_flags = flags;
2234         rth->rt_type = res->type;
2235         rth->rt_key_tos = tos;
2236         rth->rt_dst     = daddr;
2237         rth->rt_src     = saddr;
2238         rth->rt_route_iif = in_dev->dev->ifindex;
2239         rth->rt_iif     = in_dev->dev->ifindex;
2240         rth->rt_oif     = 0;
2241         rth->rt_mark    = skb->mark;
2242         rth->rt_gateway = daddr;
2243         rth->rt_spec_dst= spec_dst;
2244         rth->rt_peer_genid = 0;
2245         rth->peer = NULL;
2246         rth->fi = NULL;
2247
2248         rth->dst.input = ip_forward;
2249         rth->dst.output = ip_output;
2250
2251         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2252
2253         *result = rth;
2254         err = 0;
2255  cleanup:
2256         return err;
2257 }
2258
2259 static int ip_mkroute_input(struct sk_buff *skb,
2260                             struct fib_result *res,
2261                             const struct flowi4 *fl4,
2262                             struct in_device *in_dev,
2263                             __be32 daddr, __be32 saddr, u32 tos)
2264 {
2265         struct rtable* rth = NULL;
2266         int err;
2267         unsigned hash;
2268
2269 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2270         if (res->fi && res->fi->fib_nhs > 1)
2271                 fib_select_multipath(res);
2272 #endif
2273
2274         /* create a routing cache entry */
2275         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2276         if (err)
2277                 return err;
2278
2279         /* put it into the cache */
2280         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2281                        rt_genid(dev_net(rth->dst.dev)));
2282         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2283         if (IS_ERR(rth))
2284                 return PTR_ERR(rth);
2285         return 0;
2286 }
2287
2288 /*
2289  *      NOTE. We drop all packets that have local source
2290  *      addresses, because every properly looped-back packet
2291  *      must already have the correct destination attached by the output routine.
2292  *
2293  *      This approach solves two big problems:
2294  *      1. Non-simplex devices are handled properly.
2295  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2296  *      called with rcu_read_lock()
2297  */
2298
2299 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2300                                u8 tos, struct net_device *dev)
2301 {
2302         struct fib_result res;
2303         struct in_device *in_dev = __in_dev_get_rcu(dev);
2304         struct flowi4   fl4;
2305         unsigned        flags = 0;
2306         u32             itag = 0;
2307         struct rtable * rth;
2308         unsigned        hash;
2309         __be32          spec_dst;
2310         int             err = -EINVAL;
2311         struct net    * net = dev_net(dev);
2312
2313         /* IP on this device is disabled. */
2314
2315         if (!in_dev)
2316                 goto out;
2317
2318         /* Check for the most weird martians, which cannot be detected
2319            by fib_lookup.
2320          */
2321
2322         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2323             ipv4_is_loopback(saddr))
2324                 goto martian_source;
2325
2326         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2327                 goto brd_input;
2328
2329         /* Accept zero addresses only for limited broadcast;
2330          * I do not even know whether to fix this or not. Waiting for complaints :-)
2331          */
2332         if (ipv4_is_zeronet(saddr))
2333                 goto martian_source;
2334
2335         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2336                 goto martian_destination;
2337
2338         /*
2339          *      Now we are ready to route the packet.
2340          */
2341         fl4.flowi4_oif = 0;
2342         fl4.flowi4_iif = dev->ifindex;
2343         fl4.flowi4_mark = skb->mark;
2344         fl4.flowi4_tos = tos;
2345         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2346         fl4.daddr = daddr;
2347         fl4.saddr = saddr;
2348         err = fib_lookup(net, &fl4, &res);
2349         if (err != 0) {
2350                 if (!IN_DEV_FORWARD(in_dev))
2351                         goto e_hostunreach;
2352                 goto no_route;
2353         }
2354
2355         RT_CACHE_STAT_INC(in_slow_tot);
2356
2357         if (res.type == RTN_BROADCAST)
2358                 goto brd_input;
2359
2360         if (res.type == RTN_LOCAL) {
2361                 err = fib_validate_source(skb, saddr, daddr, tos,
2362                                           net->loopback_dev->ifindex,
2363                                           dev, &spec_dst, &itag);
2364                 if (err < 0)
2365                         goto martian_source_keep_err;
2366                 if (err)
2367                         flags |= RTCF_DIRECTSRC;
2368                 spec_dst = daddr;
2369                 goto local_input;
2370         }
2371
2372         if (!IN_DEV_FORWARD(in_dev))
2373                 goto e_hostunreach;
2374         if (res.type != RTN_UNICAST)
2375                 goto martian_destination;
2376
2377         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2378 out:    return err;
2379
2380 brd_input:
2381         if (skb->protocol != htons(ETH_P_IP))
2382                 goto e_inval;
2383
2384         if (ipv4_is_zeronet(saddr))
2385                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2386         else {
2387                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2388                                           &itag);
2389                 if (err < 0)
2390                         goto martian_source_keep_err;
2391                 if (err)
2392                         flags |= RTCF_DIRECTSRC;
2393         }
2394         flags |= RTCF_BROADCAST;
2395         res.type = RTN_BROADCAST;
2396         RT_CACHE_STAT_INC(in_brd);
2397
2398 local_input:
2399         rth = rt_dst_alloc(net->loopback_dev,
2400                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2401         if (!rth)
2402                 goto e_nobufs;
2403
2404         rth->dst.input= ip_local_deliver;
2405         rth->dst.output= ip_rt_bug;
2406 #ifdef CONFIG_IP_ROUTE_CLASSID
2407         rth->dst.tclassid = itag;
2408 #endif
2409
2410         rth->rt_key_dst = daddr;
2411         rth->rt_key_src = saddr;
2412         rth->rt_genid = rt_genid(net);
2413         rth->rt_flags   = flags|RTCF_LOCAL;
2414         rth->rt_type    = res.type;
2415         rth->rt_key_tos = tos;
2416         rth->rt_dst     = daddr;
2417         rth->rt_src     = saddr;
2418 #ifdef CONFIG_IP_ROUTE_CLASSID
2419         rth->dst.tclassid = itag;
2420 #endif
2421         rth->rt_route_iif = dev->ifindex;
2422         rth->rt_iif     = dev->ifindex;
2423         rth->rt_oif     = 0;
2424         rth->rt_mark    = skb->mark;
2425         rth->rt_gateway = daddr;
2426         rth->rt_spec_dst= spec_dst;
2427         rth->rt_peer_genid = 0;
2428         rth->peer = NULL;
2429         rth->fi = NULL;
2430         if (res.type == RTN_UNREACHABLE) {
2431                 rth->dst.input= ip_error;
2432                 rth->dst.error= -err;
2433                 rth->rt_flags   &= ~RTCF_LOCAL;
2434         }
2435         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2436         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2437         err = 0;
2438         if (IS_ERR(rth))
2439                 err = PTR_ERR(rth);
2440         goto out;
2441
2442 no_route:
2443         RT_CACHE_STAT_INC(in_no_route);
2444         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2445         res.type = RTN_UNREACHABLE;
2446         if (err == -ESRCH)
2447                 err = -ENETUNREACH;
2448         goto local_input;
2449
2450         /*
2451          *      Do not cache martian addresses: they should be logged (RFC1812)
2452          */
2453 martian_destination:
2454         RT_CACHE_STAT_INC(in_martian_dst);
2455 #ifdef CONFIG_IP_ROUTE_VERBOSE
2456         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2457                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2458                         &daddr, &saddr, dev->name);
2459 #endif
2460
2461 e_hostunreach:
2462         err = -EHOSTUNREACH;
2463         goto out;
2464
2465 e_inval:
2466         err = -EINVAL;
2467         goto out;
2468
2469 e_nobufs:
2470         err = -ENOBUFS;
2471         goto out;
2472
2473 martian_source:
2474         err = -EINVAL;
2475 martian_source_keep_err:
2476         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2477         goto out;
2478 }
2479
2480 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2481                            u8 tos, struct net_device *dev, bool noref)
2482 {
2483         struct rtable * rth;
2484         unsigned        hash;
2485         int iif = dev->ifindex;
2486         struct net *net;
2487         int res;
2488
2489         net = dev_net(dev);
2490
2491         rcu_read_lock();
2492
2493         if (!rt_caching(net))
2494                 goto skip_cache;
2495
2496         tos &= IPTOS_RT_MASK;
2497         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2498
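        /*
         * Editor's note (not part of the original file): the lookup below
         * folds all key comparisons into one test - (a ^ b) is zero only when
         * a == b, so OR-ing the XORs of dst, src, iif and tos and comparing
         * the result with 0 checks all four keys in a single branch.
         */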
2499         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2500              rth = rcu_dereference(rth->dst.rt_next)) {
2501                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2502                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2503                      (rth->rt_route_iif ^ iif) |
2504                      (rth->rt_key_tos ^ tos)) == 0 &&
2505                     rth->rt_mark == skb->mark &&
2506                     net_eq(dev_net(rth->dst.dev), net) &&
2507                     !rt_is_expired(rth)) {
2508                         ipv4_validate_peer(rth);
2509                         if (noref) {
2510                                 dst_use_noref(&rth->dst, jiffies);
2511                                 skb_dst_set_noref(skb, &rth->dst);
2512                         } else {
2513                                 dst_use(&rth->dst, jiffies);
2514                                 skb_dst_set(skb, &rth->dst);
2515                         }
2516                         RT_CACHE_STAT_INC(in_hit);
2517                         rcu_read_unlock();
2518                         return 0;
2519                 }
2520                 RT_CACHE_STAT_INC(in_hlist_search);
2521         }
2522
2523 skip_cache:
2524         /* Multicast recognition logic was moved from the route cache to here.
2525            The problem was that too many Ethernet cards have broken/missing
2526            hardware multicast filters :-( As a result, a host on a multicast
2527            network acquires a lot of useless route cache entries, e.g. for
2528            SDR messages from all over the world. Now we try to get rid of them.
2529            Really, provided the software IP multicast filter is organized
2530            reasonably (at least, hashed), this does not cause a slowdown
2531            compared with route cache reject entries.
2532            Note that multicast routers are not affected, because
2533            a route cache entry is created eventually.
2534          */
2535         if (ipv4_is_multicast(daddr)) {
2536                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2537
2538                 if (in_dev) {
2539                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2540                                                   ip_hdr(skb)->protocol);
2541                         if (our
2542 #ifdef CONFIG_IP_MROUTE
2543                                 ||
2544                             (!ipv4_is_local_multicast(daddr) &&
2545                              IN_DEV_MFORWARD(in_dev))
2546 #endif
2547                            ) {
2548                                 int res = ip_route_input_mc(skb, daddr, saddr,
2549                                                             tos, dev, our);
2550                                 rcu_read_unlock();
2551                                 return res;
2552                         }
2553                 }
2554                 rcu_read_unlock();
2555                 return -EINVAL;
2556         }
2557         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2558         rcu_read_unlock();
2559         return res;
2560 }
2561 EXPORT_SYMBOL(ip_route_input_common);
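/*
 * Editor's sketch (assumption, not from this file): callers on the receive
 * path supply the addresses from the IP header and the ingress device; a
 * hypothetical caller routing an incoming skb could look like this, with
 * noref=true meaning the skb holds only a borrowed (RCU-protected) reference
 * to the cached dst:
 */
static inline int example_route_incoming_skb(struct sk_buff *skb,
					     struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);

	return ip_route_input_common(skb, iph->daddr, iph->saddr,
				     iph->tos, dev, true);
}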
2562
2563 /* called with rcu_read_lock() */
2564 static struct rtable *__mkroute_output(const struct fib_result *res,
2565                                        const struct flowi4 *fl4,
2566                                        __be32 orig_daddr, __be32 orig_saddr,
2567                                        int orig_oif, __u8 orig_rtos,
2568                                        struct net_device *dev_out,
2569                                        unsigned int flags)
2570 {
2571         struct fib_info *fi = res->fi;
2572         struct in_device *in_dev;
2573         u16 type = res->type;
2574         struct rtable *rth;
2575
2576         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2577                 return ERR_PTR(-EINVAL);
2578
2579         if (ipv4_is_lbcast(fl4->daddr))
2580                 type = RTN_BROADCAST;
2581         else if (ipv4_is_multicast(fl4->daddr))
2582                 type = RTN_MULTICAST;
2583         else if (ipv4_is_zeronet(fl4->daddr))
2584                 return ERR_PTR(-EINVAL);
2585
2586         if (dev_out->flags & IFF_LOOPBACK)
2587                 flags |= RTCF_LOCAL;
2588
2589         in_dev = __in_dev_get_rcu(dev_out);
2590         if (!in_dev)
2591                 return ERR_PTR(-EINVAL);
2592
2593         if (type == RTN_BROADCAST) {
2594                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2595                 fi = NULL;
2596         } else if (type == RTN_MULTICAST) {
2597                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2598                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2599                                      fl4->flowi4_proto))
2600                         flags &= ~RTCF_LOCAL;
2601                 /* If a multicast route does not exist, use the
2602                  * default one, but do not gateway in this case.
2603                  * Yes, it is a hack.
2604                  */
2605                 if (fi && res->prefixlen < 4)
2606                         fi = NULL;
2607         }
2608
2609         rth = rt_dst_alloc(dev_out,
2610                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2611                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2612         if (!rth)
2613                 return ERR_PTR(-ENOBUFS);
2614
2615         rth->dst.output = ip_output;
2616
2617         rth->rt_key_dst = orig_daddr;
2618         rth->rt_key_src = orig_saddr;
2619         rth->rt_genid = rt_genid(dev_net(dev_out));
2620         rth->rt_flags   = flags;
2621         rth->rt_type    = type;
2622         rth->rt_key_tos = orig_rtos;
2623         rth->rt_dst     = fl4->daddr;
2624         rth->rt_src     = fl4->saddr;
2625         rth->rt_route_iif = 0;
2626         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2627         rth->rt_oif     = orig_oif;
2628         rth->rt_mark    = fl4->flowi4_mark;
2629         rth->rt_gateway = fl4->daddr;
2630         rth->rt_spec_dst= fl4->saddr;
2631         rth->rt_peer_genid = 0;
2632         rth->peer = NULL;
2633         rth->fi = NULL;
2634
2635         RT_CACHE_STAT_INC(out_slow_tot);
2636
2637         if (flags & RTCF_LOCAL) {
2638                 rth->dst.input = ip_local_deliver;
2639                 rth->rt_spec_dst = fl4->daddr;
2640         }
2641         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2642                 rth->rt_spec_dst = fl4->saddr;
2643                 if (flags & RTCF_LOCAL &&
2644                     !(dev_out->flags & IFF_LOOPBACK)) {
2645                         rth->dst.output = ip_mc_output;
2646                         RT_CACHE_STAT_INC(out_slow_mc);
2647                 }
2648 #ifdef CONFIG_IP_MROUTE
2649                 if (type == RTN_MULTICAST) {
2650                         if (IN_DEV_MFORWARD(in_dev) &&
2651                             !ipv4_is_local_multicast(fl4->daddr)) {
2652                                 rth->dst.input = ip_mr_input;
2653                                 rth->dst.output = ip_mc_output;
2654                         }
2655                 }
2656 #endif
2657         }
2658
2659         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2660
2661         return rth;
2662 }
2663
2664 /*
2665  * Major route resolver routine.
2666  * called with rcu_read_lock();
2667  */
2668
2669 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2670 {
2671         struct net_device *dev_out = NULL;
2672         __u8 tos = RT_FL_TOS(fl4);
2673         unsigned int flags = 0;
2674         struct fib_result res;
2675         struct rtable *rth;
2676         __be32 orig_daddr;
2677         __be32 orig_saddr;
2678         int orig_oif;
2679
2680         res.fi          = NULL;
2681 #ifdef CONFIG_IP_MULTIPLE_TABLES
2682         res.r           = NULL;
2683 #endif
2684
2685         orig_daddr = fl4->daddr;
2686         orig_saddr = fl4->saddr;
2687         orig_oif = fl4->flowi4_oif;
2688
2689         fl4->flowi4_iif = net->loopback_dev->ifindex;
2690         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2691         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2692                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2693
2694         rcu_read_lock();
2695         if (fl4->saddr) {
2696                 rth = ERR_PTR(-EINVAL);
2697                 if (ipv4_is_multicast(fl4->saddr) ||
2698                     ipv4_is_lbcast(fl4->saddr) ||
2699                     ipv4_is_zeronet(fl4->saddr))
2700                         goto out;
2701
2702                 /* I removed the check for oif == dev_out->oif here.
2703                    It was wrong for two reasons:
2704                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2705                       is assigned to multiple interfaces.
2706                    2. Moreover, we are allowed to send packets with the saddr
2707                       of another iface. --ANK
2708                  */
2709
2710                 if (fl4->flowi4_oif == 0 &&
2711                     (ipv4_is_multicast(fl4->daddr) ||
2712                      ipv4_is_lbcast(fl4->daddr))) {
2713                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2714                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2715                         if (dev_out == NULL)
2716                                 goto out;
2717
2718                         /* Special hack: the user can direct multicasts
2719                            and limited broadcast via the necessary interface
2720                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2721                            This hack is not just for fun, it allows
2722                            vic, vat and friends to work.
2723                            They bind a socket to loopback, set ttl to zero
2724                            and expect that it will work.
2725                            From the viewpoint of the routing cache they are broken,
2726                            because we are not allowed to build a multicast path
2727                            with a loopback source addr (look, the routing cache
2728                            cannot know that ttl is zero, so the packet
2729                            will not leave this host and the route is valid).
2730                            Luckily, this hack is a good workaround.
2731                          */
2732
2733                         fl4->flowi4_oif = dev_out->ifindex;
2734                         goto make_route;
2735                 }
2736
2737                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2738                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2739                         if (!__ip_dev_find(net, fl4->saddr, false))
2740                                 goto out;
2741                 }
2742         }
2743
2744
2745         if (fl4->flowi4_oif) {
2746                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2747                 rth = ERR_PTR(-ENODEV);
2748                 if (dev_out == NULL)
2749                         goto out;
2750
2751                 /* RACE: Check return value of inet_select_addr instead. */
2752                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2753                         rth = ERR_PTR(-ENETUNREACH);
2754                         goto out;
2755                 }
2756                 if (ipv4_is_local_multicast(fl4->daddr) ||
2757                     ipv4_is_lbcast(fl4->daddr)) {
2758                         if (!fl4->saddr)
2759                                 fl4->saddr = inet_select_addr(dev_out, 0,
2760                                                               RT_SCOPE_LINK);
2761                         goto make_route;
2762                 }
2763                 if (!fl4->saddr) {
2764                         if (ipv4_is_multicast(fl4->daddr))
2765                                 fl4->saddr = inet_select_addr(dev_out, 0,
2766                                                               fl4->flowi4_scope);
2767                         else if (!fl4->daddr)
2768                                 fl4->saddr = inet_select_addr(dev_out, 0,
2769                                                               RT_SCOPE_HOST);
2770                 }
2771         }
2772
2773         if (!fl4->daddr) {
2774                 fl4->daddr = fl4->saddr;
2775                 if (!fl4->daddr)
2776                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2777                 dev_out = net->loopback_dev;
2778                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2779                 res.type = RTN_LOCAL;
2780                 flags |= RTCF_LOCAL;
2781                 goto make_route;
2782         }
2783
2784         if (fib_lookup(net, fl4, &res)) {
2785                 res.fi = NULL;
2786                 if (fl4->flowi4_oif) {
2787                         /* Apparently, the routing tables are wrong. Assume
2788                            that the destination is on link.
2789
2790                            WHY? DW.
2791                            Because we are allowed to send to an iface
2792                            even if it has NO routes and NO assigned
2793                            addresses. When oif is specified, routing
2794                            tables are looked up with only one purpose:
2795                            to catch if the destination is gatewayed, rather than
2796                            direct. Moreover, if MSG_DONTROUTE is set,
2797                            we send the packet, ignoring both the routing tables
2798                            and ifaddr state. --ANK
2799
2800
2801                            We could do this even if oif is unknown,
2802                            likely IPv6, but we do not.
2803                          */
2804
2805                         if (fl4->saddr == 0)
2806                                 fl4->saddr = inet_select_addr(dev_out, 0,
2807                                                               RT_SCOPE_LINK);
2808                         res.type = RTN_UNICAST;
2809                         goto make_route;
2810                 }
2811                 rth = ERR_PTR(-ENETUNREACH);
2812                 goto out;
2813         }
2814
2815         if (res.type == RTN_LOCAL) {
2816                 if (!fl4->saddr) {
2817                         if (res.fi->fib_prefsrc)
2818                                 fl4->saddr = res.fi->fib_prefsrc;
2819                         else
2820                                 fl4->saddr = fl4->daddr;
2821                 }
2822                 dev_out = net->loopback_dev;
2823                 fl4->flowi4_oif = dev_out->ifindex;
2824                 res.fi = NULL;
2825                 flags |= RTCF_LOCAL;
2826                 goto make_route;
2827         }
2828
2829 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2830         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2831                 fib_select_multipath(&res);
2832         else
2833 #endif
2834         if (!res.prefixlen &&
2835             res.table->tb_num_default > 1 &&
2836             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2837                 fib_select_default(&res);
2838
2839         if (!fl4->saddr)
2840                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2841
2842         dev_out = FIB_RES_DEV(res);
2843         fl4->flowi4_oif = dev_out->ifindex;
2844
2845
2846 make_route:
2847         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2848                                tos, dev_out, flags);
2849         if (!IS_ERR(rth)) {
2850                 unsigned int hash;
2851
2852                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2853                                rt_genid(dev_net(dev_out)));
2854                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2855         }
2856
2857 out:
2858         rcu_read_unlock();
2859         return rth;
2860 }
2861
2862 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2863 {
2864         struct rtable *rth;
2865         unsigned int hash;
2866
2867         if (!rt_caching(net))
2868                 goto slow_output;
2869
2870         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2871
2872         rcu_read_lock_bh();
2873         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2874                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2875                 if (rth->rt_key_dst == flp4->daddr &&
2876                     rth->rt_key_src == flp4->saddr &&
2877                     rt_is_output_route(rth) &&
2878                     rth->rt_oif == flp4->flowi4_oif &&
2879                     rth->rt_mark == flp4->flowi4_mark &&
2880                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2881                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2882                     net_eq(dev_net(rth->dst.dev), net) &&
2883                     !rt_is_expired(rth)) {
2884                         ipv4_validate_peer(rth);
2885                         dst_use(&rth->dst, jiffies);
2886                         RT_CACHE_STAT_INC(out_hit);
2887                         rcu_read_unlock_bh();
2888                         if (!flp4->saddr)
2889                                 flp4->saddr = rth->rt_src;
2890                         if (!flp4->daddr)
2891                                 flp4->daddr = rth->rt_dst;
2892                         return rth;
2893                 }
2894                 RT_CACHE_STAT_INC(out_hlist_search);
2895         }
2896         rcu_read_unlock_bh();
2897
2898 slow_output:
2899         return ip_route_output_slow(net, flp4);
2900 }
2901 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2902
2903 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2904 {
2905         return NULL;
2906 }
2907
2908 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2909 {
2910         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2911
2912         return mtu ? : dst->dev->mtu;
2913 }
2914
2915 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2916 {
2917 }
2918
2919 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2920                                           unsigned long old)
2921 {
2922         return NULL;
2923 }
2924
2925 static struct dst_ops ipv4_dst_blackhole_ops = {
2926         .family                 =       AF_INET,
2927         .protocol               =       cpu_to_be16(ETH_P_IP),
2928         .destroy                =       ipv4_dst_destroy,
2929         .check                  =       ipv4_blackhole_dst_check,
2930         .mtu                    =       ipv4_blackhole_mtu,
2931         .default_advmss         =       ipv4_default_advmss,
2932         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2933         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2934         .neigh_lookup           =       ipv4_neigh_lookup,
2935 };
2936
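/*
 * Clone an existing output route into a blackhole dst: the routing keys,
 * metrics, flags, peer and fib_info references are copied so the entry
 * still describes the flow, but nothing sent over it is transmitted.
 * The reference on the original dst is dropped.  A typical caller is the
 * xfrm code, which needs to return something valid-looking while an
 * IPsec state is still being negotiated.
 */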
2937 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2938 {
2939         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2940         struct rtable *ort = (struct rtable *) dst_orig;
2941
2942         if (rt) {
2943                 struct dst_entry *new = &rt->dst;
2944
2945                 new->__use = 1;
2946                 new->input = dst_discard;
2947                 new->output = dst_discard;
2948                 dst_copy_metrics(new, &ort->dst);
2949
2950                 new->dev = ort->dst.dev;
2951                 if (new->dev)
2952                         dev_hold(new->dev);
2953
2954                 rt->rt_key_dst = ort->rt_key_dst;
2955                 rt->rt_key_src = ort->rt_key_src;
2956                 rt->rt_key_tos = ort->rt_key_tos;
2957                 rt->rt_route_iif = ort->rt_route_iif;
2958                 rt->rt_iif = ort->rt_iif;
2959                 rt->rt_oif = ort->rt_oif;
2960                 rt->rt_mark = ort->rt_mark;
2961
2962                 rt->rt_genid = rt_genid(net);
2963                 rt->rt_flags = ort->rt_flags;
2964                 rt->rt_type = ort->rt_type;
2965                 rt->rt_dst = ort->rt_dst;
2966                 rt->rt_src = ort->rt_src;
2967                 rt->rt_gateway = ort->rt_gateway;
2968                 rt->rt_spec_dst = ort->rt_spec_dst;
2969                 rt->peer = ort->peer;
2970                 if (rt->peer)
2971                         atomic_inc(&rt->peer->refcnt);
2972                 rt->fi = ort->fi;
2973                 if (rt->fi)
2974                         atomic_inc(&rt->fi->fib_clntref);
2975
2976                 dst_free(new);
2977         }
2978
2979         dst_release(dst_orig);
2980
2981         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2982 }
2983
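/*
 * Resolve an output route for @flp4 and, when a transport protocol is set
 * in the flow, pass the result through xfrm_lookup() so a matching IPsec
 * policy can transform (or blackhole) it.
 *
 * Typical caller pattern (a sketch only, error handling abbreviated):
 *
 *	struct flowi4 fl4 = {
 *		.flowi4_oif   = oif,
 *		.flowi4_proto = IPPROTO_UDP,
 *		.daddr        = daddr,
 *		.saddr        = saddr,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */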
2984 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2985                                     struct sock *sk)
2986 {
2987         struct rtable *rt = __ip_route_output_key(net, flp4);
2988
2989         if (IS_ERR(rt))
2990                 return rt;
2991
2992         if (flp4->flowi4_proto)
2993                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2994                                                    flowi4_to_flowi(flp4),
2995                                                    sk, 0);
2996
2997         return rt;
2998 }
2999 EXPORT_SYMBOL_GPL(ip_route_output_flow);
3000
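/*
 * Fill @skb with an RTM_NEWROUTE message describing the route attached to
 * it: dst/src keys, output device, preferred source, gateway, metrics and
 * firewall mark, plus expiry and TCP timestamp data taken from the
 * inet_peer via rtnl_put_cacheinfo().  Entries come from the route cache,
 * so RTM_F_CLONED is always set and the table is reported as "main".  For
 * forwarded multicast input routes the input interface information is
 * obtained from ipmr_get_route() instead.
 */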
3001 static int rt_fill_info(struct net *net,
3002                         struct sk_buff *skb, u32 pid, u32 seq, int event,
3003                         int nowait, unsigned int flags)
3004 {
3005         struct rtable *rt = skb_rtable(skb);
3006         struct rtmsg *r;
3007         struct nlmsghdr *nlh;
3008         unsigned long expires = 0;
3009         const struct inet_peer *peer = rt->peer;
3010         u32 id = 0, ts = 0, tsage = 0, error;
3011
3012         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
3013         if (nlh == NULL)
3014                 return -EMSGSIZE;
3015
3016         r = nlmsg_data(nlh);
3017         r->rtm_family    = AF_INET;
3018         r->rtm_dst_len  = 32;
3019         r->rtm_src_len  = 0;
3020         r->rtm_tos      = rt->rt_key_tos;
3021         r->rtm_table    = RT_TABLE_MAIN;
3022         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
3023         r->rtm_type     = rt->rt_type;
3024         r->rtm_scope    = RT_SCOPE_UNIVERSE;
3025         r->rtm_protocol = RTPROT_UNSPEC;
3026         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
3027         if (rt->rt_flags & RTCF_NOTIFY)
3028                 r->rtm_flags |= RTM_F_NOTIFY;
3029
3030         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
3031
3032         if (rt->rt_key_src) {
3033                 r->rtm_src_len = 32;
3034                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
3035         }
3036         if (rt->dst.dev)
3037                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
3038 #ifdef CONFIG_IP_ROUTE_CLASSID
3039         if (rt->dst.tclassid)
3040                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3041 #endif
3042         if (rt_is_input_route(rt))
3043                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3044         else if (rt->rt_src != rt->rt_key_src)
3045                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3046
3047         if (rt->rt_dst != rt->rt_gateway)
3048                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3049
3050         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3051                 goto nla_put_failure;
3052
3053         if (rt->rt_mark)
3054                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3055
3056         error = rt->dst.error;
3057         if (peer) {
3058                 inet_peer_refcheck(rt->peer);
3059                 if (peer->tcp_ts_stamp) {
3060                         ts = peer->tcp_ts;
3061                         tsage = get_seconds() - peer->tcp_ts_stamp;
3062                 }
3063                 expires = ACCESS_ONCE(peer->pmtu_expires);
3064                 if (expires) {
3065                         if (time_before(jiffies, expires))
3066                                 expires -= jiffies;
3067                         else
3068                                 expires = 0;
3069                 }
3070         }
3071
3072         if (rt_is_input_route(rt)) {
3073 #ifdef CONFIG_IP_MROUTE
3074                 __be32 dst = rt->rt_dst;
3075
3076                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3077                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3078                         int err = ipmr_get_route(net, skb,
3079                                                  rt->rt_src, rt->rt_dst,
3080                                                  r, nowait);
3081                         if (err <= 0) {
3082                                 if (!nowait) {
3083                                         if (err == 0)
3084                                                 return 0;
3085                                         goto nla_put_failure;
3086                                 } else {
3087                                         if (err == -EMSGSIZE)
3088                                                 goto nla_put_failure;
3089                                         error = err;
3090                                 }
3091                         }
3092                 } else
3093 #endif
3094                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3095         }
3096
3097         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3098                                expires, error) < 0)
3099                 goto nla_put_failure;
3100
3101         return nlmsg_end(skb, nlh);
3102
3103 nla_put_failure:
3104         nlmsg_cancel(skb, nlh);
3105         return -EMSGSIZE;
3106 }
3107
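/*
 * RTM_GETROUTE handler (what "ip route get" uses): parse the request
 * attributes, then either simulate reception of a packet on the given
 * input device (RTA_IIF set, via ip_route_input()) or perform a normal
 * output lookup with ip_route_output_key(), and report the resulting
 * route back to the requester with rt_fill_info().
 */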
3108 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3109 {
3110         struct net *net = sock_net(in_skb->sk);
3111         struct rtmsg *rtm;
3112         struct nlattr *tb[RTA_MAX+1];
3113         struct rtable *rt = NULL;
3114         __be32 dst = 0;
3115         __be32 src = 0;
3116         u32 iif;
3117         int err;
3118         int mark;
3119         struct sk_buff *skb;
3120
3121         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3122         if (err < 0)
3123                 goto errout;
3124
3125         rtm = nlmsg_data(nlh);
3126
3127         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3128         if (skb == NULL) {
3129                 err = -ENOBUFS;
3130                 goto errout;
3131         }
3132
3133         /* Reserve room for dummy headers; this skb can pass
3134            through a good chunk of the routing engine.
3135          */
3136         skb_reset_mac_header(skb);
3137         skb_reset_network_header(skb);
3138
3139         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3140         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3141         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3142
3143         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3144         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3145         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3146         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3147
3148         if (iif) {
3149                 struct net_device *dev;
3150
3151                 dev = __dev_get_by_index(net, iif);
3152                 if (dev == NULL) {
3153                         err = -ENODEV;
3154                         goto errout_free;
3155                 }
3156
3157                 skb->protocol   = htons(ETH_P_IP);
3158                 skb->dev        = dev;
3159                 skb->mark       = mark;
3160                 local_bh_disable();
3161                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3162                 local_bh_enable();
3163
3164                 rt = skb_rtable(skb);
3165                 if (err == 0 && rt->dst.error)
3166                         err = -rt->dst.error;
3167         } else {
3168                 struct flowi4 fl4 = {
3169                         .daddr = dst,
3170                         .saddr = src,
3171                         .flowi4_tos = rtm->rtm_tos,
3172                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3173                         .flowi4_mark = mark,
3174                 };
3175                 rt = ip_route_output_key(net, &fl4);
3176
3177                 err = 0;
3178                 if (IS_ERR(rt))
3179                         err = PTR_ERR(rt);
3180         }
3181
3182         if (err)
3183                 goto errout_free;
3184
3185         skb_dst_set(skb, &rt->dst);
3186         if (rtm->rtm_flags & RTM_F_NOTIFY)
3187                 rt->rt_flags |= RTCF_NOTIFY;
3188
3189         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3190                            RTM_NEWROUTE, 0, 0);
3191         if (err <= 0)
3192                 goto errout_free;
3193
3194         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3195 errout:
3196         return err;
3197
3198 errout_free:
3199         kfree_skb(skb);
3200         goto errout;
3201 }
3202
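/*
 * Dump callback for RTM_GETROUTE with NLM_F_DUMP: walk every chain of the
 * route cache hash table and emit one RTM_NEWROUTE message per non-expired
 * entry belonging to the requesting namespace.  cb->args[] records the
 * hash bucket and chain index so an interrupted dump can resume where it
 * left off.
 */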
3203 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3204 {
3205         struct rtable *rt;
3206         int h, s_h;
3207         int idx, s_idx;
3208         struct net *net;
3209
3210         net = sock_net(skb->sk);
3211
3212         s_h = cb->args[0];
3213         if (s_h < 0)
3214                 s_h = 0;
3215         s_idx = idx = cb->args[1];
3216         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3217                 if (!rt_hash_table[h].chain)
3218                         continue;
3219                 rcu_read_lock_bh();
3220                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3221                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3222                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3223                                 continue;
3224                         if (rt_is_expired(rt))
3225                                 continue;
3226                         skb_dst_set_noref(skb, &rt->dst);
3227                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3228                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3229                                          1, NLM_F_MULTI) <= 0) {
3230                                 skb_dst_drop(skb);
3231                                 rcu_read_unlock_bh();
3232                                 goto done;
3233                         }
3234                         skb_dst_drop(skb);
3235                 }
3236                 rcu_read_unlock_bh();
3237         }
3238
3239 done:
3240         cb->args[0] = h;
3241         cb->args[1] = idx;
3242         return skb->len;
3243 }
3244
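/* Multicast configuration on a device changed: drop all cached routes. */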
3245 void ip_rt_multicast_event(struct in_device *in_dev)
3246 {
3247         rt_cache_flush(dev_net(in_dev->dev), 0);
3248 }
3249
3250 #ifdef CONFIG_SYSCTL
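/*
 * Handler for the write-only /proc/sys/net/ipv4/route/flush entry: the
 * value written is parsed as a flush delay and handed to rt_cache_flush()
 * for the owning namespace, e.g. via "sysctl -w net.ipv4.route.flush=1".
 * Reads are rejected with -EINVAL.
 */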
3251 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3252                                         void __user *buffer,
3253                                         size_t *lenp, loff_t *ppos)
3254 {
3255         if (write) {
3256                 int flush_delay;
3257                 ctl_table ctl;
3258                 struct net *net;
3259
3260                 memcpy(&ctl, __ctl, sizeof(ctl));
3261                 ctl.data = &flush_delay;
3262                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3263
3264                 net = (struct net *)__ctl->extra1;
3265                 rt_cache_flush(net, flush_delay);
3266                 return 0;
3267         }
3268
3269         return -EINVAL;
3270 }
3271
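/*
 * Global (not per-namespace) tunables exported under
 * /proc/sys/net/ipv4/route/; the per-namespace "flush" entry is
 * registered separately below.
 */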
3272 static ctl_table ipv4_route_table[] = {
3273         {
3274                 .procname       = "gc_thresh",
3275                 .data           = &ipv4_dst_ops.gc_thresh,
3276                 .maxlen         = sizeof(int),
3277                 .mode           = 0644,
3278                 .proc_handler   = proc_dointvec,
3279         },
3280         {
3281                 .procname       = "max_size",
3282                 .data           = &ip_rt_max_size,
3283                 .maxlen         = sizeof(int),
3284                 .mode           = 0644,
3285                 .proc_handler   = proc_dointvec,
3286         },
3287         {
3288                 /* Deprecated. Use gc_min_interval_ms */
3289
3290                 .procname       = "gc_min_interval",
3291                 .data           = &ip_rt_gc_min_interval,
3292                 .maxlen         = sizeof(int),
3293                 .mode           = 0644,
3294                 .proc_handler   = proc_dointvec_jiffies,
3295         },
3296         {
3297                 .procname       = "gc_min_interval_ms",
3298                 .data           = &ip_rt_gc_min_interval,
3299                 .maxlen         = sizeof(int),
3300                 .mode           = 0644,
3301                 .proc_handler   = proc_dointvec_ms_jiffies,
3302         },
3303         {
3304                 .procname       = "gc_timeout",
3305                 .data           = &ip_rt_gc_timeout,
3306                 .maxlen         = sizeof(int),
3307                 .mode           = 0644,
3308                 .proc_handler   = proc_dointvec_jiffies,
3309         },
3310         {
3311                 .procname       = "gc_interval",
3312                 .data           = &ip_rt_gc_interval,
3313                 .maxlen         = sizeof(int),
3314                 .mode           = 0644,
3315                 .proc_handler   = proc_dointvec_jiffies,
3316         },
3317         {
3318                 .procname       = "redirect_load",
3319                 .data           = &ip_rt_redirect_load,
3320                 .maxlen         = sizeof(int),
3321                 .mode           = 0644,
3322                 .proc_handler   = proc_dointvec,
3323         },
3324         {
3325                 .procname       = "redirect_number",
3326                 .data           = &ip_rt_redirect_number,
3327                 .maxlen         = sizeof(int),
3328                 .mode           = 0644,
3329                 .proc_handler   = proc_dointvec,
3330         },
3331         {
3332                 .procname       = "redirect_silence",
3333                 .data           = &ip_rt_redirect_silence,
3334                 .maxlen         = sizeof(int),
3335                 .mode           = 0644,
3336                 .proc_handler   = proc_dointvec,
3337         },
3338         {
3339                 .procname       = "error_cost",
3340                 .data           = &ip_rt_error_cost,
3341                 .maxlen         = sizeof(int),
3342                 .mode           = 0644,
3343                 .proc_handler   = proc_dointvec,
3344         },
3345         {
3346                 .procname       = "error_burst",
3347                 .data           = &ip_rt_error_burst,
3348                 .maxlen         = sizeof(int),
3349                 .mode           = 0644,
3350                 .proc_handler   = proc_dointvec,
3351         },
3352         {
3353                 .procname       = "gc_elasticity",
3354                 .data           = &ip_rt_gc_elasticity,
3355                 .maxlen         = sizeof(int),
3356                 .mode           = 0644,
3357                 .proc_handler   = proc_dointvec,
3358         },
3359         {
3360                 .procname       = "mtu_expires",
3361                 .data           = &ip_rt_mtu_expires,
3362                 .maxlen         = sizeof(int),
3363                 .mode           = 0644,
3364                 .proc_handler   = proc_dointvec_jiffies,
3365         },
3366         {
3367                 .procname       = "min_pmtu",
3368                 .data           = &ip_rt_min_pmtu,
3369                 .maxlen         = sizeof(int),
3370                 .mode           = 0644,
3371                 .proc_handler   = proc_dointvec,
3372         },
3373         {
3374                 .procname       = "min_adv_mss",
3375                 .data           = &ip_rt_min_advmss,
3376                 .maxlen         = sizeof(int),
3377                 .mode           = 0644,
3378                 .proc_handler   = proc_dointvec,
3379         },
3380         { }
3381 };
3382
3383 static struct ctl_table empty[1];
3384
3385 static struct ctl_table ipv4_skeleton[] =
3386 {
3387         { .procname = "route",
3388           .mode = 0555, .child = ipv4_route_table},
3389         { .procname = "neigh",
3390           .mode = 0555, .child = empty},
3391         { }
3392 };
3393
3394 static __net_initdata struct ctl_path ipv4_path[] = {
3395         { .procname = "net", },
3396         { .procname = "ipv4", },
3397         { },
3398 };
3399
3400 static struct ctl_table ipv4_route_flush_table[] = {
3401         {
3402                 .procname       = "flush",
3403                 .maxlen         = sizeof(int),
3404                 .mode           = 0200,
3405                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3406         },
3407         { },
3408 };
3409
3410 static __net_initdata struct ctl_path ipv4_route_path[] = {
3411         { .procname = "net", },
3412         { .procname = "ipv4", },
3413         { .procname = "route", },
3414         { },
3415 };
3416
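/*
 * Register the per-namespace "flush" sysctl.  init_net can use the static
 * table directly; other namespaces get a kmemdup()ed copy so that extra1
 * can point at their own struct net.
 */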
3417 static __net_init int sysctl_route_net_init(struct net *net)
3418 {
3419         struct ctl_table *tbl;
3420
3421         tbl = ipv4_route_flush_table;
3422         if (!net_eq(net, &init_net)) {
3423                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3424                 if (tbl == NULL)
3425                         goto err_dup;
3426         }
3427         tbl[0].extra1 = net;
3428
3429         net->ipv4.route_hdr =
3430                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3431         if (net->ipv4.route_hdr == NULL)
3432                 goto err_reg;
3433         return 0;
3434
3435 err_reg:
3436         if (tbl != ipv4_route_flush_table)
3437                 kfree(tbl);
3438 err_dup:
3439         return -ENOMEM;
3440 }
3441
3442 static __net_exit void sysctl_route_net_exit(struct net *net)
3443 {
3444         struct ctl_table *tbl;
3445
3446         tbl = net->ipv4.route_hdr->ctl_table_arg;
3447         unregister_net_sysctl_table(net->ipv4.route_hdr);
3448         BUG_ON(tbl == ipv4_route_flush_table);
3449         kfree(tbl);
3450 }
3451
3452 static __net_initdata struct pernet_operations sysctl_route_ops = {
3453         .init = sysctl_route_net_init,
3454         .exit = sysctl_route_net_exit,
3455 };
3456 #endif
3457
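/*
 * Seed each namespace's route cache generation id (and device address
 * generation id) with random values; invalidation later works by changing
 * rt_genid, which makes every cached entry compare as expired.
 */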
3458 static __net_init int rt_genid_init(struct net *net)
3459 {
3460         get_random_bytes(&net->ipv4.rt_genid,
3461                          sizeof(net->ipv4.rt_genid));
3462         get_random_bytes(&net->ipv4.dev_addr_genid,
3463                          sizeof(net->ipv4.dev_addr_genid));
3464         return 0;
3465 }
3466
3467 static __net_initdata struct pernet_operations rt_genid_ops = {
3468         .init = rt_genid_init,
3469 };
3470
3471
3472 #ifdef CONFIG_IP_ROUTE_CLASSID
3473 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3474 #endif /* CONFIG_IP_ROUTE_CLASSID */
3475
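/*
 * "rhash_entries=N" on the kernel command line overrides the size of the
 * route cache hash table allocated in ip_rt_init() below.
 */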
3476 static __initdata unsigned long rhash_entries;
3477 static int __init set_rhash_entries(char *str)
3478 {
3479         if (!str)
3480                 return 0;
3481         rhash_entries = simple_strtoul(str, &str, 0);
3482         return 1;
3483 }
3484 __setup("rhash_entries=", set_rhash_entries);
3485
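/*
 * Boot-time initialisation of the IPv4 routing layer: allocate the
 * ip_idents array used for IP ID generation and the rtable slab, size and
 * allocate the route cache hash table, initialise devinet and the FIB,
 * kick off the periodic rt_worker_func garbage-collection work, create
 * the /proc files, hook up xfrm when configured, and register the
 * RTM_GETROUTE handler plus the per-net sysctl and genid operations.
 */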
3486 int __init ip_rt_init(void)
3487 {
3488         int rc = 0;
3489
3490         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3491         if (!ip_idents)
3492                 panic("IP: failed to allocate ip_idents\n");
3493
3494         get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3495
3496 #ifdef CONFIG_IP_ROUTE_CLASSID
3497         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3498         if (!ip_rt_acct)
3499                 panic("IP: failed to allocate ip_rt_acct\n");
3500 #endif
3501
3502         ipv4_dst_ops.kmem_cachep =
3503                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3504                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3505
3506         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3507
3508         if (dst_entries_init(&ipv4_dst_ops) < 0)
3509                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3510
3511         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3512                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3513
3514         rt_hash_table = (struct rt_hash_bucket *)
3515                 alloc_large_system_hash("IP route cache",
3516                                         sizeof(struct rt_hash_bucket),
3517                                         rhash_entries,
3518                                         (totalram_pages >= 128 * 1024) ?
3519                                         15 : 17,
3520                                         0,
3521                                         &rt_hash_log,
3522                                         &rt_hash_mask,
3523                                         rhash_entries ? 0 : 512 * 1024);
3524         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3525         rt_hash_lock_init();
3526
3527         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3528         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3529
3530         devinet_init();
3531         ip_fib_init();
3532
3533         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3534         expires_ljiffies = jiffies;
3535         schedule_delayed_work(&expires_work,
3536                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3537
3538         if (ip_rt_proc_init())
3539                 printk(KERN_ERR "Unable to create route proc files\n");
3540 #ifdef CONFIG_XFRM
3541         xfrm_init();
3542         xfrm4_init(ip_rt_max_size);
3543 #endif
3544         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3545
3546 #ifdef CONFIG_SYSCTL
3547         register_pernet_subsys(&sysctl_route_ops);
3548 #endif
3549         register_pernet_subsys(&rt_genid_ops);
3550         return rc;
3551 }
3552
3553 #ifdef CONFIG_SYSCTL
3554 /*
3555  * We really need to sanitize the damn ipv4 init order, then all
3556  * this nonsense will go away.
3557  */
3558 void __init ip_static_sysctl_init(void)
3559 {
3560         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3561 }
3562 #endif