Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wirel...
[pandora-kernel.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111
/* Mask a flow's TOS down to the bits that matter for route lookup,
 * keeping the RTO_ONLINK flag.
 */
#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Upper bound for a route's MTU, slightly below the 16-bit limit. */
#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

/* Tunables (adjustable at runtime via sysctl); __read_mostly keeps the
 * rarely-written values off hot cachelines.
 */
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;
132
/*
 *      Interface to generic destination cache.
 */

/* Forward declarations for the dst_ops callbacks defined later in
 * this file; see ipv4_dst_ops below.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);
147
/* dst_ops .ifdown hook: IPv4 routes keep no per-device state that needs
 * tearing down here, so this is intentionally a no-op.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}
152
/* dst_ops .cow_metrics hook: should never be reached for IPv4 routes,
 * so warn loudly and refuse to hand out a metrics block.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}
158
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

/* Generic destination-cache operations table for IPv4 routes; wires the
 * callbacks above into the dst layer.
 */
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};
179
#define ECN_OR_COST(class)      TC_PRIO_##class

/* Lookup table mapping IPv4 TOS values to traffic-control priority
 * classes.  Exported for use by qdiscs and other subsystems.
 */
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
201
/* Per-CPU route statistics, reported through the rt_cpu seq_file below. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208         if (*pos)
209                 return NULL;
210         return SEQ_START_TOKEN;
211 }
212
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215         ++*pos;
216         return NULL;
217 }
218
/* seq_file .stop: no iterator state to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
222
/* seq_file .show: emit only the legacy column header; no per-route
 * entries are ever produced by this iterator.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}
232
/* seq_file plumbing for /proc/net/rt_cache. */
static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
252
253
/* seq_file .start for the per-CPU statistics: position 0 is the header
 * token; position N (N >= 1) maps to the first possible CPU at index
 * N-1 or later.  *pos is advanced past the CPU returned.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}
269
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272         int cpu;
273
274         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275                 if (!cpu_possible(cpu))
276                         continue;
277                 *pos = cpu+1;
278                 return &per_cpu(rt_cache_stat, cpu);
279         }
280         return NULL;
281
282 }
283
/* seq_file .stop: no iterator state to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
288
/* seq_file .show: print the header for the start token, otherwise one
 * line of hex counters for the per-CPU rt_cache_stat record in v.  The
 * first column (entries) is global, taken from the dst-entry counter,
 * so it repeats on every CPU line.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}
322
/* seq_file plumbing for the per-CPU statistics file. */
static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
343
344 #ifdef CONFIG_IP_ROUTE_CLASSID
/* Dump routing-classid accounting: sum the 256-slot per-CPU counters
 * into a temporary array and write it out raw (binary, not text).
 * Returns 0 on success or -ENOMEM if the scratch buffer allocation
 * fails.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
368
/* single-record seq_file plumbing for /proc/net/rt_acct. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
381 #endif
382
/* Per-namespace /proc setup.  Note there are two entries both named
 * "rt_cache": one under /proc/net (legacy route dump) and one under
 * /proc/net/stat (per-CPU counters).  Unwinds with gotos on failure;
 * returns 0 or -ENOMEM.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
                        &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}
413
/* Per-namespace /proc teardown; mirror of ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}
422
/* Register the /proc hooks for every network namespace. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}
432
433 #else
/* No-op stub when CONFIG_PROC_FS is disabled. */
static inline int ip_rt_proc_init(void)
{
        return 0;
}
438 #endif /* CONFIG_PROC_FS */
439
/* A cached route is stale when its generation id no longer matches the
 * namespace's current generation (bumped by rt_cache_flush()).
 */
static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
444
/* Invalidate all cached routes in @net at once by bumping the
 * generation counter; entries then fail rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}
449
/* dst_ops .neigh_lookup: resolve the next-hop neighbour for a route.
 * Key preference: the route's gateway if set, else the packet's
 * destination address, else the caller-supplied @daddr.  Falls back to
 * creating a new ARP entry when no cached neighbour exists.
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}
470
471 /*
472  * Peer allocation may fail only in serious out-of-memory conditions.  However
473  * we still can generate some output.
474  * Random ID selection looks a bit dangerous because we have no chances to
475  * select ID being unique in a reasonable period of time.
476  * But broken packet identifier may be better than no packet at all.
477  */
/* Fallback IP identification selection when no inet_peer is available:
 * mix the previous fallback id with the destination through
 * secure_ip_id() under a spinlock.  See the block comment above for why
 * the resulting ids are only best-effort unique.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}
490
/* Choose the IP header identification for an outgoing datagram.
 * Preferred path: a per-destination counter kept in the inet_peer
 * cache; if peer allocation fails (OOM), fall back to the global
 * generator above.  @more is forwarded to inet_getid().
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
506
/* Fill in an output flow key from an IP header plus either explicit
 * parameters or, when @sk is given, the socket's bound device, mark,
 * TOS and protocol (which override the explicit ones).
 */
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}
525
526 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
527                                const struct sock *sk)
528 {
529         const struct iphdr *iph = ip_hdr(skb);
530         int oif = skb->dev->ifindex;
531         u8 tos = RT_TOS(iph->tos);
532         u8 prot = iph->protocol;
533         u32 mark = skb->mark;
534
535         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
536 }
537
/* Build a flow key purely from connected-socket state (no packet in
 * hand).  The IP options are read under RCU because inet_opt may be
 * swapped concurrently; with strict source routing the first-hop
 * address replaces the destination.
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}
555
/* Build a flow key from whichever source is available: packet headers
 * when an skb is in hand, otherwise the socket's connection state.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (!skb)
                build_sk_flow_key(fl4, sk);
        else
                build_skb_flow_key(fl4, skb, sk);
}
564
/* Free a route after the current RCU grace period, so concurrent
 * lockless readers can finish with it first.
 */
static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}
569
/* Protects writers of the per-nexthop exception (fnhe) hash chains. */
static DEFINE_SPINLOCK(fnhe_lock);

/* Pick the least-recently-stamped exception in a bucket for reuse.
 * Called with fnhe_lock held (see update_or_create_fnhe()); the chain
 * is assumed non-empty.  Any cached route attached to the victim is
 * detached and RCU-freed before the entry is handed back.
 */
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;
        struct rtable *orig;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        orig = rcu_dereference(oldest->fnhe_rth);
        if (orig) {
                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
                rt_free(orig);
        }
        return oldest;
}
590
591 static inline u32 fnhe_hashfun(__be32 daddr)
592 {
593         u32 hval;
594
595         hval = (__force u32) daddr;
596         hval ^= (hval >> 11) ^ (hval >> 22);
597
598         return hval & (FNHE_HASH_SIZE - 1);
599 }
600
/* Record a per-destination exception (redirect gateway and/or PMTU) on
 * a nexthop.  Serialized by fnhe_lock.  The bucket array is allocated
 * lazily; an existing entry for @daddr is updated in place, otherwise a
 * new entry is inserted — recycling the oldest entry instead of
 * allocating once the chain exceeds FNHE_RECLAIM_DEPTH.  Allocation
 * failures are silently dropped (the exception is best-effort).
 */
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        /* Walk the chain looking for an existing entry for daddr,
         * counting depth for the reclaim decision below.
         */
        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                /* Update only the fields the caller supplied. */
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = expires;
                }
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;
        }

        /* Timestamp for LRU selection in fnhe_oldest(). */
        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
        return;
}
659
/* Process an ICMP redirect for @rt.  Validates the redirect before
 * acting on it: the code must be a known redirect type, the sender must
 * be our current gateway, and the advertised gateway must be a sane
 * unicast on-link address.  On success the new gateway is recorded as a
 * nexthop exception; with @kill_route the route itself is marked
 * obsolete so it gets replaced.  Rejected redirects are optionally
 * logged (rate-limited) when martian logging is on.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        /* Only the gateway we are actually using may redirect us. */
        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                /* Non-shared media: new gateway must be directly
                 * reachable, and optionally a known default gateway.
                 */
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        /* Kick off resolution; act on the redirect only
                         * once the new gateway is reachable.
                         */
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}
739
740 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
741 {
742         struct rtable *rt;
743         struct flowi4 fl4;
744
745         rt = (struct rtable *) dst;
746
747         ip_rt_build_flow_key(&fl4, sk, skb);
748         __ip_do_redirect(rt, skb, &fl4, true);
749 }
750
751 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
752 {
753         struct rtable *rt = (struct rtable *)dst;
754         struct dst_entry *ret = dst;
755
756         if (rt) {
757                 if (dst->obsolete > 0) {
758                         ip_rt_put(rt);
759                         ret = NULL;
760                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
761                            rt->dst.expires) {
762                         ip_rt_put(rt);
763                         ret = NULL;
764                 }
765         }
766         return ret;
767 }
768
769 /*
770  * Algorithm:
771  *      1. The first ip_rt_redirect_number redirects are sent
772  *         with exponential backoff, then we stop sending them at all,
773  *         assuming that the host ignores our redirects.
774  *      2. If we did not see packets requiring redirects
775  *         during ip_rt_redirect_silence, we assume that the host
776  *         forgot redirected route and start to send redirects again.
777  *
778  * This algorithm is much cheaper and more intelligent than dumb load limiting
779  * in icmp.c.
780  *
781  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
782  * and "frag. need" (breaks PMTU discovery) in icmp.c.
783  */
784
/* Send an ICMP redirect for @skb, rate-limited per source host via the
 * inet_peer cache (see the algorithm description above): exponential
 * backoff up to ip_rt_redirect_number redirects, then silence until
 * ip_rt_redirect_silence has passed.  With no peer entry (OOM) the
 * redirect is sent unthrottled.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything
         * set dst.rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                /* Warn once, exactly when the backoff limit is hit. */
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}
847
848 static int ip_error(struct sk_buff *skb)
849 {
850         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
851         struct rtable *rt = skb_rtable(skb);
852         struct inet_peer *peer;
853         unsigned long now;
854         struct net *net;
855         bool send;
856         int code;
857
858         net = dev_net(rt->dst.dev);
859         if (!IN_DEV_FORWARD(in_dev)) {
860                 switch (rt->dst.error) {
861                 case EHOSTUNREACH:
862                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
863                         break;
864
865                 case ENETUNREACH:
866                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
867                         break;
868                 }
869                 goto out;
870         }
871
872         switch (rt->dst.error) {
873         case EINVAL:
874         default:
875                 goto out;
876         case EHOSTUNREACH:
877                 code = ICMP_HOST_UNREACH;
878                 break;
879         case ENETUNREACH:
880                 code = ICMP_NET_UNREACH;
881                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
882                 break;
883         case EACCES:
884                 code = ICMP_PKT_FILTERED;
885                 break;
886         }
887
888         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
889
890         send = true;
891         if (peer) {
892                 now = jiffies;
893                 peer->rate_tokens += now - peer->rate_last;
894                 if (peer->rate_tokens > ip_rt_error_burst)
895                         peer->rate_tokens = ip_rt_error_burst;
896                 peer->rate_last = now;
897                 if (peer->rate_tokens >= ip_rt_error_cost)
898                         peer->rate_tokens -= ip_rt_error_cost;
899                 else
900                         send = false;
901                 inet_putpeer(peer);
902         }
903         if (send)
904                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
905
906 out:    kfree_skb(skb);
907         return 0;
908 }
909
/* Common PMTU-update helper: apply a newly learned path MTU to @rt and
 * record it as a nexthop exception for @fl4->daddr so later lookups
 * inherit it.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	/* An administratively locked MTU metric wins over learned PMTU. */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* Ignore a reported MTU larger than the egress device's MTU. */
	if (dst->dev->mtu < mtu)
		return;

	/* Clamp to the configured floor (resists too-small-PMTU abuse). */
	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (!rt->rt_pmtu) {
		/* No private PMTU slot on this route: mark it obsolete so
		 * the next dst_check() forces a relookup that picks up the
		 * exception created below.
		 */
		dst->obsolete = DST_OBSOLETE_KILL;
	} else {
		rt->rt_pmtu = mtu;
		/* max() with 1UL keeps expires nonzero — presumably 0
		 * reads as "no expiry"; confirm against dst expiry users.
		 */
		dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
	}

	/* Persist the learned PMTU in the FIB nexthop exception table. */
	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
940
941 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
942                               struct sk_buff *skb, u32 mtu)
943 {
944         struct rtable *rt = (struct rtable *) dst;
945         struct flowi4 fl4;
946
947         ip_rt_build_flow_key(&fl4, sk, skb);
948         __ip_rt_update_pmtu(rt, &fl4, mtu);
949 }
950
951 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
952                       int oif, u32 mark, u8 protocol, int flow_flags)
953 {
954         const struct iphdr *iph = (const struct iphdr *) skb->data;
955         struct flowi4 fl4;
956         struct rtable *rt;
957
958         __build_flow_key(&fl4, NULL, iph, oif,
959                          RT_TOS(iph->tos), protocol, mark, flow_flags);
960         rt = __ip_route_output_key(net, &fl4);
961         if (!IS_ERR(rt)) {
962                 __ip_rt_update_pmtu(rt, &fl4, mtu);
963                 ip_rt_put(rt);
964         }
965 }
966 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
967
968 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
969 {
970         const struct iphdr *iph = (const struct iphdr *) skb->data;
971         struct flowi4 fl4;
972         struct rtable *rt;
973
974         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
975         rt = __ip_route_output_key(sock_net(sk), &fl4);
976         if (!IS_ERR(rt)) {
977                 __ip_rt_update_pmtu(rt, &fl4, mtu);
978                 ip_rt_put(rt);
979         }
980 }
981
/* PMTU update for a packet tied to a socket: prefer the socket's cached
 * route so the update (and any resulting re-route) lands on the dst the
 * socket actually uses.  Falls back to a plain flow lookup when the
 * socket is owned by user context or has no cached route.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *dst;
	/* true once rt was looked up here and must be released/installed */
	bool new = false;

	bh_lock_sock(sk);
	rt = (struct rtable *) __sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !rt) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	/* Cached route is stale: fetch a fresh one before updating. */
	if (!__sk_dst_check(sk, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	/* Update via dst.path — the underlying route; presumably this
	 * skips any xfrm wrapping, TODO confirm against dst layering.
	 */
	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	/* The update may have invalidated the dst; re-route if so. */
	dst = dst_check(&rt->dst, 0);
	if (!dst) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		__sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1029
1030 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1031                    int oif, u32 mark, u8 protocol, int flow_flags)
1032 {
1033         const struct iphdr *iph = (const struct iphdr *) skb->data;
1034         struct flowi4 fl4;
1035         struct rtable *rt;
1036
1037         __build_flow_key(&fl4, NULL, iph, oif,
1038                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1039         rt = __ip_route_output_key(net, &fl4);
1040         if (!IS_ERR(rt)) {
1041                 __ip_do_redirect(rt, skb, &fl4, false);
1042                 ip_rt_put(rt);
1043         }
1044 }
1045 EXPORT_SYMBOL_GPL(ipv4_redirect);
1046
1047 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1048 {
1049         const struct iphdr *iph = (const struct iphdr *) skb->data;
1050         struct flowi4 fl4;
1051         struct rtable *rt;
1052
1053         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1054         rt = __ip_route_output_key(sock_net(sk), &fl4);
1055         if (!IS_ERR(rt)) {
1056                 __ip_do_redirect(rt, skb, &fl4, false);
1057                 ip_rt_put(rt);
1058         }
1059 }
1060 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1061
1062 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1063 {
1064         struct rtable *rt = (struct rtable *) dst;
1065
1066         /* All IPV4 dsts are created with ->obsolete set to the value
1067          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1068          * into this function always.
1069          *
1070          * When a PMTU/redirect information update invalidates a
1071          * route, this is indicated by setting obsolete to
1072          * DST_OBSOLETE_KILL.
1073          */
1074         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1075                 return NULL;
1076         return dst;
1077 }
1078
1079 static void ipv4_link_failure(struct sk_buff *skb)
1080 {
1081         struct rtable *rt;
1082
1083         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1084
1085         rt = skb_rtable(skb);
1086         if (rt)
1087                 dst_set_expires(&rt->dst, 0);
1088 }
1089
1090 static int ip_rt_bug(struct sk_buff *skb)
1091 {
1092         pr_debug("%s: %pI4 -> %pI4, %s\n",
1093                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1094                  skb->dev ? skb->dev->name : "?");
1095         kfree_skb(skb);
1096         WARN_ON(1);
1097         return 0;
1098 }
1099
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */
1108
1109 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1110 {
1111         __be32 src;
1112
1113         if (rt_is_output_route(rt))
1114                 src = ip_hdr(skb)->saddr;
1115         else {
1116                 struct fib_result res;
1117                 struct flowi4 fl4;
1118                 struct iphdr *iph;
1119
1120                 iph = ip_hdr(skb);
1121
1122                 memset(&fl4, 0, sizeof(fl4));
1123                 fl4.daddr = iph->daddr;
1124                 fl4.saddr = iph->saddr;
1125                 fl4.flowi4_tos = RT_TOS(iph->tos);
1126                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1127                 fl4.flowi4_iif = skb->dev->ifindex;
1128                 fl4.flowi4_mark = skb->mark;
1129
1130                 rcu_read_lock();
1131                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1132                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1133                 else
1134                         src = inet_select_addr(rt->dst.dev,
1135                                                rt_nexthop(rt, iph->daddr),
1136                                                RT_SCOPE_UNIVERSE);
1137                 rcu_read_unlock();
1138         }
1139         memcpy(addr, &src, 4);
1140 }
1141
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill each 16-bit half of dst.tclassid from @tag, but only where that
 * half is still zero — an existing value always wins.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 id = rt->dst.tclassid;

	if (!(id & 0xFFFF))
		id |= tag & 0xFFFF;
	if (!(id & 0xFFFF0000))
		id |= tag & 0xFFFF0000;
	rt->dst.tclassid = id;
}
#endif
1151
1152 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1153 {
1154         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1155
1156         if (advmss == 0) {
1157                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1158                                ip_rt_min_advmss);
1159                 if (advmss > 65535 - 40)
1160                         advmss = 65535 - 40;
1161         }
1162         return advmss;
1163 }
1164
1165 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1166 {
1167         const struct rtable *rt = (const struct rtable *) dst;
1168         unsigned int mtu = rt->rt_pmtu;
1169
1170         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1171                 mtu = dst_metric_raw(dst, RTAX_MTU);
1172
1173         if (mtu)
1174                 return mtu;
1175
1176         mtu = dst->dev->mtu;
1177
1178         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1179                 if (rt->rt_uses_gateway && mtu > 576)
1180                         mtu = 576;
1181         }
1182
1183         if (mtu > IP_MAX_MTU)
1184                 mtu = IP_MAX_MTU;
1185
1186         return mtu;
1187 }
1188
1189 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1190 {
1191         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1192         struct fib_nh_exception *fnhe;
1193         u32 hval;
1194
1195         if (!hash)
1196                 return NULL;
1197
1198         hval = fnhe_hashfun(daddr);
1199
1200         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1201              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1202                 if (fnhe->fnhe_daddr == daddr)
1203                         return fnhe;
1204         }
1205         return NULL;
1206 }
1207
/* Bind @rt to nexthop exception @fnhe for destination @daddr: copy the
 * exception's learned PMTU/redirect state into the route and make the
 * route the exception's cached dst.  Returns true when the bind took
 * place (daddr matched), false otherwise.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
		/* A cached route from an old generation invalidates the
		 * exception's learned data as well.
		 */
		if (orig && rt_is_expired(orig)) {
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
		}
		/* Propagate a still-valid PMTU with its remaining lifetime. */
		if (fnhe->fnhe_pmtu) {
			unsigned long expires = fnhe->fnhe_expires;
			unsigned long diff = expires - jiffies;

			if (time_before(jiffies, expires)) {
				rt->rt_pmtu = fnhe->fnhe_pmtu;
				dst_set_expires(&rt->dst, diff);
			}
		}
		/* A redirected gateway overrides the FIB nexthop. */
		if (fnhe->fnhe_gw) {
			rt->rt_flags |= RTCF_REDIRECTED;
			rt->rt_gateway = fnhe->fnhe_gw;
			rt->rt_uses_gateway = 1;
		} else if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		/* Publish rt as the cached route, then free the one it
		 * replaced.
		 */
		rcu_assign_pointer(fnhe->fnhe_rth, rt);
		if (orig)
			rt_free(orig);

		fnhe->fnhe_stamp = jiffies;
		ret = true;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1249
/* Try to install @rt as the cached route for nexthop @nh — the per-nh
 * input slot for input routes, otherwise this CPU's output slot.  The
 * cmpxchg means a concurrent updater simply makes us lose the race;
 * returns false in that case (caller must then treat rt as uncached).
 */
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		/* We won the slot: drop the route we displaced. */
		if (orig)
			rt_free(orig);
	} else
		ret = false;

	return ret;
}
1271
/* Routes not cached in any FIB nexthop are tracked on this global list
 * (protected by rt_uncached_lock) so rt_flush_dev() can still reach
 * them when their device goes away.
 */
static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);
1274
/* Register @rt on the global uncached-routes list so rt_flush_dev()
 * can later repoint it away from a disappearing device.
 */
static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}
1281
/* dst destructor: unlink the route from the uncached list (if it is on
 * one) before the dst memory is released.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (!list_empty(&rt->rt_uncached)) {
		spin_lock_bh(&rt_uncached_lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&rt_uncached_lock);
	}
}
1292
1293 void rt_flush_dev(struct net_device *dev)
1294 {
1295         if (!list_empty(&rt_uncached_list)) {
1296                 struct net *net = dev_net(dev);
1297                 struct rtable *rt;
1298
1299                 spin_lock_bh(&rt_uncached_lock);
1300                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1301                         if (rt->dst.dev != dev)
1302                                 continue;
1303                         rt->dst.dev = net->loopback_dev;
1304                         dev_hold(rt->dst.dev);
1305                         dev_put(dev);
1306                 }
1307                 spin_unlock_bh(&rt_uncached_lock);
1308         }
1309 }
1310
1311 static bool rt_cache_valid(const struct rtable *rt)
1312 {
1313         return  rt &&
1314                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1315                 !rt_is_expired(rt);
1316 }
1317
/* Finish initializing @rt from the FIB lookup result: gateway, metrics,
 * classid, and caching (nexthop-exception cache or per-nh cache).  A
 * route that could not be cached is flagged DST_NOCACHE and put on the
 * uncached list so device teardown can still find it.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Only a link-scope nexthop gateway is a real gateway. */
		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1361
1362 static struct rtable *rt_dst_alloc(struct net_device *dev,
1363                                    bool nopolicy, bool noxfrm, bool will_cache)
1364 {
1365         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1366                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1367                          (nopolicy ? DST_NOPOLICY : 0) |
1368                          (noxfrm ? DST_NOXFRM : 0));
1369 }
1370
/* called in rcu_read_lock() section */
/* Build an input route for a multicast packet: sanity-check the source,
 * validate it against the FIB, and attach a multicast rtable to the skb.
 * Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Multicast/broadcast source addresses are never legal, nor is
	 * non-IP traffic here.
	 */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	/* Loopback sources only pass when route_localnet is enabled. */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* A zero source is tolerated only for link-local multicast. */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast input routes must never be used for output. */
	rth->dst.output = ip_rt_bug;

	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_is_input= 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local multicast on a forwarding interface goes to the
	 * multicast router input path instead.
	 */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
1442
1443
/* Account (and, with verbose routing, log rate-limited) a packet whose
 * source address failed validation.  Per RFC 1812 the only useful hint
 * about the real sender is the link-layer header, so dump it too.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
1468
/* called in rcu_read_lock() section */
/* Build (or reuse from the nexthop cache) a forwarding route for an
 * input packet whose FIB lookup resolved to a unicast nexthop.  On
 * success the skb's dst is set and 0 is returned; otherwise a negative
 * errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* Caching is worthwhile only with FIB info and no source itag;
	 * a route that triggers a redirect must not be cached either.
	 */
	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
		flags |= RTCF_DOREDIRECT;
		do_cache = false;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	/* Reuse a still-valid cached nexthop route if there is one. */
	if (do_cache) {
		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1557
/* Select the nexthop (multipath-aware) and build the input route. */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	/* With several nexthops, pick one before building the route. */
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
1572
/*
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */
1583
/* Full input-route resolution for unicast/broadcast/local traffic:
 * martian filtering, FIB lookup, source validation, then construction
 * (or cache reuse) of the appropriate rtable.  Returns 0 with the
 * skb's dst set, or a negative errno.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and call it once if daddr or/and saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		/* Locally destined: validate the source against the FIB
		 * before delivering up the stack.
		 */
		err = fib_validate_source(skb, saddr, daddr, tos,
					  LOOPBACK_IFINDEX,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	/* Forwarding case: build (or reuse) a unicast input route. */
	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Try the per-nexthop input route cache first (only safe when
	 * there is FIB info and no source itag).
	 */
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (res.type == RTN_UNREACHABLE) {
		/* Deliver the error via ip_error() instead of locally. */
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	if (do_cache)
		rt_cache_route(&FIB_RES_NH(res), rth);
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
1760
/* Resolve the input route for @skb.  "noref" refers to the fact that the
 * slow path may attach the dst with skb_dst_set_noref(), i.e. without
 * taking a reference, so callers must obey the usual noref dst rules when
 * holding on to the result.
 *
 * Returns 0 on success or a negative errno (-EINVAL for multicast
 * packets that are neither for us nor forwardable).
 */
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                         u8 tos, struct net_device *dev)
{
        int res;

        rcu_read_lock();

        /* Multicast recognition logic is moved from route cache to here.
           The problem was that too many Ethernet cards have broken/missing
           hardware multicast filters :-( As result the host on multicasting
           network acquires a lot of useless route cache entries, sort of
           SDR messages from all the world. Now we try to get rid of them.
           Really, provided software IP multicast filter is organized
           reasonably (at least, hashed), it does not result in a slowdown
           comparing with route cache reject entries.
           Note, that multicast routers are not affected, because
           route cache entry is created eventually.
         */
        if (ipv4_is_multicast(daddr)) {
                struct in_device *in_dev = __in_dev_get_rcu(dev);

                if (in_dev) {
                        /* Accept if we are a member of the group, or (with
                         * CONFIG_IP_MROUTE) if the packet may need to be
                         * multicast-forwarded.
                         */
                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
                                                  ip_hdr(skb)->protocol);
                        if (our
#ifdef CONFIG_IP_MROUTE
                                ||
                            (!ipv4_is_local_multicast(daddr) &&
                             IN_DEV_MFORWARD(in_dev))
#endif
                           ) {
                                /* Note: this 'res' shadows the outer one. */
                                int res = ip_route_input_mc(skb, daddr, saddr,
                                                            tos, dev, our);
                                rcu_read_unlock();
                                return res;
                        }
                }
                rcu_read_unlock();
                return -EINVAL;
        }
        /* Everything non-multicast takes the full slow path. */
        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
        rcu_read_unlock();
        return res;
}
EXPORT_SYMBOL(ip_route_input_noref);
1806
/* called with rcu_read_lock() */
/* Build an output rtable for FIB result @res (flow described by @fl4),
 * reusing a dst cached on the nexthop (or on a fib_nh_exception) when
 * possible.  @orig_oif is the oif the caller originally asked for and
 * ends up in rt_iif; @flags carries RTCF_* bits accumulated so far.
 *
 * Returns the route or an ERR_PTR() on failure.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
                                       const struct flowi4 *fl4, int orig_oif,
                                       struct net_device *dev_out,
                                       unsigned int flags)
{
        struct fib_info *fi = res->fi;
        struct fib_nh_exception *fnhe;
        struct in_device *in_dev;
        u16 type = res->type;
        struct rtable *rth;
        bool do_cache;

        in_dev = __in_dev_get_rcu(dev_out);
        if (!in_dev)
                return ERR_PTR(-EINVAL);

        /* A loopback source address may only leave through a loopback
         * device, unless route_localnet is enabled on the device.
         */
        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
                        return ERR_PTR(-EINVAL);

        /* Reclassify the route type from the destination address. */
        if (ipv4_is_lbcast(fl4->daddr))
                type = RTN_BROADCAST;
        else if (ipv4_is_multicast(fl4->daddr))
                type = RTN_MULTICAST;
        else if (ipv4_is_zeronet(fl4->daddr))
                return ERR_PTR(-EINVAL);

        if (dev_out->flags & IFF_LOOPBACK)
                flags |= RTCF_LOCAL;

        do_cache = true;
        if (type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST | RTCF_LOCAL;
                fi = NULL;
        } else if (type == RTN_MULTICAST) {
                flags |= RTCF_MULTICAST | RTCF_LOCAL;
                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
                                     fl4->flowi4_proto))
                        flags &= ~RTCF_LOCAL;
                else
                        do_cache = false;
                /* If multicast route do not exist use
                 * default one, but do not gateway in this case.
                 * Yes, it is hack.
                 */
                if (fi && res->prefixlen < 4)
                        fi = NULL;
        }

        fnhe = NULL;
        do_cache &= fi != NULL;
        if (do_cache) {
                struct rtable __rcu **prth;
                struct fib_nh *nh = &FIB_RES_NH(*res);

                /* An exception entry for this destination carries its own
                 * cached route; otherwise use the per-cpu nexthop output
                 * cache.
                 */
                fnhe = find_exception(nh, fl4->daddr);
                if (fnhe)
                        prth = &fnhe->fnhe_rth;
                else {
                        /* NOTE(review): with FLOWI_FLAG_KNOWN_NH the caller
                         * supplies its own nexthop, so a route cached for
                         * the FIB nexthop must not be reused unless the
                         * route is gatewayed anyway — confirm against the
                         * flowi flag definition.
                         */
                        if (unlikely(fl4->flowi4_flags &
                                     FLOWI_FLAG_KNOWN_NH &&
                                     !(nh->nh_gw &&
                                       nh->nh_scope == RT_SCOPE_LINK))) {
                                do_cache = false;
                                goto add;
                        }
                        prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
                }
                rth = rcu_dereference(*prth);
                if (rt_cache_valid(rth)) {
                        dst_hold(&rth->dst);
                        return rth;
                }
        }

add:
        rth = rt_dst_alloc(dev_out,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(in_dev, NOXFRM),
                           do_cache);
        if (!rth)
                return ERR_PTR(-ENOBUFS);

        rth->dst.output = ip_output;

        rth->rt_genid = rt_genid(dev_net(dev_out));
        rth->rt_flags   = flags;
        rth->rt_type    = type;
        rth->rt_is_input = 0;
        rth->rt_iif     = orig_oif ? : 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);

        RT_CACHE_STAT_INC(out_slow_tot);

        if (flags & RTCF_LOCAL)
                rth->dst.input = ip_local_deliver;
        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                if (flags & RTCF_LOCAL &&
                    !(dev_out->flags & IFF_LOOPBACK)) {
                        /* Both delivered locally and sent on the wire. */
                        rth->dst.output = ip_mc_output;
                        RT_CACHE_STAT_INC(out_slow_mc);
                }
#ifdef CONFIG_IP_MROUTE
                if (type == RTN_MULTICAST) {
                        if (IN_DEV_MFORWARD(in_dev) &&
                            !ipv4_is_local_multicast(fl4->daddr)) {
                                rth->dst.input = ip_mr_input;
                                rth->dst.output = ip_mc_output;
                        }
                }
#endif
        }

        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

        return rth;
}
1928
1929 /*
1930  * Major route resolver routine.
1931  */
1932
1933 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1934 {
1935         struct net_device *dev_out = NULL;
1936         __u8 tos = RT_FL_TOS(fl4);
1937         unsigned int flags = 0;
1938         struct fib_result res;
1939         struct rtable *rth;
1940         int orig_oif;
1941
1942         res.tclassid    = 0;
1943         res.fi          = NULL;
1944         res.table       = NULL;
1945
1946         orig_oif = fl4->flowi4_oif;
1947
1948         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1949         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1950         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1951                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1952
1953         rcu_read_lock();
1954         if (fl4->saddr) {
1955                 rth = ERR_PTR(-EINVAL);
1956                 if (ipv4_is_multicast(fl4->saddr) ||
1957                     ipv4_is_lbcast(fl4->saddr) ||
1958                     ipv4_is_zeronet(fl4->saddr))
1959                         goto out;
1960
1961                 /* I removed check for oif == dev_out->oif here.
1962                    It was wrong for two reasons:
1963                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1964                       is assigned to multiple interfaces.
1965                    2. Moreover, we are allowed to send packets with saddr
1966                       of another iface. --ANK
1967                  */
1968
1969                 if (fl4->flowi4_oif == 0 &&
1970                     (ipv4_is_multicast(fl4->daddr) ||
1971                      ipv4_is_lbcast(fl4->daddr))) {
1972                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1973                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1974                         if (dev_out == NULL)
1975                                 goto out;
1976
1977                         /* Special hack: user can direct multicasts
1978                            and limited broadcast via necessary interface
1979                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1980                            This hack is not just for fun, it allows
1981                            vic,vat and friends to work.
1982                            They bind socket to loopback, set ttl to zero
1983                            and expect that it will work.
1984                            From the viewpoint of routing cache they are broken,
1985                            because we are not allowed to build multicast path
1986                            with loopback source addr (look, routing cache
1987                            cannot know, that ttl is zero, so that packet
1988                            will not leave this host and route is valid).
1989                            Luckily, this hack is good workaround.
1990                          */
1991
1992                         fl4->flowi4_oif = dev_out->ifindex;
1993                         goto make_route;
1994                 }
1995
1996                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1997                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1998                         if (!__ip_dev_find(net, fl4->saddr, false))
1999                                 goto out;
2000                 }
2001         }
2002
2003
2004         if (fl4->flowi4_oif) {
2005                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2006                 rth = ERR_PTR(-ENODEV);
2007                 if (dev_out == NULL)
2008                         goto out;
2009
2010                 /* RACE: Check return value of inet_select_addr instead. */
2011                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2012                         rth = ERR_PTR(-ENETUNREACH);
2013                         goto out;
2014                 }
2015                 if (ipv4_is_local_multicast(fl4->daddr) ||
2016                     ipv4_is_lbcast(fl4->daddr)) {
2017                         if (!fl4->saddr)
2018                                 fl4->saddr = inet_select_addr(dev_out, 0,
2019                                                               RT_SCOPE_LINK);
2020                         goto make_route;
2021                 }
2022                 if (fl4->saddr) {
2023                         if (ipv4_is_multicast(fl4->daddr))
2024                                 fl4->saddr = inet_select_addr(dev_out, 0,
2025                                                               fl4->flowi4_scope);
2026                         else if (!fl4->daddr)
2027                                 fl4->saddr = inet_select_addr(dev_out, 0,
2028                                                               RT_SCOPE_HOST);
2029                 }
2030         }
2031
2032         if (!fl4->daddr) {
2033                 fl4->daddr = fl4->saddr;
2034                 if (!fl4->daddr)
2035                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2036                 dev_out = net->loopback_dev;
2037                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2038                 res.type = RTN_LOCAL;
2039                 flags |= RTCF_LOCAL;
2040                 goto make_route;
2041         }
2042
2043         if (fib_lookup(net, fl4, &res)) {
2044                 res.fi = NULL;
2045                 res.table = NULL;
2046                 if (fl4->flowi4_oif) {
2047                         /* Apparently, routing tables are wrong. Assume,
2048                            that the destination is on link.
2049
2050                            WHY? DW.
2051                            Because we are allowed to send to iface
2052                            even if it has NO routes and NO assigned
2053                            addresses. When oif is specified, routing
2054                            tables are looked up with only one purpose:
2055                            to catch if destination is gatewayed, rather than
2056                            direct. Moreover, if MSG_DONTROUTE is set,
2057                            we send packet, ignoring both routing tables
2058                            and ifaddr state. --ANK
2059
2060
2061                            We could make it even if oif is unknown,
2062                            likely IPv6, but we do not.
2063                          */
2064
2065                         if (fl4->saddr == 0)
2066                                 fl4->saddr = inet_select_addr(dev_out, 0,
2067                                                               RT_SCOPE_LINK);
2068                         res.type = RTN_UNICAST;
2069                         goto make_route;
2070                 }
2071                 rth = ERR_PTR(-ENETUNREACH);
2072                 goto out;
2073         }
2074
2075         if (res.type == RTN_LOCAL) {
2076                 if (!fl4->saddr) {
2077                         if (res.fi->fib_prefsrc)
2078                                 fl4->saddr = res.fi->fib_prefsrc;
2079                         else
2080                                 fl4->saddr = fl4->daddr;
2081                 }
2082                 dev_out = net->loopback_dev;
2083                 fl4->flowi4_oif = dev_out->ifindex;
2084                 flags |= RTCF_LOCAL;
2085                 goto make_route;
2086         }
2087
2088 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2089         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2090                 fib_select_multipath(&res);
2091         else
2092 #endif
2093         if (!res.prefixlen &&
2094             res.table->tb_num_default > 1 &&
2095             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2096                 fib_select_default(&res);
2097
2098         if (!fl4->saddr)
2099                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2100
2101         dev_out = FIB_RES_DEV(res);
2102         fl4->flowi4_oif = dev_out->ifindex;
2103
2104
2105 make_route:
2106         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2107
2108 out:
2109         rcu_read_unlock();
2110         return rth;
2111 }
2112 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2113
/* dst_ops->check for blackhole routes: always return NULL so the entry
 * is never treated as still valid.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
        return NULL;
}
2118
2119 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2120 {
2121         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2122
2123         return mtu ? : dst->dev->mtu;
2124 }
2125
/* PMTU updates are deliberately ignored on blackhole routes. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                          struct sk_buff *skb, u32 mtu)
{
}
2130
/* Redirects are deliberately ignored on blackhole routes. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                       struct sk_buff *skb)
{
}
2135
/* Refuse to copy-on-write metrics for blackhole routes: they are never
 * modified, so no private metrics block is ever needed.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
                                          unsigned long old)
{
        return NULL;
}
2141
/* dst_ops for blackhole routes: all mutators (update_pmtu, redirect,
 * cow_metrics) are no-ops and ->check always fails, so these entries
 * are inert placeholders.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
        .family                 =       AF_INET,
        .protocol               =       cpu_to_be16(ETH_P_IP),
        .check                  =       ipv4_blackhole_dst_check,
        .mtu                    =       ipv4_blackhole_mtu,
        .default_advmss         =       ipv4_default_advmss,
        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
        .redirect               =       ipv4_rt_blackhole_redirect,
        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
        .neigh_lookup           =       ipv4_neigh_lookup,
};
2153
/* Clone @dst_orig into a standalone "blackhole" route: both input and
 * output discard packets, but the routing identity fields of the
 * original are preserved for inspection.  Consumes a reference on
 * @dst_orig.  Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rtable *ort = (struct rtable *) dst_orig;
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
        if (rt) {
                struct dst_entry *new = &rt->dst;

                new->__use = 1;
                new->input = dst_discard;
                new->output = dst_discard;

                new->dev = ort->dst.dev;
                if (new->dev)
                        dev_hold(new->dev);

                /* Copy the routing identity of the original entry. */
                rt->rt_is_input = ort->rt_is_input;
                rt->rt_iif = ort->rt_iif;
                rt->rt_pmtu = ort->rt_pmtu;

                rt->rt_genid = rt_genid(net);
                rt->rt_flags = ort->rt_flags;
                rt->rt_type = ort->rt_type;
                rt->rt_gateway = ort->rt_gateway;
                rt->rt_uses_gateway = ort->rt_uses_gateway;

                INIT_LIST_HEAD(&rt->rt_uncached);

                /* NOTE(review): dst_free() on the entry we are about to
                 * return looks odd but appears intentional (the caller
                 * keeps the reference taken by dst_alloc()); confirm
                 * against the dst core before changing this.
                 */
                dst_free(new);
        }

        dst_release(dst_orig);

        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2190
2191 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2192                                     struct sock *sk)
2193 {
2194         struct rtable *rt = __ip_route_output_key(net, flp4);
2195
2196         if (IS_ERR(rt))
2197                 return rt;
2198
2199         if (flp4->flowi4_proto)
2200                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2201                                                    flowi4_to_flowi(flp4),
2202                                                    sk, 0);
2203
2204         return rt;
2205 }
2206 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2207
/* Serialize route @rt (taken from skb_rtable(@skb)) into a netlink
 * message appended to @skb.  @dst/@src are the addresses to report,
 * @fl4 the flow used for the lookup, @portid/@seq/@event/@flags the
 * usual netlink message parameters; @nowait controls whether
 * ipmr_get_route() may defer.  Returns the result of nlmsg_end() on
 * success or -EMSGSIZE when the skb ran out of room.
 */
static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
                        u32 seq, int event, int nowait, unsigned int flags)
{
        struct rtable *rt = skb_rtable(skb);
        struct rtmsg *r;
        struct nlmsghdr *nlh;
        unsigned long expires = 0;
        u32 error;
        u32 metrics[RTAX_MAX];

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
        if (nlh == NULL)
                return -EMSGSIZE;

        r = nlmsg_data(nlh);
        r->rtm_family    = AF_INET;
        r->rtm_dst_len  = 32;
        r->rtm_src_len  = 0;
        r->rtm_tos      = fl4->flowi4_tos;
        r->rtm_table    = RT_TABLE_MAIN;
        if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
                goto nla_put_failure;
        r->rtm_type     = rt->rt_type;
        r->rtm_scope    = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;

        if (nla_put_be32(skb, RTA_DST, dst))
                goto nla_put_failure;
        if (src) {
                r->rtm_src_len = 32;
                if (nla_put_be32(skb, RTA_SRC, src))
                        goto nla_put_failure;
        }
        if (rt->dst.dev &&
            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
                goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
        if (rt->dst.tclassid &&
            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
                goto nla_put_failure;
#endif
        /* Report the selected source address when it differs from @src
         * (output routes only).
         */
        if (!rt_is_input_route(rt) &&
            fl4->saddr != src) {
                if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
                        goto nla_put_failure;
        }
        if (rt->rt_uses_gateway &&
            nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
                goto nla_put_failure;

        /* Convert the absolute expiry time into remaining jiffies. */
        expires = rt->dst.expires;
        if (expires) {
                unsigned long now = jiffies;

                if (time_before(now, expires))
                        expires -= now;
                else
                        expires = 0;
        }

        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        /* A learned path MTU overrides the metric while it is valid. */
        if (rt->rt_pmtu && expires)
                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;

        if (fl4->flowi4_mark &&
            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
                goto nla_put_failure;

        error = rt->dst.error;

        if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
                        int err = ipmr_get_route(net, skb,
                                                 fl4->saddr, fl4->daddr,
                                                 r, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                } else {
                                        if (err == -EMSGSIZE)
                                                goto nla_put_failure;
                                        error = err;
                                }
                        }
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
                                goto nla_put_failure;
        }

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
                goto nla_put_failure;

        return nlmsg_end(skb, nlh);

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
2317
/* RTM_GETROUTE handler: perform a one-off route lookup on behalf of a
 * netlink request and unicast an RTM_NEWROUTE reply back to the caller.
 * If RTA_IIF is supplied, an *input* route lookup is simulated with a
 * dummy skb; otherwise a normal output lookup is done.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
        struct net *net = sock_net(in_skb->sk);
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be32 dst = 0;
        __be32 src = 0;
        u32 iif;
        int err;
        int mark;
        struct sk_buff *skb;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
        if (err < 0)
                goto errout;

        rtm = nlmsg_data(nlh);

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (skb == NULL) {
                err = -ENOBUFS;
                goto errout;
        }

        /* Reserve room for dummy headers, this skb can pass
           through good chunk of routing engine.
         */
        skb_reset_mac_header(skb);
        skb_reset_network_header(skb);

        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
        ip_hdr(skb)->protocol = IPPROTO_ICMP;
        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = dst;
        fl4.saddr = src;
        fl4.flowi4_tos = rtm->rtm_tos;
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;

        if (iif) {
                struct net_device *dev;

                dev = __dev_get_by_index(net, iif);
                if (dev == NULL) {
                        err = -ENODEV;
                        goto errout_free;
                }

                skb->protocol   = htons(ETH_P_IP);
                skb->dev        = dev;
                skb->mark       = mark;
                /* NOTE(review): BHs are disabled around the input lookup,
                 * presumably to mimic the softirq receive context —
                 * confirm before relying on this.
                 */
                local_bh_disable();
                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
                local_bh_enable();

                rt = skb_rtable(skb);
                /* A successful lookup may still yield an error route. */
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
                rt = ip_route_output_key(net, &fl4);

                err = 0;
                if (IS_ERR(rt))
                        err = PTR_ERR(rt);
        }

        if (err)
                goto errout_free;

        skb_dst_set(skb, &rt->dst);
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        err = rt_fill_info(net, dst, src, &fl4, skb,
                           NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
                           RTM_NEWROUTE, 0, 0);
        if (err <= 0)
                goto errout_free;

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        return err;

errout_free:
        kfree_skb(skb);
        goto errout;
}
2414
/* Nothing to dump: returning skb->len tells the netlink core the dump
 * is complete.
 */
int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
{
        return skb->len;
}
2419
/* A device's multicast state changed: flush cached routes in its netns. */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(dev_net(in_dev->dev));
}
2424
2425 #ifdef CONFIG_SYSCTL
2426 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2427                                         void __user *buffer,
2428                                         size_t *lenp, loff_t *ppos)
2429 {
2430         if (write) {
2431                 rt_cache_flush((struct net *)__ctl->extra1);
2432                 return 0;
2433         }
2434
2435         return -EINVAL;
2436 }
2437
/* Global /proc/sys/net/ipv4/route/ tunables; each entry is backed by a
 * module-global variable (not per-netns).
 */
static ctl_table ipv4_route_table[] = {
        {
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /*  Deprecated. Use gc_min_interval_ms */

                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                /* Shares its backing variable with gc_min_interval
                 * above, just exposed in milliseconds.
                 */
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        { }
};
2548
/* Per-netns "flush" sysctl: writing any value flushes the route cache.
 * .extra1 is filled in with the owning netns at registration time in
 * sysctl_route_net_init(); mode 0200 makes the entry write-only.
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
2558
2559 static __net_init int sysctl_route_net_init(struct net *net)
2560 {
2561         struct ctl_table *tbl;
2562
2563         tbl = ipv4_route_flush_table;
2564         if (!net_eq(net, &init_net)) {
2565                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2566                 if (tbl == NULL)
2567                         goto err_dup;
2568
2569                 /* Don't export sysctls to unprivileged users */
2570                 if (net->user_ns != &init_user_ns)
2571                         tbl[0].procname = NULL;
2572         }
2573         tbl[0].extra1 = net;
2574
2575         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2576         if (net->ipv4.route_hdr == NULL)
2577                 goto err_reg;
2578         return 0;
2579
2580 err_reg:
2581         if (tbl != ipv4_route_flush_table)
2582                 kfree(tbl);
2583 err_dup:
2584         return -ENOMEM;
2585 }
2586
2587 static __net_exit void sysctl_route_net_exit(struct net *net)
2588 {
2589         struct ctl_table *tbl;
2590
2591         tbl = net->ipv4.route_hdr->ctl_table_arg;
2592         unregister_net_sysctl_table(net->ipv4.route_hdr);
2593         BUG_ON(tbl == ipv4_route_flush_table);
2594         kfree(tbl);
2595 }
2596
/* Pernet hooks for registering/unregistering the route sysctl table. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
2601 #endif
2602
2603 static __net_init int rt_genid_init(struct net *net)
2604 {
2605         atomic_set(&net->rt_genid, 0);
2606         get_random_bytes(&net->ipv4.dev_addr_genid,
2607                          sizeof(net->ipv4.dev_addr_genid));
2608         return 0;
2609 }
2610
/* Pernet hook for generation-id setup; no teardown is needed. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
2614
2615 static int __net_init ipv4_inetpeer_init(struct net *net)
2616 {
2617         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2618
2619         if (!bp)
2620                 return -ENOMEM;
2621         inet_peer_base_init(bp);
2622         net->ipv4.peers = bp;
2623         return 0;
2624 }
2625
2626 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2627 {
2628         struct inet_peer_base *bp = net->ipv4.peers;
2629
2630         net->ipv4.peers = NULL;
2631         inetpeer_invalidate_tree(bp);
2632         kfree(bp);
2633 }
2634
/* Pernet hooks for per-netns inet_peer base allocation/teardown. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};
2639
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route-classid accounting table; allocated in ip_rt_init()
 * (256 entries per cpu).
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2643
/* Boot-time initialization of the IPv4 routing subsystem: slab caches,
 * dst entry counters, devinet/FIB init, proc files, rtnetlink handler
 * and the pernet subsystems.  Statement order matters here; later steps
 * depend on earlier ones.  Unrecoverable allocation failures panic.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* Per-cpu classid accounting table: 256 entries per cpu. */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	/* Slab cache for rtable entries; shared with the blackhole ops. */
	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* gc_thresh set to the maximum; route count capped at INT_MAX. */
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	/* proc files are informational; failure is logged, not fatal. */
	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}
2687
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Register the static ipv4_route_table (defined earlier in this file)
 * under net/ipv4/route for the initial netns; return value is ignored
 * because this runs at early boot.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif