pandora: defconfig: update
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Debug verbosity for this file.  Raise RT6_DEBUG to 3 (or more) to turn
 * RDBG() and RT6_TRACE() into real printk() calls; at lower levels both
 * macros compile away.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
75
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77                                     const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int      ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void             ip6_dst_destroy(struct dst_entry *);
83 static void             ip6_dst_ifdown(struct dst_entry *,
84                                        struct net_device *dev, int how);
85 static int               ip6_dst_gc(struct dst_ops *ops);
86
87 static int              ip6_pkt_discard(struct sk_buff *skb);
88 static int              ip6_pkt_discard_out(struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94                                            const struct in6_addr *prefix, int prefixlen,
95                                            const struct in6_addr *gwaddr, int ifindex,
96                                            unsigned pref);
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98                                            const struct in6_addr *prefix, int prefixlen,
99                                            const struct in6_addr *gwaddr, int ifindex);
100 #endif
101
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 {
104         struct rt6_info *rt = (struct rt6_info *) dst;
105         struct inet_peer *peer;
106         u32 *p = NULL;
107
108         if (!(rt->dst.flags & DST_HOST))
109                 return dst_cow_metrics_generic(dst, old);
110
111         if (!rt->rt6i_peer)
112                 rt6_bind_peer(rt, 1);
113
114         peer = rt->rt6i_peer;
115         if (peer) {
116                 u32 *old_p = __DST_METRICS_PTR(old);
117                 unsigned long prev, new;
118
119                 p = peer->metrics;
120                 if (inet_metrics_new(peer))
121                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122
123                 new = (unsigned long) p;
124                 prev = cmpxchg(&dst->_metrics, old, new);
125
126                 if (prev != old) {
127                         p = __DST_METRICS_PTR(prev);
128                         if (prev & DST_METRICS_READ_ONLY)
129                                 p = NULL;
130                 }
131         }
132         return p;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
138 }
139
140 static struct dst_ops ip6_dst_ops_template = {
141         .family                 =       AF_INET6,
142         .protocol               =       cpu_to_be16(ETH_P_IPV6),
143         .gc                     =       ip6_dst_gc,
144         .gc_thresh              =       1024,
145         .check                  =       ip6_dst_check,
146         .default_advmss         =       ip6_default_advmss,
147         .mtu                    =       ip6_mtu,
148         .cow_metrics            =       ipv6_cow_metrics,
149         .destroy                =       ip6_dst_destroy,
150         .ifdown                 =       ip6_dst_ifdown,
151         .negative_advice        =       ip6_negative_advice,
152         .link_failure           =       ip6_link_failure,
153         .update_pmtu            =       ip6_rt_update_pmtu,
154         .local_out              =       __ip6_local_out,
155         .neigh_lookup           =       ip6_neigh_lookup,
156 };
157
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159 {
160         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
161
162         return mtu ? : dst->dev->mtu;
163 }
164
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 {
167 }
168
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
170                                          unsigned long old)
171 {
172         return NULL;
173 }
174
175 static struct dst_ops ip6_dst_blackhole_ops = {
176         .family                 =       AF_INET6,
177         .protocol               =       cpu_to_be16(ETH_P_IPV6),
178         .destroy                =       ip6_dst_destroy,
179         .check                  =       ip6_dst_check,
180         .mtu                    =       ip6_blackhole_mtu,
181         .default_advmss         =       ip6_default_advmss,
182         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
183         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
184         .neigh_lookup           =       ip6_neigh_lookup,
185 };
186
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188         [RTAX_HOPLIMIT - 1] = 0,
189 };
190
191 static struct rt6_info ip6_null_entry_template = {
192         .dst = {
193                 .__refcnt       = ATOMIC_INIT(1),
194                 .__use          = 1,
195                 .obsolete       = -1,
196                 .error          = -ENETUNREACH,
197                 .input          = ip6_pkt_discard,
198                 .output         = ip6_pkt_discard_out,
199         },
200         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
201         .rt6i_protocol  = RTPROT_KERNEL,
202         .rt6i_metric    = ~(u32) 0,
203         .rt6i_ref       = ATOMIC_INIT(1),
204 };
205
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route: a reject entry whose dst reports
 * -EACCES and hands packets to the ip6_pkt_prohibit handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.dst = {
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
		.error		= -EACCES,
		.obsolete	= -1,
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
	},
};

/*
 * Template for the "blackhole" route: a reject entry whose dst reports
 * -EINVAL and passes packets to dst_discard in both directions.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.dst = {
		.input		= dst_discard,
		.output		= dst_discard,
		.error		= -EINVAL,
		.obsolete	= -1,
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
	},
};

#endif
242
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245                                              struct net_device *dev,
246                                              int flags)
247 {
248         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249
250         if (rt != NULL)
251                 memset(&rt->rt6i_table, 0,
252                         sizeof(*rt) - sizeof(struct dst_entry));
253
254         return rt;
255 }
256
257 static void ip6_dst_destroy(struct dst_entry *dst)
258 {
259         struct rt6_info *rt = (struct rt6_info *)dst;
260         struct inet6_dev *idev = rt->rt6i_idev;
261         struct inet_peer *peer = rt->rt6i_peer;
262
263         if (!(rt->dst.flags & DST_HOST))
264                 dst_destroy_metrics_generic(dst);
265
266         if (idev != NULL) {
267                 rt->rt6i_idev = NULL;
268                 in6_dev_put(idev);
269         }
270         if (peer) {
271                 rt->rt6i_peer = NULL;
272                 inet_putpeer(peer);
273         }
274 }
275
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
277
278 static u32 rt6_peer_genid(void)
279 {
280         return atomic_read(&__rt6_peer_genid);
281 }
282
283 void rt6_bind_peer(struct rt6_info *rt, int create)
284 {
285         struct inet_peer *peer;
286
287         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
289                 inet_putpeer(peer);
290         else
291                 rt->rt6i_peer_genid = rt6_peer_genid();
292 }
293
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
295                            int how)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299         struct net_device *loopback_dev =
300                 dev_net(dev)->loopback_dev;
301
302         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303                 struct inet6_dev *loopback_idev =
304                         in6_dev_get(loopback_dev);
305                 if (loopback_idev != NULL) {
306                         rt->rt6i_idev = loopback_idev;
307                         in6_dev_put(idev);
308                 }
309         }
310 }
311
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
313 {
314         return (rt->rt6i_flags & RTF_EXPIRES) &&
315                 time_after(jiffies, rt->rt6i_expires);
316 }
317
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
319 {
320         return ipv6_addr_type(daddr) &
321                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 }
323
324 /*
325  *      Route lookup. Any table->tb6_lock is implied.
326  */
327
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329                                                     struct rt6_info *rt,
330                                                     const struct in6_addr *saddr,
331                                                     int oif,
332                                                     int flags)
333 {
334         struct rt6_info *local = NULL;
335         struct rt6_info *sprt;
336
337         if (!oif && ipv6_addr_any(saddr))
338                 goto out;
339
340         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341                 struct net_device *dev = sprt->rt6i_dev;
342
343                 if (oif) {
344                         if (dev->ifindex == oif)
345                                 return sprt;
346                         if (dev->flags & IFF_LOOPBACK) {
347                                 if (sprt->rt6i_idev == NULL ||
348                                     sprt->rt6i_idev->dev->ifindex != oif) {
349                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
350                                                 continue;
351                                         if (local && (!oif ||
352                                                       local->rt6i_idev->dev->ifindex == oif))
353                                                 continue;
354                                 }
355                                 local = sprt;
356                         }
357                 } else {
358                         if (ipv6_chk_addr(net, saddr, dev,
359                                           flags & RT6_LOOKUP_F_IFACE))
360                                 return sprt;
361                 }
362         }
363
364         if (oif) {
365                 if (local)
366                         return local;
367
368                 if (flags & RT6_LOOKUP_F_IFACE)
369                         return net->ipv6.ip6_null_entry;
370         }
371 out:
372         return rt;
373 }
374
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Send a Neighbour Solicitation towards the neighbour of @rt when that
 * neighbour is not in a NUD_VALID state and the last update is older than
 * the interface's rtr_probe_interval.  @rt may be NULL, in which case
 * nothing happens.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	/*
	 * neigh->updated is modified below, so the write side of the lock
	 * is required.  With only the read side held, several CPUs could
	 * pass the interval check together and each send a probe.
	 */
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		write_unlock_bh(&neigh->lock);

		/* Solicit the neighbour via its solicited-node multicast address. */
		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Router reachability probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
414
415 /*
416  * Default Router Selection (RFC 2461 6.3.6)
417  */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 {
420         struct net_device *dev = rt->rt6i_dev;
421         if (!oif || dev->ifindex == oif)
422                 return 2;
423         if ((dev->flags & IFF_LOOPBACK) &&
424             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425                 return 1;
426         return 0;
427 }
428
429 static inline int rt6_check_neigh(struct rt6_info *rt)
430 {
431         struct neighbour *neigh;
432         int m;
433
434         rcu_read_lock();
435         neigh = dst_get_neighbour(&rt->dst);
436         if (rt->rt6i_flags & RTF_NONEXTHOP ||
437             !(rt->rt6i_flags & RTF_GATEWAY))
438                 m = 1;
439         else if (neigh) {
440                 read_lock_bh(&neigh->lock);
441                 if (neigh->nud_state & NUD_VALID)
442                         m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444                 else if (neigh->nud_state & NUD_FAILED)
445                         m = 0;
446 #endif
447                 else
448                         m = 1;
449                 read_unlock_bh(&neigh->lock);
450         } else
451                 m = 0;
452         rcu_read_unlock();
453         return m;
454 }
455
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457                            int strict)
458 {
459         int m, n;
460
461         m = rt6_check_dev(rt, oif);
462         if (!m && (strict & RT6_LOOKUP_F_IFACE))
463                 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467         n = rt6_check_neigh(rt);
468         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469                 return -1;
470         return m;
471 }
472
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474                                    int *mpri, struct rt6_info *match)
475 {
476         int m;
477
478         if (rt6_check_expired(rt))
479                 goto out;
480
481         m = rt6_score_route(rt, oif, strict);
482         if (m < 0)
483                 goto out;
484
485         if (m > *mpri) {
486                 if (strict & RT6_LOOKUP_F_REACHABLE)
487                         rt6_probe(match);
488                 *mpri = m;
489                 match = rt;
490         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491                 rt6_probe(rt);
492         }
493
494 out:
495         return match;
496 }
497
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499                                      struct rt6_info *rr_head,
500                                      u32 metric, int oif, int strict)
501 {
502         struct rt6_info *rt, *match;
503         int mpri = -1;
504
505         match = NULL;
506         for (rt = rr_head; rt && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510              rt = rt->dst.rt6_next)
511                 match = find_match(rt, oif, strict, &mpri, match);
512
513         return match;
514 }
515
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
517 {
518         struct rt6_info *match, *rt0;
519         struct net *net;
520
521         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522                   __func__, fn->leaf, oif);
523
524         rt0 = fn->rr_ptr;
525         if (!rt0)
526                 fn->rr_ptr = rt0 = fn->leaf;
527
528         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
529
530         if (!match &&
531             (strict & RT6_LOOKUP_F_REACHABLE)) {
532                 struct rt6_info *next = rt0->dst.rt6_next;
533
534                 /* no entries matched; do round-robin */
535                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536                         next = fn->leaf;
537
538                 if (next != rt0)
539                         fn->rr_ptr = next;
540         }
541
542         RT6_TRACE("%s() => %p\n",
543                   __func__, match);
544
545         net = dev_net(rt0->rt6i_dev);
546         return match ? match : net->ipv6.ip6_null_entry;
547 }
548
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev from @gwaddr.
 *
 * @opt points at the option and @len is its length in bytes.  Depending
 * on the advertised lifetime the matching route is deleted, created or
 * has its preference and expiry refreshed.
 *
 * Returns 0 on success, -EINVAL when the option is malformed or carries
 * the invalid preference value.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned int min_length;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;

	if (rinfo->prefix_len > 64)
		min_length = 2;
	else if (rinfo->prefix_len > 0)
		min_length = 1;
	else
		min_length = 0;

	if (rinfo->length < min_length)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* A zero-length prefix refers to the default router entry. */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex);

	/* Lifetime 0 withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev->ifindex, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
625
/*
 * BACKTRACK - climb the FIB tree after a lookup returned the null entry.
 *
 * This macro is not self-contained.  It reads and writes the caller's
 * local variables `rt` and `fn`, jumps to the caller's `out` label when
 * the top-level root is reached, and jumps to the caller's `restart`
 * label when a node carrying route info (RTN_RTINFO) is found.  When the
 * parent has a source-address subtree other than the current node, the
 * walk continues with a lookup of @saddr in that subtree.
 *
 * Both arguments are parenthesized in the expansion so that any
 * expression may be passed safely.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == (__net)->ipv6.ip6_null_entry) {			\
		struct fib6_node *pn;					\
		while (1) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)	\
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else						\
				fn = pn;				\
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
643
644 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
645                                              struct fib6_table *table,
646                                              struct flowi6 *fl6, int flags)
647 {
648         struct fib6_node *fn;
649         struct rt6_info *rt;
650
651         read_lock_bh(&table->tb6_lock);
652         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
653 restart:
654         rt = fn->leaf;
655         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
656         BACKTRACK(net, &fl6->saddr);
657 out:
658         dst_use(&rt->dst, jiffies);
659         read_unlock_bh(&table->tb6_lock);
660         return rt;
661
662 }
663
664 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
665                             const struct in6_addr *saddr, int oif, int strict)
666 {
667         struct flowi6 fl6 = {
668                 .flowi6_oif = oif,
669                 .daddr = *daddr,
670         };
671         struct dst_entry *dst;
672         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
673
674         if (saddr) {
675                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
676                 flags |= RT6_LOOKUP_F_HAS_SADDR;
677         }
678
679         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
680         if (dst->error == 0)
681                 return (struct rt6_info *) dst;
682
683         dst_release(dst);
684
685         return NULL;
686 }
687
688 EXPORT_SYMBOL(rt6_lookup);
689
690 /* ip6_ins_rt is called with FREE table->tb6_lock.
691    It takes new route entry, the addition fails by any reason the
692    route is freed. In any case, if caller does not hold it, it may
693    be destroyed.
694  */
695
696 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
697 {
698         int err;
699         struct fib6_table *table;
700
701         table = rt->rt6i_table;
702         write_lock_bh(&table->tb6_lock);
703         err = fib6_add(&table->tb6_root, rt, info);
704         write_unlock_bh(&table->tb6_lock);
705
706         return err;
707 }
708
709 int ip6_ins_rt(struct rt6_info *rt)
710 {
711         struct nl_info info = {
712                 .nl_net = dev_net(rt->rt6i_dev),
713         };
714         return __ip6_ins_rt(rt, &info);
715 }
716
717 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
718                                       const struct in6_addr *daddr,
719                                       const struct in6_addr *saddr)
720 {
721         struct rt6_info *rt;
722
723         /*
724          *      Clone the route.
725          */
726
727         rt = ip6_rt_copy(ort, daddr);
728
729         if (rt) {
730                 struct neighbour *neigh;
731                 int attempts = !in_softirq();
732
733                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
734                         if (ort->rt6i_dst.plen != 128 &&
735                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
736                                 rt->rt6i_flags |= RTF_ANYCAST;
737                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
738                 }
739
740                 rt->rt6i_flags |= RTF_CACHE;
741
742 #ifdef CONFIG_IPV6_SUBTREES
743                 if (rt->rt6i_src.plen && saddr) {
744                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
745                         rt->rt6i_src.plen = 128;
746                 }
747 #endif
748
749         retry:
750                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
751                 if (IS_ERR(neigh)) {
752                         struct net *net = dev_net(rt->rt6i_dev);
753                         int saved_rt_min_interval =
754                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
755                         int saved_rt_elasticity =
756                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
757
758                         if (attempts-- > 0) {
759                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
760                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
761
762                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
763
764                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
765                                         saved_rt_elasticity;
766                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
767                                         saved_rt_min_interval;
768                                 goto retry;
769                         }
770
771                         if (net_ratelimit())
772                                 printk(KERN_WARNING
773                                        "ipv6: Neighbour table overflow.\n");
774                         dst_free(&rt->dst);
775                         return NULL;
776                 }
777                 dst_set_neighbour(&rt->dst, neigh);
778
779         }
780
781         return rt;
782 }
783
784 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
785                                         const struct in6_addr *daddr)
786 {
787         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
788
789         if (rt) {
790                 rt->rt6i_flags |= RTF_CACHE;
791                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
792         }
793         return rt;
794 }
795
/*
 * ip6_pol_route - look up a route for @fl6 in @table.
 *
 * @oif is the interface index handed to rt6_select().  @input adds
 * RTF_LOCAL to the set of flags (initially RTF_NONEXTHOP) that keep a
 * route off the rt6_alloc_cow() path below.
 *
 * A selected route that is neither the null entry nor already RTF_CACHE
 * is copied (rt6_alloc_cow() or rt6_alloc_clone()) and the copy is
 * inserted with ip6_ins_rt(); DST_HOST routes that do not take the cow
 * path are returned as they are.
 *
 * Every return path holds a reference (dst_hold()) on the returned route
 * and updates its lastuse/__use fields.  The result may be
 * net->ipv6.ip6_null_entry.
 *
 * The labels `restart` and `out` are jump targets of BACKTRACK().
 */
796 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
797                                       struct flowi6 *fl6, int flags, bool input)
798 {
799         struct fib6_node *fn;
800         struct rt6_info *rt, *nrt;
801         int strict = 0;
/* Number of clone-and-insert tries before giving up. */
802         int attempts = 3;
803         int err;
/* Ask for a reachable router on the first pass unless forwarding is enabled. */
804         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
805         int local = RTF_NONEXTHOP;
806
807         strict |= flags & RT6_LOOKUP_F_IFACE;
808         if (input)
809                 local |= RTF_LOCAL;
810
811 relookup:
812         read_lock_bh(&table->tb6_lock);
813
814 restart_2:
815         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
816
817 restart:
818         rt = rt6_select(fn, oif, strict | reachable);
819
820         BACKTRACK(net, &fl6->saddr);
821         if (rt == net->ipv6.ip6_null_entry ||
822             rt->rt6i_flags & RTF_CACHE)
823                 goto out;
824
/* Pin rt before the table lock is dropped for the copy below. */
825         dst_hold(&rt->dst);
826         read_unlock_bh(&table->tb6_lock);
827
/*
 * No neighbour attached and none of the `local` flags set: build a copy
 * with its own neighbour.  Otherwise clone non-DST_HOST routes, and return
 * DST_HOST routes unchanged.
 */
828         if (!dst_get_neighbour_raw(&rt->dst)
829             && !(rt->rt6i_flags & local))
830                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
831         else if (!(rt->dst.flags & DST_HOST))
832                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
833         else
834                 goto out2;
835
/* Swap the reference from the original route to the copy (or the null entry). */
836         dst_release(&rt->dst);
837         rt = nrt ? : net->ipv6.ip6_null_entry;
838
839         dst_hold(&rt->dst);
840         if (nrt) {
841                 err = ip6_ins_rt(nrt);
842                 if (!err)
843                         goto out2;
844         }
845
/* Copy or insertion failed: give up after the last attempt, else retry. */
846         if (--attempts <= 0)
847                 goto out2;
848
849         /*
850          * Race condition! In the gap, when table->tb6_lock was
851          * released someone could insert this route.  Relookup.
852          */
853         dst_release(&rt->dst);
854         goto relookup;
855
856 out:
/*
 * The first pass required RT6_LOOKUP_F_REACHABLE; drop that requirement
 * and redo the lookup once before accepting rt.
 */
857         if (reachable) {
858                 reachable = 0;
859                 goto restart_2;
860         }
861         dst_hold(&rt->dst);
862         read_unlock_bh(&table->tb6_lock);
863 out2:
864         rt->dst.lastuse = jiffies;
865         rt->dst.__use++;
866
867         return rt;
868 }
869
870 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
871                                             struct flowi6 *fl6, int flags)
872 {
873         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags, true);
874 }
875
876 void ip6_route_input(struct sk_buff *skb)
877 {
878         const struct ipv6hdr *iph = ipv6_hdr(skb);
879         struct net *net = dev_net(skb->dev);
880         int flags = RT6_LOOKUP_F_HAS_SADDR;
881         struct flowi6 fl6 = {
882                 .flowi6_iif = skb->dev->ifindex,
883                 .daddr = iph->daddr,
884                 .saddr = iph->saddr,
885                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
886                 .flowi6_mark = skb->mark,
887                 .flowi6_proto = iph->nexthdr,
888         };
889
890         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
891                 flags |= RT6_LOOKUP_F_IFACE;
892
893         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
894 }
895
896 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
897                                              struct flowi6 *fl6, int flags)
898 {
899         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags, false);
900 }
901
902 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
903                                     struct flowi6 *fl6)
904 {
905         int flags = 0;
906
907         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
908                 flags |= RT6_LOOKUP_F_IFACE;
909
910         if (!ipv6_addr_any(&fl6->saddr))
911                 flags |= RT6_LOOKUP_F_HAS_SADDR;
912         else if (sk)
913                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
914
915         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
916 }
917
918 EXPORT_SYMBOL(ip6_route_output);
919
920 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
921 {
922         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
923         struct dst_entry *new = NULL;
924
925         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
926         if (rt) {
927                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
928
929                 new = &rt->dst;
930
931                 new->__use = 1;
932                 new->input = dst_discard;
933                 new->output = dst_discard;
934
935                 if (dst_metrics_read_only(&ort->dst))
936                         new->_metrics = ort->dst._metrics;
937                 else
938                         dst_copy_metrics(new, &ort->dst);
939                 rt->rt6i_idev = ort->rt6i_idev;
940                 if (rt->rt6i_idev)
941                         in6_dev_hold(rt->rt6i_idev);
942                 rt->rt6i_expires = 0;
943
944                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
945                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
946                 rt->rt6i_metric = 0;
947
948                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
949 #ifdef CONFIG_IPV6_SUBTREES
950                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
951 #endif
952
953                 dst_free(new);
954         }
955
956         dst_release(dst_orig);
957         return new ? new : ERR_PTR(-ENOMEM);
958 }
959
960 /*
961  *      Destination cache support functions
962  */
963
964 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
965 {
966         struct rt6_info *rt;
967         u32 rt_cookie = 0;
968
969         rt = (struct rt6_info *) dst;
970
971         if (rt6_get_cookie_safe(rt, &rt_cookie) && rt_cookie == cookie) {
972                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
973                         if (!rt->rt6i_peer)
974                                 rt6_bind_peer(rt, 0);
975                         rt->rt6i_peer_genid = rt6_peer_genid();
976                 }
977                 return dst;
978         }
979         return NULL;
980 }
981
982 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
983 {
984         struct rt6_info *rt = (struct rt6_info *) dst;
985
986         if (rt) {
987                 if (rt->rt6i_flags & RTF_CACHE) {
988                         if (rt6_check_expired(rt)) {
989                                 ip6_del_rt(rt);
990                                 dst = NULL;
991                         }
992                 } else {
993                         dst_release(dst);
994                         dst = NULL;
995                 }
996         }
997         return dst;
998 }
999
1000 static void ip6_link_failure(struct sk_buff *skb)
1001 {
1002         struct rt6_info *rt;
1003
1004         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1005
1006         rt = (struct rt6_info *) skb_dst(skb);
1007         if (rt) {
1008                 if (rt->rt6i_flags&RTF_CACHE) {
1009                         dst_set_expires(&rt->dst, 0);
1010                         rt->rt6i_flags |= RTF_EXPIRES;
1011                 } else {
1012                         struct fib6_node *fn;
1013
1014                         rcu_read_lock();
1015                         fn = rcu_dereference(rt->rt6i_node);
1016                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
1017                                 fn->fn_sernum = -1;
1018                         rcu_read_unlock();
1019                 }
1020         }
1021 }
1022
1023 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1024 {
1025         struct rt6_info *rt6 = (struct rt6_info*)dst;
1026
1027         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1028                 rt6->rt6i_flags |= RTF_MODIFIED;
1029                 if (mtu < IPV6_MIN_MTU)
1030                         mtu = IPV6_MIN_MTU;
1031
1032                 dst_metric_set(dst, RTAX_MTU, mtu);
1033         }
1034 }
1035
1036 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1037 {
1038         struct net_device *dev = dst->dev;
1039         unsigned int mtu = dst_mtu(dst);
1040         struct net *net = dev_net(dev);
1041
1042         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1043
1044         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1045                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1046
1047         /*
1048          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1049          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1050          * IPV6_MAXPLEN is also valid and means: "any MSS,
1051          * rely only on pmtu discovery"
1052          */
1053         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1054                 mtu = IPV6_MAXPLEN;
1055         return mtu;
1056 }
1057
1058 static unsigned int ip6_mtu(const struct dst_entry *dst)
1059 {
1060         struct inet6_dev *idev;
1061         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1062
1063         if (mtu)
1064                 goto out;
1065
1066         mtu = IPV6_MIN_MTU;
1067
1068         rcu_read_lock();
1069         idev = __in6_dev_get(dst->dev);
1070         if (idev)
1071                 mtu = idev->cnf.mtu6;
1072         rcu_read_unlock();
1073
1074 out:
1075         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1076 }
1077
/*
 * Head of the singly linked list (chained through dst->next) of dst
 * entries created by icmp6_dst_alloc().  icmp6_dst_gc() and
 * icmp6_clean_all() unlink and free entries from it.
 */
static struct dst_entry *icmp6_dst_gc_list;
/* Held (BH-disabling variant) around the accesses to icmp6_dst_gc_list below. */
static DEFINE_SPINLOCK(icmp6_dst_lock);
1080
1081 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1082                                   struct neighbour *neigh,
1083                                   const struct in6_addr *addr)
1084 {
1085         struct rt6_info *rt;
1086         struct inet6_dev *idev = in6_dev_get(dev);
1087         struct net *net = dev_net(dev);
1088
1089         if (unlikely(idev == NULL))
1090                 return NULL;
1091
1092         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1093         if (unlikely(rt == NULL)) {
1094                 in6_dev_put(idev);
1095                 goto out;
1096         }
1097
1098         if (neigh)
1099                 neigh_hold(neigh);
1100         else {
1101                 neigh = ndisc_get_neigh(dev, addr);
1102                 if (IS_ERR(neigh))
1103                         neigh = NULL;
1104         }
1105
1106         rt->dst.flags |= DST_HOST;
1107         rt->dst.output  = ip6_output;
1108         dst_set_neighbour(&rt->dst, neigh);
1109         atomic_set(&rt->dst.__refcnt, 1);
1110         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1111         rt->rt6i_dst.plen = 128;
1112         rt->rt6i_idev     = idev;
1113         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1114
1115         spin_lock_bh(&icmp6_dst_lock);
1116         rt->dst.next = icmp6_dst_gc_list;
1117         icmp6_dst_gc_list = &rt->dst;
1118         spin_unlock_bh(&icmp6_dst_lock);
1119
1120         fib6_force_start_gc(net);
1121
1122 out:
1123         return &rt->dst;
1124 }
1125
1126 int icmp6_dst_gc(void)
1127 {
1128         struct dst_entry *dst, **pprev;
1129         int more = 0;
1130
1131         spin_lock_bh(&icmp6_dst_lock);
1132         pprev = &icmp6_dst_gc_list;
1133
1134         while ((dst = *pprev) != NULL) {
1135                 if (!atomic_read(&dst->__refcnt)) {
1136                         *pprev = dst->next;
1137                         dst_free(dst);
1138                 } else {
1139                         pprev = &dst->next;
1140                         ++more;
1141                 }
1142         }
1143
1144         spin_unlock_bh(&icmp6_dst_lock);
1145
1146         return more;
1147 }
1148
1149 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1150                             void *arg)
1151 {
1152         struct dst_entry *dst, **pprev;
1153
1154         spin_lock_bh(&icmp6_dst_lock);
1155         pprev = &icmp6_dst_gc_list;
1156         while ((dst = *pprev) != NULL) {
1157                 struct rt6_info *rt = (struct rt6_info *) dst;
1158                 if (func(rt, arg)) {
1159                         *pprev = dst->next;
1160                         dst_free(dst);
1161                 } else {
1162                         pprev = &dst->next;
1163                 }
1164         }
1165         spin_unlock_bh(&icmp6_dst_lock);
1166 }
1167
1168 static int ip6_dst_gc(struct dst_ops *ops)
1169 {
1170         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1171         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1172         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1173         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1174         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1175         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1176         int entries;
1177
1178         entries = dst_entries_get_fast(ops);
1179         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1180             entries <= rt_max_size)
1181                 goto out;
1182
1183         net->ipv6.ip6_rt_gc_expire++;
1184         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, entries > rt_max_size);
1185         entries = dst_entries_get_slow(ops);
1186         if (entries < ops->gc_thresh)
1187                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1188 out:
1189         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1190         return entries > rt_max_size;
1191 }
1192
1193 /* Clean host part of a prefix. Not necessary in radix tree,
1194    but results in cleaner routing tables.
1195
1196    Remove it only when all the things will work!
1197  */
1198
1199 int ip6_dst_hoplimit(struct dst_entry *dst)
1200 {
1201         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1202         if (hoplimit == 0) {
1203                 struct net_device *dev = dst->dev;
1204                 struct inet6_dev *idev;
1205
1206                 rcu_read_lock();
1207                 idev = __in6_dev_get(dev);
1208                 if (idev)
1209                         hoplimit = idev->cnf.hop_limit;
1210                 else
1211                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1212                 rcu_read_unlock();
1213         }
1214         return hoplimit;
1215 }
1216 EXPORT_SYMBOL(ip6_dst_hoplimit);
1217
1218 /*
1219  *
1220  */
1221
/*
 * Build a route from @cfg and insert it into the FIB table named by
 * cfg->fc_table.
 *
 * @cfg is also written to: a zero fc_metric becomes IP6_RT_PRIO_USER, an
 * RTPROT_UNSPEC fc_protocol becomes RTPROT_BOOT, and fc_nlinfo.nl_net is
 * set to the namespace of the device finally chosen.
 *
 * Returns the result of __ip6_ins_rt() once the route has been built, or
 * a negative errno (-EINVAL, -ENODEV, -ENOBUFS, -ENOMEM, -EHOSTUNREACH,
 * or a neighbour lookup error) on an earlier failure.  On those failures
 * the device/idev references taken here are dropped and the partially
 * built route is freed.
 */
int ip6_route_add(struct fib6_config *cfg)
{
        int err;
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct rt6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;

        /* Prefix lengths cannot exceed the 128-bit address width. */
        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
                return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
        /* Source-specific routes need subtree support. */
        if (cfg->fc_src_len)
                return -EINVAL;
#endif
        /* An explicit interface: take references on the device and its idev. */
        if (cfg->fc_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_metric == 0)
                cfg->fc_metric = IP6_RT_PRIO_USER;

        table = fib6_new_table(net, cfg->fc_table);
        if (table == NULL) {
                err = -ENOBUFS;
                goto out;
        }

        /* DST_NOCOUNT is passed for every route that is not RTF_ADDRCONF. */
        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

        if (rt == NULL) {
                err = -ENOMEM;
                goto out;
        }

        rt->dst.obsolete = -1;
        /* fc_expires is in clock_t units; 0 is stored when RTF_EXPIRES is unset. */
        rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
                                jiffies + clock_t_to_jiffies(cfg->fc_expires) :
                                0;

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->rt6i_protocol = cfg->fc_protocol;

        addr_type = ipv6_addr_type(&cfg->fc_dst);

        /* Pick the input handler from the destination type and flags. */
        if (addr_type & IPV6_ADDR_MULTICAST)
                rt->dst.input = ip6_mc_input;
        else if (cfg->fc_flags & RTF_LOCAL)
                rt->dst.input = ip6_input;
        else
                rt->dst.input = ip6_forward;

        rt->dst.output = ip6_output;

        /* Store only the prefix bits of the destination. */
        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
        if (rt->rt6i_dst.plen == 128)
               rt->dst.flags |= DST_HOST;

        /* Non-host routes with user metrics get their own zeroed metrics array. */
        if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
                u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
                if (!metrics) {
                        err = -ENOMEM;
                        goto out;
                }
                dst_init_metrics(&rt->dst, metrics, 0);
        }
#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->rt6i_src.plen = cfg->fc_src_len;
#endif

        rt->rt6i_metric = cfg->fc_metric;

        /* We cannot add true routes via loopback here,
           they would result in kernel looping; promote them to reject routes
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
                                              && !(cfg->fc_flags&RTF_LOCAL))) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        /* Swap the references over to the loopback device. */
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                rt->dst.output = ip6_pkt_discard_out;
                rt->dst.input = ip6_pkt_discard;
                rt->dst.error = -ENETUNREACH;
                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                /* Reject routes skip gateway, prefsrc and neighbour setup. */
                goto install_route;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                const struct in6_addr *gw_addr;
                int gwa_type;

                gw_addr = &cfg->fc_gateway;
                ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
                gwa_type = ipv6_addr_type(gw_addr);

                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
                        struct rt6_info *grt;

                        /* IPv6 strictly inhibits using not link-local
                           addresses as nexthop address.
                           Otherwise, router will not able to send redirects.
                           It is very good, but in some (rare!) circumstances
                           (SIT, PtP, NBMA NOARP links) it is handy to allow
                           some exceptions. --ANK
                         */
                        err = -EINVAL;
                        if (!(gwa_type&IPV6_ADDR_UNICAST))
                                goto out;

                        /* The gateway itself must be reachable via an existing route. */
                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

                        err = -EHOSTUNREACH;
                        if (grt == NULL)
                                goto out;
                        if (dev) {
                                /* An explicit device must match the gateway route's device. */
                                if (dev != grt->rt6i_dev) {
                                        dst_release(&grt->dst);
                                        goto out;
                                }
                        } else {
                                /* No device given: inherit dev/idev from the gateway route. */
                                dev = grt->rt6i_dev;
                                idev = grt->rt6i_idev;
                                dev_hold(dev);
                                in6_dev_hold(grt->rt6i_idev);
                        }
                        /* Accept only if the gateway is not itself behind a gateway. */
                        if (!(grt->rt6i_flags&RTF_GATEWAY))
                                err = 0;
                        dst_release(&grt->dst);

                        if (err)
                                goto out;
                }
                err = -EINVAL;
                if (dev == NULL || (dev->flags&IFF_LOOPBACK))
                        goto out;
        }

        err = -ENODEV;
        if (dev == NULL)
                goto out;

        /* A preferred source is accepted only if ipv6_chk_addr() passes for dev. */
        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        err = -EINVAL;
                        goto out;
                }
                ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
                rt->rt6i_prefsrc.plen = 128;
        } else
                rt->rt6i_prefsrc.plen = 0;

        /* Gateway and NONEXTHOP routes get a neighbour entry bound up front. */
        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
                struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
                if (IS_ERR(n)) {
                        err = PTR_ERR(n);
                        goto out;
                }
                dst_set_neighbour(&rt->dst, n);
        }

        rt->rt6i_flags = cfg->fc_flags;

install_route:
        /* Apply user-supplied metrics; attribute type 0 is skipped. */
        if (cfg->fc_mx) {
                struct nlattr *nla;
                int remaining;

                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
                        int type = nla_type(nla);

                        if (type) {
                                if (type > RTAX_MAX) {
                                        err = -EINVAL;
                                        goto out;
                                }

                                dst_metric_set(&rt->dst, type, nla_get_u32(nla));
                        }
                }
        }

        /* The dev/idev pointers (and the references held on them) now live in the route. */
        rt->dst.dev = dev;
        rt->rt6i_idev = idev;
        rt->rt6i_table = table;

        cfg->fc_nlinfo.nl_net = dev_net(dev);

        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
        /* Error exit: undo whatever was acquired before the failure. */
        if (dev)
                dev_put(dev);
        if (idev)
                in6_dev_put(idev);
        if (rt)
                dst_free(&rt->dst);
        return err;
}
1442
1443 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1444 {
1445         int err;
1446         struct fib6_table *table;
1447         struct net *net = dev_net(rt->rt6i_dev);
1448
1449         if (rt == net->ipv6.ip6_null_entry) {
1450                 err = -ENOENT;
1451                 goto out;
1452         }
1453
1454         table = rt->rt6i_table;
1455         write_lock_bh(&table->tb6_lock);
1456         err = fib6_del(rt, info);
1457         write_unlock_bh(&table->tb6_lock);
1458
1459 out:
1460         dst_release(&rt->dst);
1461         return err;
1462 }
1463
1464 int ip6_del_rt(struct rt6_info *rt)
1465 {
1466         struct nl_info info = {
1467                 .nl_net = dev_net(rt->rt6i_dev),
1468         };
1469         return __ip6_del_rt(rt, &info);
1470 }
1471
1472 static int ip6_route_del(struct fib6_config *cfg)
1473 {
1474         struct fib6_table *table;
1475         struct fib6_node *fn;
1476         struct rt6_info *rt;
1477         int err = -ESRCH;
1478
1479         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1480         if (table == NULL)
1481                 return err;
1482
1483         read_lock_bh(&table->tb6_lock);
1484
1485         fn = fib6_locate(&table->tb6_root,
1486                          &cfg->fc_dst, cfg->fc_dst_len,
1487                          &cfg->fc_src, cfg->fc_src_len);
1488
1489         if (fn) {
1490                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1491                         if (cfg->fc_ifindex &&
1492                             (rt->rt6i_dev == NULL ||
1493                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1494                                 continue;
1495                         if (cfg->fc_flags & RTF_GATEWAY &&
1496                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1497                                 continue;
1498                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1499                                 continue;
1500                         dst_hold(&rt->dst);
1501                         read_unlock_bh(&table->tb6_lock);
1502
1503                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1504                 }
1505         }
1506         read_unlock_bh(&table->tb6_lock);
1507
1508         return err;
1509 }
1510
1511 /*
1512  *      Handle redirects
1513  */
/*
 * Flow key extended with the address of the router that sent a redirect.
 * __ip6_route_redirect() receives a pointer to the embedded fl6 and casts
 * it back to struct ip6rd_flowi, so fl6 must remain the first member.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;
        struct in6_addr gateway;        /* compared against each route's rt6i_gateway */
};
1518
/*
 * Per-table lookup used when validating a redirect.
 *
 * @fl6 must be embedded in a struct ip6rd_flowi; the gateway stored there
 * is the address the redirect came from.  The routes at the matching node
 * are scanned for an unexpired gateway route on the flow's output
 * interface whose gateway equals that address.
 *
 * Returns the matching route, or the namespace's null entry when none is
 * found, with a reference held in either case.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             int flags)
{
        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
        struct rt6_info *rt;
        struct fib6_node *fn;

        /*
         * Get the "current" route for this destination and
         * check if the redirect has come from appropriate router.
         *
         * RFC 2461 specifies that redirects should only be
         * accepted if they come from the nexthop to the target.
         * Due to the way the routes are chosen, this notion
         * is a bit fuzzy and one might need to check all possible
         * routes.
         */

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        /*
         * NOTE(review): nothing in this function jumps to `restart` or
         * `out` directly; presumably they are the targets of the
         * BACKTRACK macro used below -- confirm against its definition.
         */
restart:
        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
                /*
                 * Current route is on-link; redirect is always invalid.
                 *
                 * Seems, previous statement is not true. It could
                 * be node, which looks for us as on-link (f.e. proxy ndisc)
                 * But then router serving it might decide, that we should
                 * know truth 8)8) --ANK (980726).
                 */
                if (rt6_check_expired(rt))
                        continue;
                if (!(rt->rt6i_flags & RTF_GATEWAY))
                        continue;
                if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
                        continue;
                if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
                        continue;
                break;
        }

        /* Loop ran off the end of the list: no candidate at this node. */
        if (!rt)
                rt = net->ipv6.ip6_null_entry;
        BACKTRACK(net, &fl6->saddr);
out:
        /* Take the reference before the table lock is released. */
        dst_hold(&rt->dst);

        read_unlock_bh(&table->tb6_lock);

        return rt;
};
1572
1573 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1574                                            const struct in6_addr *src,
1575                                            const struct in6_addr *gateway,
1576                                            struct net_device *dev)
1577 {
1578         int flags = RT6_LOOKUP_F_HAS_SADDR;
1579         struct net *net = dev_net(dev);
1580         struct ip6rd_flowi rdfl = {
1581                 .fl6 = {
1582                         .flowi6_oif = dev->ifindex,
1583                         .daddr = *dest,
1584                         .saddr = *src,
1585                 },
1586         };
1587
1588         ipv6_addr_copy(&rdfl.gateway, gateway);
1589
1590         if (rt6_need_strict(dest))
1591                 flags |= RT6_LOOKUP_F_IFACE;
1592
1593         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1594                                                    flags, __ip6_route_redirect);
1595 }
1596
/*
 * Process an accepted-looking redirect.
 *
 * @dest:    destination being redirected
 * @src:     source address used for the route lookup
 * @saddr:   address of the router that sent the redirect
 * @neigh:   neighbour entry for the new next hop
 * @lladdr:  link-layer address to record on @neigh
 * @on_link: non-zero when the target itself is on-link (the new route is
 *           then not a gateway route and the neighbour is not flagged as
 *           a router)
 *
 * If the sender is a valid next hop for @dest, the neighbour is updated
 * and a cached host route to @dest via @neigh is inserted.  A cached
 * route it replaces is deleted.
 */
void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
                  const struct in6_addr *saddr,
                  struct neighbour *neigh, u8 *lladdr, int on_link)
{
        struct rt6_info *rt, *nrt = NULL;
        struct netevent_redirect netevent;
        struct net *net = dev_net(neigh->dev);

        /* Returns a referenced route (possibly the null entry). */
        rt = ip6_route_redirect(dest, src, saddr, neigh->dev);

        if (rt == net->ipv6.ip6_null_entry) {
                if (net_ratelimit())
                        printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
                               "for redirect target\n");
                goto out;
        }

        /*
         *      We have finally decided to accept it.
         */

        neigh_update(neigh, lladdr, NUD_STALE,
                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
                     NEIGH_UPDATE_F_OVERRIDE|
                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
                                     NEIGH_UPDATE_F_ISROUTER))
                     );

        /*
         * Redirect received -> path was valid.
         * Look, redirects are sent only in response to data packets,
         * so that this nexthop apparently is reachable. --ANK
         */
        dst_confirm(&rt->dst);

        /* Duplicate redirect: silently ignore. */
        if (neigh == dst_get_neighbour_raw(&rt->dst))
                goto out;

        nrt = ip6_rt_copy(rt, dest);
        if (nrt == NULL)
                goto out;

        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
        if (on_link)
                nrt->rt6i_flags &= ~RTF_GATEWAY;

        /* The new next hop is the neighbour's own address. */
        ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
        dst_set_neighbour(&nrt->dst, neigh_clone(neigh));

        if (ip6_ins_rt(nrt))
                goto out;

        netevent.old = &rt->dst;
        netevent.new = &nrt->dst;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

        if (rt->rt6i_flags&RTF_CACHE) {
                /*
                 * ip6_del_rt() ends in __ip6_del_rt(), which releases the
                 * reference on rt, so return without the dst_release below.
                 */
                ip6_del_rt(rt);
                return;
        }

out:
        dst_release(&rt->dst);
}
1662
1663 /*
1664  *      Handle ICMP "packet too big" messages
1665  *      i.e. Path MTU discovery
1666  */
1667
1668 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1669                              struct net *net, u32 pmtu, int ifindex)
1670 {
1671         struct rt6_info *rt, *nrt;
1672         int allfrag = 0;
1673 again:
1674         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1675         if (rt == NULL)
1676                 return;
1677
1678         if (rt6_check_expired(rt)) {
1679                 ip6_del_rt(rt);
1680                 goto again;
1681         }
1682
1683         if (pmtu >= dst_mtu(&rt->dst))
1684                 goto out;
1685
1686         if (pmtu < IPV6_MIN_MTU) {
1687                 /*
1688                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1689                  * MTU (1280) and a fragment header should always be included
1690                  * after a node receiving Too Big message reporting PMTU is
1691                  * less than the IPv6 Minimum Link MTU.
1692                  */
1693                 pmtu = IPV6_MIN_MTU;
1694                 allfrag = 1;
1695         }
1696
1697         /* New mtu received -> path was valid.
1698            They are sent only in response to data packets,
1699            so that this nexthop apparently is reachable. --ANK
1700          */
1701         dst_confirm(&rt->dst);
1702
1703         /* Host route. If it is static, it would be better
1704            not to override it, but add new one, so that
1705            when cache entry will expire old pmtu
1706            would return automatically.
1707          */
1708         if (rt->rt6i_flags & RTF_CACHE) {
1709                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1710                 if (allfrag) {
1711                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1712                         features |= RTAX_FEATURE_ALLFRAG;
1713                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1714                 }
1715                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1716                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1717                 goto out;
1718         }
1719
1720         /* Network route.
1721            Two cases are possible:
1722            1. It is connected route. Action: COW
1723            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1724          */
1725         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1726                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1727         else
1728                 nrt = rt6_alloc_clone(rt, daddr);
1729
1730         if (nrt) {
1731                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1732                 if (allfrag) {
1733                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1734                         features |= RTAX_FEATURE_ALLFRAG;
1735                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1736                 }
1737
1738                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1739                  * happened within 5 mins, the recommended timer is 10 mins.
1740                  * Here this route expiration time is set to ip6_rt_mtu_expires
1741                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1742                  * and detecting PMTU increase will be automatically happened.
1743                  */
1744                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1745                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1746
1747                 ip6_ins_rt(nrt);
1748         }
1749 out:
1750         dst_release(&rt->dst);
1751 }
1752
1753 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1754                         struct net_device *dev, u32 pmtu)
1755 {
1756         struct net *net = dev_net(dev);
1757
1758         /*
1759          * RFC 1981 states that a node "MUST reduce the size of the packets it
1760          * is sending along the path" that caused the Packet Too Big message.
1761          * Since it's not possible in the general case to determine which
1762          * interface was used to send the original packet, we update the MTU
1763          * on the interface that will be used to send future packets. We also
1764          * update the MTU on the interface that received the Packet Too Big in
1765          * case the original packet was forced out that interface with
1766          * SO_BINDTODEVICE or similar. This is the next best thing to the
1767          * correct behaviour, which would be to update the MTU on all
1768          * interfaces.
1769          */
1770         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1771         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1772 }
1773
1774 /*
1775  *      Misc support functions
1776  */
1777
1778 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1779                                     const struct in6_addr *dest)
1780 {
1781         struct net *net = dev_net(ort->rt6i_dev);
1782         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1783                                             ort->dst.dev, 0);
1784
1785         if (rt) {
1786                 rt->dst.input = ort->dst.input;
1787                 rt->dst.output = ort->dst.output;
1788                 rt->dst.flags |= DST_HOST;
1789
1790                 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1791                 rt->rt6i_dst.plen = 128;
1792                 dst_copy_metrics(&rt->dst, &ort->dst);
1793                 rt->dst.error = ort->dst.error;
1794                 rt->rt6i_idev = ort->rt6i_idev;
1795                 if (rt->rt6i_idev)
1796                         in6_dev_hold(rt->rt6i_idev);
1797                 rt->dst.lastuse = jiffies;
1798                 rt->rt6i_expires = 0;
1799
1800                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1801                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1802                 rt->rt6i_metric = 0;
1803
1804 #ifdef CONFIG_IPV6_SUBTREES
1805                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1806 #endif
1807                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1808                 rt->rt6i_table = ort->rt6i_table;
1809         }
1810         return rt;
1811 }
1812
1813 #ifdef CONFIG_IPV6_ROUTE_INFO
1814 static struct rt6_info *rt6_get_route_info(struct net *net,
1815                                            const struct in6_addr *prefix, int prefixlen,
1816                                            const struct in6_addr *gwaddr, int ifindex)
1817 {
1818         struct fib6_node *fn;
1819         struct rt6_info *rt = NULL;
1820         struct fib6_table *table;
1821
1822         table = fib6_get_table(net, RT6_TABLE_INFO);
1823         if (table == NULL)
1824                 return NULL;
1825
1826         write_lock_bh(&table->tb6_lock);
1827         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1828         if (!fn)
1829                 goto out;
1830
1831         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1832                 if (rt->rt6i_dev->ifindex != ifindex)
1833                         continue;
1834                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1835                         continue;
1836                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1837                         continue;
1838                 dst_hold(&rt->dst);
1839                 break;
1840         }
1841 out:
1842         write_unlock_bh(&table->tb6_lock);
1843         return rt;
1844 }
1845
1846 static struct rt6_info *rt6_add_route_info(struct net *net,
1847                                            const struct in6_addr *prefix, int prefixlen,
1848                                            const struct in6_addr *gwaddr, int ifindex,
1849                                            unsigned pref)
1850 {
1851         struct fib6_config cfg = {
1852                 .fc_table       = RT6_TABLE_INFO,
1853                 .fc_metric      = IP6_RT_PRIO_USER,
1854                 .fc_ifindex     = ifindex,
1855                 .fc_dst_len     = prefixlen,
1856                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1857                                   RTF_UP | RTF_PREF(pref),
1858                 .fc_nlinfo.pid = 0,
1859                 .fc_nlinfo.nlh = NULL,
1860                 .fc_nlinfo.nl_net = net,
1861         };
1862
1863         ipv6_addr_copy(&cfg.fc_dst, prefix);
1864         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1865
1866         /* We should treat it as a default route if prefix length is 0. */
1867         if (!prefixlen)
1868                 cfg.fc_flags |= RTF_DEFAULT;
1869
1870         ip6_route_add(&cfg);
1871
1872         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1873 }
1874 #endif
1875
1876 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1877 {
1878         struct rt6_info *rt;
1879         struct fib6_table *table;
1880
1881         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1882         if (table == NULL)
1883                 return NULL;
1884
1885         write_lock_bh(&table->tb6_lock);
1886         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1887                 if (dev == rt->rt6i_dev &&
1888                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1889                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1890                         break;
1891         }
1892         if (rt)
1893                 dst_hold(&rt->dst);
1894         write_unlock_bh(&table->tb6_lock);
1895         return rt;
1896 }
1897
1898 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1899                                      struct net_device *dev,
1900                                      unsigned int pref)
1901 {
1902         struct fib6_config cfg = {
1903                 .fc_table       = RT6_TABLE_DFLT,
1904                 .fc_metric      = IP6_RT_PRIO_USER,
1905                 .fc_ifindex     = dev->ifindex,
1906                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1907                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1908                 .fc_nlinfo.pid = 0,
1909                 .fc_nlinfo.nlh = NULL,
1910                 .fc_nlinfo.nl_net = dev_net(dev),
1911         };
1912
1913         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1914
1915         ip6_route_add(&cfg);
1916
1917         return rt6_get_dflt_router(gwaddr, dev);
1918 }
1919
1920 void rt6_purge_dflt_routers(struct net *net)
1921 {
1922         struct rt6_info *rt;
1923         struct fib6_table *table;
1924
1925         /* NOTE: Keep consistent with rt6_get_dflt_router */
1926         table = fib6_get_table(net, RT6_TABLE_DFLT);
1927         if (table == NULL)
1928                 return;
1929
1930 restart:
1931         read_lock_bh(&table->tb6_lock);
1932         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1933                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1934                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1935                         dst_hold(&rt->dst);
1936                         read_unlock_bh(&table->tb6_lock);
1937                         ip6_del_rt(rt);
1938                         goto restart;
1939                 }
1940         }
1941         read_unlock_bh(&table->tb6_lock);
1942 }
1943
1944 static void rtmsg_to_fib6_config(struct net *net,
1945                                  struct in6_rtmsg *rtmsg,
1946                                  struct fib6_config *cfg)
1947 {
1948         memset(cfg, 0, sizeof(*cfg));
1949
1950         cfg->fc_table = RT6_TABLE_MAIN;
1951         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1952         cfg->fc_metric = rtmsg->rtmsg_metric;
1953         cfg->fc_expires = rtmsg->rtmsg_info;
1954         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1955         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1956         cfg->fc_flags = rtmsg->rtmsg_flags;
1957
1958         cfg->fc_nlinfo.nl_net = net;
1959
1960         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1961         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1962         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1963 }
1964
1965 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1966 {
1967         struct fib6_config cfg;
1968         struct in6_rtmsg rtmsg;
1969         int err;
1970
1971         switch(cmd) {
1972         case SIOCADDRT:         /* Add a route */
1973         case SIOCDELRT:         /* Delete a route */
1974                 if (!capable(CAP_NET_ADMIN))
1975                         return -EPERM;
1976                 err = copy_from_user(&rtmsg, arg,
1977                                      sizeof(struct in6_rtmsg));
1978                 if (err)
1979                         return -EFAULT;
1980
1981                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1982
1983                 rtnl_lock();
1984                 switch (cmd) {
1985                 case SIOCADDRT:
1986                         err = ip6_route_add(&cfg);
1987                         break;
1988                 case SIOCDELRT:
1989                         err = ip6_route_del(&cfg);
1990                         break;
1991                 default:
1992                         err = -EINVAL;
1993                 }
1994                 rtnl_unlock();
1995
1996                 return err;
1997         }
1998
1999         return -EINVAL;
2000 }
2001
2002 /*
2003  *      Drop the packet on the floor
2004  */
2005
2006 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2007 {
2008         int type;
2009         struct dst_entry *dst = skb_dst(skb);
2010         switch (ipstats_mib_noroutes) {
2011         case IPSTATS_MIB_INNOROUTES:
2012                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2013                 if (type == IPV6_ADDR_ANY) {
2014                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2015                                       IPSTATS_MIB_INADDRERRORS);
2016                         break;
2017                 }
2018                 /* FALLTHROUGH */
2019         case IPSTATS_MIB_OUTNOROUTES:
2020                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2021                               ipstats_mib_noroutes);
2022                 break;
2023         }
2024         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2025         kfree_skb(skb);
2026         return 0;
2027 }
2028
2029 static int ip6_pkt_discard(struct sk_buff *skb)
2030 {
2031         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2032 }
2033
2034 static int ip6_pkt_discard_out(struct sk_buff *skb)
2035 {
2036         skb->dev = skb_dst(skb)->dev;
2037         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2038 }
2039
2040 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2041
2042 static int ip6_pkt_prohibit(struct sk_buff *skb)
2043 {
2044         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2045 }
2046
2047 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2048 {
2049         skb->dev = skb_dst(skb)->dev;
2050         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2051 }
2052
2053 #endif
2054
2055 /*
2056  *      Allocate a dst for local (unicast / anycast) address.
2057  */
2058
2059 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2060                                     const struct in6_addr *addr,
2061                                     int anycast)
2062 {
2063         struct net *net = dev_net(idev->dev);
2064         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2065                                             net->loopback_dev, DST_NOCOUNT);
2066         struct neighbour *neigh;
2067
2068         if (rt == NULL)
2069                 return ERR_PTR(-ENOMEM);
2070
2071         in6_dev_hold(idev);
2072
2073         rt->dst.flags |= DST_HOST;
2074         rt->dst.input = ip6_input;
2075         rt->dst.output = ip6_output;
2076         rt->rt6i_idev = idev;
2077         rt->dst.obsolete = -1;
2078
2079         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2080         if (anycast)
2081                 rt->rt6i_flags |= RTF_ANYCAST;
2082         else
2083                 rt->rt6i_flags |= RTF_LOCAL;
2084         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2085         if (IS_ERR(neigh)) {
2086                 dst_free(&rt->dst);
2087
2088                 return ERR_CAST(neigh);
2089         }
2090         dst_set_neighbour(&rt->dst, neigh);
2091
2092         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2093         rt->rt6i_dst.plen = 128;
2094         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2095
2096         atomic_set(&rt->dst.__refcnt, 1);
2097
2098         return rt;
2099 }
2100
2101 int ip6_route_get_saddr(struct net *net,
2102                         struct rt6_info *rt,
2103                         const struct in6_addr *daddr,
2104                         unsigned int prefs,
2105                         struct in6_addr *saddr)
2106 {
2107         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2108         int err = 0;
2109         if (rt->rt6i_prefsrc.plen)
2110                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2111         else
2112                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2113                                          daddr, prefs, saddr);
2114         return err;
2115 }
2116
2117 /* remove deleted ip from prefsrc entries */
/*
 * Argument bundle handed to fib6_remove_prefsrc() through the void *
 * parameter of fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace, used to skip its ip6_null_entry */
	struct in6_addr *addr;	/* address compared against each route's prefsrc */
};
2123
2124 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2125 {
2126         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2127         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2128         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2129
2130         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2131             rt != net->ipv6.ip6_null_entry &&
2132             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2133                 /* remove prefsrc entry */
2134                 rt->rt6i_prefsrc.plen = 0;
2135         }
2136         return 0;
2137 }
2138
2139 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2140 {
2141         struct net *net = dev_net(ifp->idev->dev);
2142         struct arg_dev_net_ip adni = {
2143                 .dev = ifp->idev->dev,
2144                 .net = net,
2145                 .addr = &ifp->addr,
2146         };
2147         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2148 }
2149
/* Argument bundle handed to fib6_ifdown() as its void * parameter. */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches every route */
	struct net *net;	/* namespace, used to skip its ip6_null_entry */
};
2154
2155 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2156 {
2157         const struct arg_dev_net *adn = arg;
2158         const struct net_device *dev = adn->dev;
2159
2160         if ((rt->rt6i_dev == dev || dev == NULL) &&
2161             rt != adn->net->ipv6.ip6_null_entry) {
2162                 RT6_TRACE("deleted by ifdown %p\n", rt);
2163                 return -1;
2164         }
2165         return 0;
2166 }
2167
2168 void rt6_ifdown(struct net *net, struct net_device *dev)
2169 {
2170         struct arg_dev_net adn = {
2171                 .dev = dev,
2172                 .net = net,
2173         };
2174
2175         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2176         icmp6_clean_all(fib6_ifdown, &adn);
2177 }
2178
/* Argument bundle handed to rt6_mtu_change_route() as its void * parameter. */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU of that device */
};
2184
2185 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2186 {
2187         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2188         struct inet6_dev *idev;
2189
2190         /* In IPv6 pmtu discovery is not optional,
2191            so that RTAX_MTU lock cannot disable it.
2192            We still use this lock to block changes
2193            caused by addrconf/ndisc.
2194         */
2195
2196         idev = __in6_dev_get(arg->dev);
2197         if (idev == NULL)
2198                 return 0;
2199
2200         /* For administrative MTU increase, there is no way to discover
2201            IPv6 PMTU increase, so PMTU increase should be updated here.
2202            Since RFC 1981 doesn't include administrative MTU increase
2203            update PMTU increase is a MUST. (i.e. jumbo frame)
2204          */
2205         /*
2206            If new MTU is less than route PMTU, this new MTU will be the
2207            lowest MTU in the path, update the route PMTU to reflect PMTU
2208            decreases; if new MTU is greater than route PMTU, and the
2209            old MTU is the lowest MTU in the path, update the route PMTU
2210            to reflect the increase. In this case if the other nodes' MTU
2211            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2212            PMTU discouvery.
2213          */
2214         if (rt->rt6i_dev == arg->dev &&
2215             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2216             (dst_mtu(&rt->dst) >= arg->mtu ||
2217              (dst_mtu(&rt->dst) < arg->mtu &&
2218               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2219                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2220         }
2221         return 0;
2222 }
2223
2224 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2225 {
2226         struct rt6_mtu_change_arg arg = {
2227                 .dev = dev,
2228                 .mtu = mtu,
2229         };
2230
2231         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2232 }
2233
2234 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2235         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2236         [RTA_OIF]               = { .type = NLA_U32 },
2237         [RTA_IIF]               = { .type = NLA_U32 },
2238         [RTA_PRIORITY]          = { .type = NLA_U32 },
2239         [RTA_METRICS]           = { .type = NLA_NESTED },
2240 };
2241
2242 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2243                               struct fib6_config *cfg)
2244 {
2245         struct rtmsg *rtm;
2246         struct nlattr *tb[RTA_MAX+1];
2247         int err;
2248
2249         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2250         if (err < 0)
2251                 goto errout;
2252
2253         err = -EINVAL;
2254         rtm = nlmsg_data(nlh);
2255         memset(cfg, 0, sizeof(*cfg));
2256
2257         cfg->fc_table = rtm->rtm_table;
2258         cfg->fc_dst_len = rtm->rtm_dst_len;
2259         cfg->fc_src_len = rtm->rtm_src_len;
2260         cfg->fc_flags = RTF_UP;
2261         cfg->fc_protocol = rtm->rtm_protocol;
2262
2263         if (rtm->rtm_type == RTN_UNREACHABLE)
2264                 cfg->fc_flags |= RTF_REJECT;
2265
2266         if (rtm->rtm_type == RTN_LOCAL)
2267                 cfg->fc_flags |= RTF_LOCAL;
2268
2269         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2270         cfg->fc_nlinfo.nlh = nlh;
2271         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2272
2273         if (tb[RTA_GATEWAY]) {
2274                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2275                 cfg->fc_flags |= RTF_GATEWAY;
2276         }
2277
2278         if (tb[RTA_DST]) {
2279                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2280
2281                 if (nla_len(tb[RTA_DST]) < plen)
2282                         goto errout;
2283
2284                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2285         }
2286
2287         if (tb[RTA_SRC]) {
2288                 int plen = (rtm->rtm_src_len + 7) >> 3;
2289
2290                 if (nla_len(tb[RTA_SRC]) < plen)
2291                         goto errout;
2292
2293                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2294         }
2295
2296         if (tb[RTA_PREFSRC])
2297                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2298
2299         if (tb[RTA_OIF])
2300                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2301
2302         if (tb[RTA_PRIORITY])
2303                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2304
2305         if (tb[RTA_METRICS]) {
2306                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2307                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2308         }
2309
2310         if (tb[RTA_TABLE])
2311                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2312
2313         err = 0;
2314 errout:
2315         return err;
2316 }
2317
2318 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2319 {
2320         struct fib6_config cfg;
2321         int err;
2322
2323         err = rtm_to_fib6_config(skb, nlh, &cfg);
2324         if (err < 0)
2325                 return err;
2326
2327         return ip6_route_del(&cfg);
2328 }
2329
2330 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2331 {
2332         struct fib6_config cfg;
2333         int err;
2334
2335         err = rtm_to_fib6_config(skb, nlh, &cfg);
2336         if (err < 0)
2337                 return err;
2338
2339         return ip6_route_add(&cfg);
2340 }
2341
2342 static inline size_t rt6_nlmsg_size(void)
2343 {
2344         return NLMSG_ALIGN(sizeof(struct rtmsg))
2345                + nla_total_size(16) /* RTA_SRC */
2346                + nla_total_size(16) /* RTA_DST */
2347                + nla_total_size(16) /* RTA_GATEWAY */
2348                + nla_total_size(16) /* RTA_PREFSRC */
2349                + nla_total_size(4) /* RTA_TABLE */
2350                + nla_total_size(4) /* RTA_IIF */
2351                + nla_total_size(4) /* RTA_OIF */
2352                + nla_total_size(4) /* RTA_PRIORITY */
2353                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2354                + nla_total_size(sizeof(struct rta_cacheinfo));
2355 }
2356
2357 static int rt6_fill_node(struct net *net,
2358                          struct sk_buff *skb, struct rt6_info *rt,
2359                          struct in6_addr *dst, struct in6_addr *src,
2360                          int iif, int type, u32 pid, u32 seq,
2361                          int prefix, int nowait, unsigned int flags)
2362 {
2363         struct rtmsg *rtm;
2364         struct nlmsghdr *nlh;
2365         long expires;
2366         u32 table;
2367         struct neighbour *n;
2368
2369         if (prefix) {   /* user wants prefix routes only */
2370                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2371                         /* success since this is not a prefix route */
2372                         return 1;
2373                 }
2374         }
2375
2376         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2377         if (nlh == NULL)
2378                 return -EMSGSIZE;
2379
2380         rtm = nlmsg_data(nlh);
2381         rtm->rtm_family = AF_INET6;
2382         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2383         rtm->rtm_src_len = rt->rt6i_src.plen;
2384         rtm->rtm_tos = 0;
2385         if (rt->rt6i_table)
2386                 table = rt->rt6i_table->tb6_id;
2387         else
2388                 table = RT6_TABLE_UNSPEC;
2389         rtm->rtm_table = table;
2390         NLA_PUT_U32(skb, RTA_TABLE, table);
2391         if (rt->rt6i_flags&RTF_REJECT)
2392                 rtm->rtm_type = RTN_UNREACHABLE;
2393         else if (rt->rt6i_flags&RTF_LOCAL)
2394                 rtm->rtm_type = RTN_LOCAL;
2395         else if (rt->rt6i_flags & RTF_ANYCAST)
2396                 rtm->rtm_type = RTN_ANYCAST;
2397         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2398                 rtm->rtm_type = RTN_LOCAL;
2399         else
2400                 rtm->rtm_type = RTN_UNICAST;
2401         rtm->rtm_flags = 0;
2402         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2403         rtm->rtm_protocol = rt->rt6i_protocol;
2404         if (rt->rt6i_flags&RTF_DYNAMIC)
2405                 rtm->rtm_protocol = RTPROT_REDIRECT;
2406         else if (rt->rt6i_flags & RTF_ADDRCONF)
2407                 rtm->rtm_protocol = RTPROT_KERNEL;
2408         else if (rt->rt6i_flags&RTF_DEFAULT)
2409                 rtm->rtm_protocol = RTPROT_RA;
2410
2411         if (rt->rt6i_flags&RTF_CACHE)
2412                 rtm->rtm_flags |= RTM_F_CLONED;
2413
2414         if (dst) {
2415                 NLA_PUT(skb, RTA_DST, 16, dst);
2416                 rtm->rtm_dst_len = 128;
2417         } else if (rtm->rtm_dst_len)
2418                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2419 #ifdef CONFIG_IPV6_SUBTREES
2420         if (src) {
2421                 NLA_PUT(skb, RTA_SRC, 16, src);
2422                 rtm->rtm_src_len = 128;
2423         } else if (rtm->rtm_src_len)
2424                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2425 #endif
2426         if (iif) {
2427 #ifdef CONFIG_IPV6_MROUTE
2428                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2429                         int err = ip6mr_get_route(net, skb, rtm, nowait,
2430                                                   pid);
2431
2432                         if (err <= 0) {
2433                                 if (!nowait) {
2434                                         if (err == 0)
2435                                                 return 0;
2436                                         goto nla_put_failure;
2437                                 } else {
2438                                         if (err == -EMSGSIZE)
2439                                                 goto nla_put_failure;
2440                                 }
2441                         }
2442                 } else
2443 #endif
2444                         NLA_PUT_U32(skb, RTA_IIF, iif);
2445         } else if (dst) {
2446                 struct in6_addr saddr_buf;
2447                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2448                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2449         }
2450
2451         if (rt->rt6i_prefsrc.plen) {
2452                 struct in6_addr saddr_buf;
2453                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2454                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2455         }
2456
2457         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2458                 goto nla_put_failure;
2459
2460         rcu_read_lock();
2461         n = dst_get_neighbour(&rt->dst);
2462         if (n) {
2463                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2464                         rcu_read_unlock();
2465                         goto nla_put_failure;
2466                 }
2467         }
2468         rcu_read_unlock();
2469
2470         if (rt->dst.dev)
2471                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2472
2473         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2474
2475         if (!(rt->rt6i_flags & RTF_EXPIRES))
2476                 expires = 0;
2477         else if (rt->rt6i_expires - jiffies < INT_MAX)
2478                 expires = rt->rt6i_expires - jiffies;
2479         else
2480                 expires = INT_MAX;
2481
2482         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2483                                expires, rt->dst.error) < 0)
2484                 goto nla_put_failure;
2485
2486         return nlmsg_end(skb, nlh);
2487
2488 nla_put_failure:
2489         nlmsg_cancel(skb, nlh);
2490         return -EMSGSIZE;
2491 }
2492
2493 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2494 {
2495         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2496         int prefix;
2497
2498         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2499                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2500                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2501         } else
2502                 prefix = 0;
2503
2504         return rt6_fill_node(arg->net,
2505                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2506                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2507                      prefix, 0, NLM_F_MULTI);
2508 }
2509
2510 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2511 {
2512         struct net *net = sock_net(in_skb->sk);
2513         struct nlattr *tb[RTA_MAX+1];
2514         struct rt6_info *rt;
2515         struct sk_buff *skb;
2516         struct rtmsg *rtm;
2517         struct flowi6 fl6;
2518         int err, iif = 0;
2519
2520         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2521         if (err < 0)
2522                 goto errout;
2523
2524         err = -EINVAL;
2525         memset(&fl6, 0, sizeof(fl6));
2526
2527         if (tb[RTA_SRC]) {
2528                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2529                         goto errout;
2530
2531                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2532         }
2533
2534         if (tb[RTA_DST]) {
2535                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2536                         goto errout;
2537
2538                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2539         }
2540
2541         if (tb[RTA_IIF])
2542                 iif = nla_get_u32(tb[RTA_IIF]);
2543
2544         if (tb[RTA_OIF])
2545                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2546
2547         if (iif) {
2548                 struct net_device *dev;
2549                 dev = __dev_get_by_index(net, iif);
2550                 if (!dev) {
2551                         err = -ENODEV;
2552                         goto errout;
2553                 }
2554         }
2555
2556         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2557         if (skb == NULL) {
2558                 err = -ENOBUFS;
2559                 goto errout;
2560         }
2561
2562         /* Reserve room for dummy headers, this skb can pass
2563            through good chunk of routing engine.
2564          */
2565         skb_reset_mac_header(skb);
2566         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2567
2568         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2569         skb_dst_set(skb, &rt->dst);
2570
2571         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2572                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2573                             nlh->nlmsg_seq, 0, 0, 0);
2574         if (err < 0) {
2575                 kfree_skb(skb);
2576                 goto errout;
2577         }
2578
2579         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2580 errout:
2581         return err;
2582 }
2583
2584 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2585 {
2586         struct sk_buff *skb;
2587         struct net *net = info->nl_net;
2588         u32 seq;
2589         int err;
2590
2591         err = -ENOBUFS;
2592         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2593
2594         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2595         if (skb == NULL)
2596                 goto errout;
2597
2598         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2599                                 event, info->pid, seq, 0, 0, 0);
2600         if (err < 0) {
2601                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2602                 WARN_ON(err == -EMSGSIZE);
2603                 kfree_skb(skb);
2604                 goto errout;
2605         }
2606         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2607                     info->nlh, gfp_any());
2608         return;
2609 errout:
2610         if (err < 0)
2611                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2612 }
2613
2614 static int ip6_route_dev_notify(struct notifier_block *this,
2615                                 unsigned long event, void *data)
2616 {
2617         struct net_device *dev = (struct net_device *)data;
2618         struct net *net = dev_net(dev);
2619
2620         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2621                 net->ipv6.ip6_null_entry->dst.dev = dev;
2622                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2623 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2624                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2625                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2626                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2627                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2628 #endif
2629         }
2630
2631         return NOTIFY_OK;
2632 }
2633
2634 /*
2635  *      /proc
2636  */
2637
2638 #ifdef CONFIG_PROC_FS
2639
/*
 * Buffer bookkeeping record.
 * NOTE(review): nothing in the seq_file based /proc code of this section
 * references it; check the rest of the tree before relying on it.
 */
struct rt6_proc_arg {
	char	*buffer;
	int	offset;
	int	length;
	int	skip;
	int	len;
};
2648
2649 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2650 {
2651         struct seq_file *m = p_arg;
2652         struct neighbour *n;
2653
2654         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2655
2656 #ifdef CONFIG_IPV6_SUBTREES
2657         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2658 #else
2659         seq_puts(m, "00000000000000000000000000000000 00 ");
2660 #endif
2661         rcu_read_lock();
2662         n = dst_get_neighbour(&rt->dst);
2663         if (n) {
2664                 seq_printf(m, "%pi6", n->primary_key);
2665         } else {
2666                 seq_puts(m, "00000000000000000000000000000000");
2667         }
2668         rcu_read_unlock();
2669         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2670                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2671                    rt->dst.__use, rt->rt6i_flags,
2672                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2673         return 0;
2674 }
2675
2676 static int ipv6_route_show(struct seq_file *m, void *v)
2677 {
2678         struct net *net = (struct net *)m->private;
2679         fib6_clean_all(net, rt6_info_route, 0, m);
2680         return 0;
2681 }
2682
2683 static int ipv6_route_open(struct inode *inode, struct file *file)
2684 {
2685         return single_open_net(inode, file, ipv6_route_show);
2686 }
2687
2688 static const struct file_operations ipv6_route_proc_fops = {
2689         .owner          = THIS_MODULE,
2690         .open           = ipv6_route_open,
2691         .read           = seq_read,
2692         .llseek         = seq_lseek,
2693         .release        = single_release_net,
2694 };
2695
2696 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2697 {
2698         struct net *net = (struct net *)seq->private;
2699         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2700                    net->ipv6.rt6_stats->fib_nodes,
2701                    net->ipv6.rt6_stats->fib_route_nodes,
2702                    net->ipv6.rt6_stats->fib_rt_alloc,
2703                    net->ipv6.rt6_stats->fib_rt_entries,
2704                    net->ipv6.rt6_stats->fib_rt_cache,
2705                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2706                    net->ipv6.rt6_stats->fib_discarded_routes);
2707
2708         return 0;
2709 }
2710
2711 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2712 {
2713         return single_open_net(inode, file, rt6_stats_seq_show);
2714 }
2715
2716 static const struct file_operations rt6_stats_seq_fops = {
2717         .owner   = THIS_MODULE,
2718         .open    = rt6_stats_seq_open,
2719         .read    = seq_read,
2720         .llseek  = seq_lseek,
2721         .release = single_release_net,
2722 };
2723 #endif  /* CONFIG_PROC_FS */
2724
2725 #ifdef CONFIG_SYSCTL
2726
2727 static
2728 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2729                               void __user *buffer, size_t *lenp, loff_t *ppos)
2730 {
2731         struct net *net;
2732         int delay;
2733         if (!write)
2734                 return -EINVAL;
2735
2736         net = (struct net *)ctl->extra1;
2737         delay = net->ipv6.sysctl.flush_delay;
2738         proc_dointvec(ctl, write, buffer, lenp, ppos);
2739         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
2740         return 0;
2741 }
2742
2743 ctl_table ipv6_route_table_template[] = {
2744         {
2745                 .procname       =       "flush",
2746                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2747                 .maxlen         =       sizeof(int),
2748                 .mode           =       0200,
2749                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2750         },
2751         {
2752                 .procname       =       "gc_thresh",
2753                 .data           =       &ip6_dst_ops_template.gc_thresh,
2754                 .maxlen         =       sizeof(int),
2755                 .mode           =       0644,
2756                 .proc_handler   =       proc_dointvec,
2757         },
2758         {
2759                 .procname       =       "max_size",
2760                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2761                 .maxlen         =       sizeof(int),
2762                 .mode           =       0644,
2763                 .proc_handler   =       proc_dointvec,
2764         },
2765         {
2766                 .procname       =       "gc_min_interval",
2767                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2768                 .maxlen         =       sizeof(int),
2769                 .mode           =       0644,
2770                 .proc_handler   =       proc_dointvec_jiffies,
2771         },
2772         {
2773                 .procname       =       "gc_timeout",
2774                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2775                 .maxlen         =       sizeof(int),
2776                 .mode           =       0644,
2777                 .proc_handler   =       proc_dointvec_jiffies,
2778         },
2779         {
2780                 .procname       =       "gc_interval",
2781                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2782                 .maxlen         =       sizeof(int),
2783                 .mode           =       0644,
2784                 .proc_handler   =       proc_dointvec_jiffies,
2785         },
2786         {
2787                 .procname       =       "gc_elasticity",
2788                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2789                 .maxlen         =       sizeof(int),
2790                 .mode           =       0644,
2791                 .proc_handler   =       proc_dointvec,
2792         },
2793         {
2794                 .procname       =       "mtu_expires",
2795                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2796                 .maxlen         =       sizeof(int),
2797                 .mode           =       0644,
2798                 .proc_handler   =       proc_dointvec_jiffies,
2799         },
2800         {
2801                 .procname       =       "min_adv_mss",
2802                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2803                 .maxlen         =       sizeof(int),
2804                 .mode           =       0644,
2805                 .proc_handler   =       proc_dointvec,
2806         },
2807         {
2808                 .procname       =       "gc_min_interval_ms",
2809                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2810                 .maxlen         =       sizeof(int),
2811                 .mode           =       0644,
2812                 .proc_handler   =       proc_dointvec_ms_jiffies,
2813         },
2814         { }
2815 };
2816
2817 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2818 {
2819         struct ctl_table *table;
2820
2821         table = kmemdup(ipv6_route_table_template,
2822                         sizeof(ipv6_route_table_template),
2823                         GFP_KERNEL);
2824
2825         if (table) {
2826                 table[0].data = &net->ipv6.sysctl.flush_delay;
2827                 table[0].extra1 = net;
2828                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2829                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2830                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2831                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2832                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2833                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2834                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2835                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2836                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2837         }
2838
2839         return table;
2840 }
2841 #endif
2842
2843 static int __net_init ip6_route_net_init(struct net *net)
2844 {
2845         int ret = -ENOMEM;
2846
2847         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2848                sizeof(net->ipv6.ip6_dst_ops));
2849
2850         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2851                 goto out_ip6_dst_ops;
2852
2853         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2854                                            sizeof(*net->ipv6.ip6_null_entry),
2855                                            GFP_KERNEL);
2856         if (!net->ipv6.ip6_null_entry)
2857                 goto out_ip6_dst_entries;
2858         net->ipv6.ip6_null_entry->dst.path =
2859                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2860         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2861         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2862                          ip6_template_metrics, true);
2863
2864 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2865         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2866                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2867                                                GFP_KERNEL);
2868         if (!net->ipv6.ip6_prohibit_entry)
2869                 goto out_ip6_null_entry;
2870         net->ipv6.ip6_prohibit_entry->dst.path =
2871                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2872         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2873         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2874                          ip6_template_metrics, true);
2875
2876         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2877                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2878                                                GFP_KERNEL);
2879         if (!net->ipv6.ip6_blk_hole_entry)
2880                 goto out_ip6_prohibit_entry;
2881         net->ipv6.ip6_blk_hole_entry->dst.path =
2882                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2883         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2884         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2885                          ip6_template_metrics, true);
2886 #endif
2887
2888         net->ipv6.sysctl.flush_delay = 0;
2889         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2890         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2891         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2892         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2893         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2894         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2895         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2896
2897         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2898
2899         ret = 0;
2900 out:
2901         return ret;
2902
2903 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2904 out_ip6_prohibit_entry:
2905         kfree(net->ipv6.ip6_prohibit_entry);
2906 out_ip6_null_entry:
2907         kfree(net->ipv6.ip6_null_entry);
2908 #endif
2909 out_ip6_dst_entries:
2910         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2911 out_ip6_dst_ops:
2912         goto out;
2913 }
2914
2915 static void __net_exit ip6_route_net_exit(struct net *net)
2916 {
2917         kfree(net->ipv6.ip6_null_entry);
2918 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2919         kfree(net->ipv6.ip6_prohibit_entry);
2920         kfree(net->ipv6.ip6_blk_hole_entry);
2921 #endif
2922         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2923 }
2924
2925 static int __net_init ip6_route_net_init_late(struct net *net)
2926 {
2927 #ifdef CONFIG_PROC_FS
2928         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2929         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2930 #endif
2931         return 0;
2932 }
2933
2934 static void __net_exit ip6_route_net_exit_late(struct net *net)
2935 {
2936 #ifdef CONFIG_PROC_FS
2937         proc_net_remove(net, "ipv6_route");
2938         proc_net_remove(net, "rt6_stats");
2939 #endif
2940 }
2941
2942 static struct pernet_operations ip6_route_net_ops = {
2943         .init = ip6_route_net_init,
2944         .exit = ip6_route_net_exit,
2945 };
2946
2947 static struct pernet_operations ip6_route_net_late_ops = {
2948         .init = ip6_route_net_init_late,
2949         .exit = ip6_route_net_exit_late,
2950 };
2951
2952 static struct notifier_block ip6_route_dev_notifier = {
2953         .notifier_call = ip6_route_dev_notify,
2954         .priority = 0,
2955 };
2956
2957 int __init ip6_route_init(void)
2958 {
2959         int ret;
2960
2961         ret = -ENOMEM;
2962         ip6_dst_ops_template.kmem_cachep =
2963                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2964                                   SLAB_HWCACHE_ALIGN, NULL);
2965         if (!ip6_dst_ops_template.kmem_cachep)
2966                 goto out;
2967
2968         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2969         if (ret)
2970                 goto out_kmem_cache;
2971
2972         ret = register_pernet_subsys(&ip6_route_net_ops);
2973         if (ret)
2974                 goto out_dst_entries;
2975
2976         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2977
2978         /* Registering of the loopback is done before this portion of code,
2979          * the loopback reference in rt6_info will not be taken, do it
2980          * manually for init_net */
2981         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2982         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2983   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2984         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2985         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2986         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2987         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2988   #endif
2989         ret = fib6_init();
2990         if (ret)
2991                 goto out_register_subsys;
2992
2993         ret = xfrm6_init();
2994         if (ret)
2995                 goto out_fib6_init;
2996
2997         ret = fib6_rules_init();
2998         if (ret)
2999                 goto xfrm6_init;
3000
3001         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3002         if (ret)
3003                 goto fib6_rules_init;
3004
3005         ret = -ENOBUFS;
3006         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3007             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3008             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3009                 goto out_register_late_subsys;
3010
3011         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3012         if (ret)
3013                 goto out_register_late_subsys;
3014
3015 out:
3016         return ret;
3017
3018 out_register_late_subsys:
3019         unregister_pernet_subsys(&ip6_route_net_late_ops);
3020 fib6_rules_init:
3021         fib6_rules_cleanup();
3022 xfrm6_init:
3023         xfrm6_fini();
3024 out_fib6_init:
3025         fib6_gc_cleanup();
3026 out_register_subsys:
3027         unregister_pernet_subsys(&ip6_route_net_ops);
3028 out_dst_entries:
3029         dst_entries_destroy(&ip6_dst_blackhole_ops);
3030 out_kmem_cache:
3031         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3032         goto out;
3033 }
3034
3035 void ip6_route_cleanup(void)
3036 {
3037         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3038         unregister_pernet_subsys(&ip6_route_net_late_ops);
3039         fib6_rules_cleanup();
3040         xfrm6_fini();
3041         fib6_gc_cleanup();
3042         unregister_pernet_subsys(&ip6_route_net_ops);
3043         dst_entries_destroy(&ip6_dst_blackhole_ops);
3044         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3045 }