Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/ide
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
64 /*
 * Debug verbosity for this file.  Set to 3 to get tracing: at that level
 * RDBG() and RT6_TRACE() expand to printk() calls; at any lower level
 * both expand to nothing.
 */
65 #define RT6_DEBUG 2
66
67 #if RT6_DEBUG >= 3
/* RDBG pastes its argument straight after printk, so the argument must be
 * a parenthesised argument list: RDBG(("fmt", args)).
 */
68 #define RDBG(x) printk x
/* RT6_TRACE takes an ordinary printk argument list, logged at KERN_DEBUG. */
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74
75 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
76                                     const struct in6_addr *dest);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
79 static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void             ip6_dst_destroy(struct dst_entry *);
82 static void             ip6_dst_ifdown(struct dst_entry *,
83                                        struct net_device *dev, int how);
84 static int               ip6_dst_gc(struct dst_ops *ops);
85
86 static int              ip6_pkt_discard(struct sk_buff *skb);
87 static int              ip6_pkt_discard_out(struct sk_buff *skb);
88 static void             ip6_link_failure(struct sk_buff *skb);
89 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
90
91 #ifdef CONFIG_IPV6_ROUTE_INFO
92 static struct rt6_info *rt6_add_route_info(struct net *net,
93                                            const struct in6_addr *prefix, int prefixlen,
94                                            const struct in6_addr *gwaddr, int ifindex,
95                                            unsigned pref);
96 static struct rt6_info *rt6_get_route_info(struct net *net,
97                                            const struct in6_addr *prefix, int prefixlen,
98                                            const struct in6_addr *gwaddr, int ifindex);
99 #endif
100
101 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
102 {
103         struct rt6_info *rt = (struct rt6_info *) dst;
104         struct inet_peer *peer;
105         u32 *p = NULL;
106
107         if (!rt->rt6i_peer)
108                 rt6_bind_peer(rt, 1);
109
110         peer = rt->rt6i_peer;
111         if (peer) {
112                 u32 *old_p = __DST_METRICS_PTR(old);
113                 unsigned long prev, new;
114
115                 p = peer->metrics;
116                 if (inet_metrics_new(peer))
117                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
118
119                 new = (unsigned long) p;
120                 prev = cmpxchg(&dst->_metrics, old, new);
121
122                 if (prev != old) {
123                         p = __DST_METRICS_PTR(prev);
124                         if (prev & DST_METRICS_READ_ONLY)
125                                 p = NULL;
126                 }
127         }
128         return p;
129 }
130
131 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
132 {
133         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
134 }
135
136 static struct dst_ops ip6_dst_ops_template = {
137         .family                 =       AF_INET6,
138         .protocol               =       cpu_to_be16(ETH_P_IPV6),
139         .gc                     =       ip6_dst_gc,
140         .gc_thresh              =       1024,
141         .check                  =       ip6_dst_check,
142         .default_advmss         =       ip6_default_advmss,
143         .default_mtu            =       ip6_default_mtu,
144         .cow_metrics            =       ipv6_cow_metrics,
145         .destroy                =       ip6_dst_destroy,
146         .ifdown                 =       ip6_dst_ifdown,
147         .negative_advice        =       ip6_negative_advice,
148         .link_failure           =       ip6_link_failure,
149         .update_pmtu            =       ip6_rt_update_pmtu,
150         .local_out              =       __ip6_local_out,
151         .neigh_lookup           =       ip6_neigh_lookup,
152 };
153
/* default_mtu hook for blackhole dsts: always reports 0. */
static unsigned int
ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0;
}
158
159 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
160 {
161 }
162
163 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
164                                          unsigned long old)
165 {
166         return NULL;
167 }
168
169 static struct dst_ops ip6_dst_blackhole_ops = {
170         .family                 =       AF_INET6,
171         .protocol               =       cpu_to_be16(ETH_P_IPV6),
172         .destroy                =       ip6_dst_destroy,
173         .check                  =       ip6_dst_check,
174         .default_mtu            =       ip6_blackhole_default_mtu,
175         .default_advmss         =       ip6_default_advmss,
176         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
177         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
178         .neigh_lookup           =       ip6_neigh_lookup,
179 };
180
181 static const u32 ip6_template_metrics[RTAX_MAX] = {
182         [RTAX_HOPLIMIT - 1] = 255,
183 };
184
185 static struct rt6_info ip6_null_entry_template = {
186         .dst = {
187                 .__refcnt       = ATOMIC_INIT(1),
188                 .__use          = 1,
189                 .obsolete       = -1,
190                 .error          = -ENETUNREACH,
191                 .input          = ip6_pkt_discard,
192                 .output         = ip6_pkt_discard_out,
193         },
194         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
195         .rt6i_protocol  = RTPROT_KERNEL,
196         .rt6i_metric    = ~(u32) 0,
197         .rt6i_ref       = ATOMIC_INIT(1),
198 };
199
200 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
201
202 static int ip6_pkt_prohibit(struct sk_buff *skb);
203 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
204
205 static struct rt6_info ip6_prohibit_entry_template = {
206         .dst = {
207                 .__refcnt       = ATOMIC_INIT(1),
208                 .__use          = 1,
209                 .obsolete       = -1,
210                 .error          = -EACCES,
211                 .input          = ip6_pkt_prohibit,
212                 .output         = ip6_pkt_prohibit_out,
213         },
214         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
215         .rt6i_protocol  = RTPROT_KERNEL,
216         .rt6i_metric    = ~(u32) 0,
217         .rt6i_ref       = ATOMIC_INIT(1),
218 };
219
220 static struct rt6_info ip6_blk_hole_entry_template = {
221         .dst = {
222                 .__refcnt       = ATOMIC_INIT(1),
223                 .__use          = 1,
224                 .obsolete       = -1,
225                 .error          = -EINVAL,
226                 .input          = dst_discard,
227                 .output         = dst_discard,
228         },
229         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
230         .rt6i_protocol  = RTPROT_KERNEL,
231         .rt6i_metric    = ~(u32) 0,
232         .rt6i_ref       = ATOMIC_INIT(1),
233 };
234
235 #endif
236
237 /* allocate dst with ip6_dst_ops */
238 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
239                                              struct net_device *dev,
240                                              int flags)
241 {
242         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
243
244         memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
245
246         return rt;
247 }
248
249 static void ip6_dst_destroy(struct dst_entry *dst)
250 {
251         struct rt6_info *rt = (struct rt6_info *)dst;
252         struct inet6_dev *idev = rt->rt6i_idev;
253         struct inet_peer *peer = rt->rt6i_peer;
254
255         if (idev != NULL) {
256                 rt->rt6i_idev = NULL;
257                 in6_dev_put(idev);
258         }
259         if (peer) {
260                 rt->rt6i_peer = NULL;
261                 inet_putpeer(peer);
262         }
263 }
264
265 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
266
267 static u32 rt6_peer_genid(void)
268 {
269         return atomic_read(&__rt6_peer_genid);
270 }
271
272 void rt6_bind_peer(struct rt6_info *rt, int create)
273 {
274         struct inet_peer *peer;
275
276         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
277         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
278                 inet_putpeer(peer);
279         else
280                 rt->rt6i_peer_genid = rt6_peer_genid();
281 }
282
283 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
284                            int how)
285 {
286         struct rt6_info *rt = (struct rt6_info *)dst;
287         struct inet6_dev *idev = rt->rt6i_idev;
288         struct net_device *loopback_dev =
289                 dev_net(dev)->loopback_dev;
290
291         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
292                 struct inet6_dev *loopback_idev =
293                         in6_dev_get(loopback_dev);
294                 if (loopback_idev != NULL) {
295                         rt->rt6i_idev = loopback_idev;
296                         in6_dev_put(idev);
297                 }
298         }
299 }
300
301 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
302 {
303         return (rt->rt6i_flags & RTF_EXPIRES) &&
304                 time_after(jiffies, rt->rt6i_expires);
305 }
306
307 static inline int rt6_need_strict(const struct in6_addr *daddr)
308 {
309         return ipv6_addr_type(daddr) &
310                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
311 }
312
313 /*
314  *      Route lookup. Any table->tb6_lock is implied.
315  */
316
/*
 * Pick, from the list of routes starting at @rt (linked through
 * dst.rt6_next), the one that fits the requested interface or source
 * address.
 *
 * @net:   namespace; supplies ip6_null_entry and is passed to ipv6_chk_addr()
 * @rt:    head of the route list to scan
 * @saddr: source address, consulted only when @oif is 0
 * @oif:   required interface index, or 0 for "none given"
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is looked at here
 *
 * Returns:
 *  - @rt itself when neither an interface nor a source address is given;
 *  - with @oif set: the first route whose device has that ifindex, else
 *    the remembered loopback-device candidate, else ip6_null_entry when
 *    RT6_LOOKUP_F_IFACE is set, else @rt;
 *  - with @oif 0: the first route for which ipv6_chk_addr() accepts @saddr
 *    on the route's device, else @rt.
 */
317 static inline struct rt6_info *rt6_device_match(struct net *net,
318                                                     struct rt6_info *rt,
319                                                     const struct in6_addr *saddr,
320                                                     int oif,
321                                                     int flags)
322 {
323         struct rt6_info *local = NULL;
324         struct rt6_info *sprt;
325
326         if (!oif && ipv6_addr_any(saddr))
327                 goto out;
328
329         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
330                 struct net_device *dev = sprt->rt6i_dev;
331
332                 if (oif) {
333                         if (dev->ifindex == oif)
334                                 return sprt;
                        /* Routes on a loopback device are fallback candidates. */
335                         if (dev->flags & IFF_LOOPBACK) {
                                /* The route's inet6_dev is not on the wanted interface:
                                 * skip it for strict lookups, and also when the candidate
                                 * we already hold has its inet6_dev on the wanted interface.
                                 * (oif is non-zero in this branch, so the "&& oif" and
                                 * "!oif" terms below never change the outcome.)
                                 */
336                                 if (sprt->rt6i_idev == NULL ||
337                                     sprt->rt6i_idev->dev->ifindex != oif) {
338                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
339                                                 continue;
340                                         if (local && (!oif ||
341                                                       local->rt6i_idev->dev->ifindex == oif))
342                                                 continue;
343                                 }
344                                 local = sprt;
345                         }
346                 } else {
347                         if (ipv6_chk_addr(net, saddr, dev,
348                                           flags & RT6_LOOKUP_F_IFACE))
349                                 return sprt;
350                 }
351         }
352
353         if (oif) {
354                 if (local)
355                         return local;
356
357                 if (flags & RT6_LOOKUP_F_IFACE)
358                         return net->ipv6.ip6_null_entry;
359         }
360 out:
361         return rt;
362 }
363
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router reachability probing.
 *
 * If @rt has a neighbour entry that is not in a NUD_VALID state and the
 * entry was last updated more than the interface's rtr_probe_interval
 * ago, stamp the entry with the current time and send a neighbour
 * solicitation for it to its solicited-node multicast address.  The
 * interval check is what rate-limits the probes.
 *
 * Does nothing for a NULL @rt, a route without a neighbour, or a
 * neighbour that is already valid.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int due;

	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/* Re-check the state and test the interval under the neighbour lock. */
	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
398
399 /*
400  * Default Router Selection (RFC 2461 6.3.6)
401  */
402 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
403 {
404         struct net_device *dev = rt->rt6i_dev;
405         if (!oif || dev->ifindex == oif)
406                 return 2;
407         if ((dev->flags & IFF_LOOPBACK) &&
408             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
409                 return 1;
410         return 0;
411 }
412
413 static inline int rt6_check_neigh(struct rt6_info *rt)
414 {
415         struct neighbour *neigh = dst_get_neighbour(&rt->dst);
416         int m;
417         if (rt->rt6i_flags & RTF_NONEXTHOP ||
418             !(rt->rt6i_flags & RTF_GATEWAY))
419                 m = 1;
420         else if (neigh) {
421                 read_lock_bh(&neigh->lock);
422                 if (neigh->nud_state & NUD_VALID)
423                         m = 2;
424 #ifdef CONFIG_IPV6_ROUTER_PREF
425                 else if (neigh->nud_state & NUD_FAILED)
426                         m = 0;
427 #endif
428                 else
429                         m = 1;
430                 read_unlock_bh(&neigh->lock);
431         } else
432                 m = 0;
433         return m;
434 }
435
436 static int rt6_score_route(struct rt6_info *rt, int oif,
437                            int strict)
438 {
439         int m, n;
440
441         m = rt6_check_dev(rt, oif);
442         if (!m && (strict & RT6_LOOKUP_F_IFACE))
443                 return -1;
444 #ifdef CONFIG_IPV6_ROUTER_PREF
445         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
446 #endif
447         n = rt6_check_neigh(rt);
448         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
449                 return -1;
450         return m;
451 }
452
453 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
454                                    int *mpri, struct rt6_info *match)
455 {
456         int m;
457
458         if (rt6_check_expired(rt))
459                 goto out;
460
461         m = rt6_score_route(rt, oif, strict);
462         if (m < 0)
463                 goto out;
464
465         if (m > *mpri) {
466                 if (strict & RT6_LOOKUP_F_REACHABLE)
467                         rt6_probe(match);
468                 *mpri = m;
469                 match = rt;
470         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
471                 rt6_probe(rt);
472         }
473
474 out:
475         return match;
476 }
477
478 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
479                                      struct rt6_info *rr_head,
480                                      u32 metric, int oif, int strict)
481 {
482         struct rt6_info *rt, *match;
483         int mpri = -1;
484
485         match = NULL;
486         for (rt = rr_head; rt && rt->rt6i_metric == metric;
487              rt = rt->dst.rt6_next)
488                 match = find_match(rt, oif, strict, &mpri, match);
489         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
490              rt = rt->dst.rt6_next)
491                 match = find_match(rt, oif, strict, &mpri, match);
492
493         return match;
494 }
495
496 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
497 {
498         struct rt6_info *match, *rt0;
499         struct net *net;
500
501         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
502                   __func__, fn->leaf, oif);
503
504         rt0 = fn->rr_ptr;
505         if (!rt0)
506                 fn->rr_ptr = rt0 = fn->leaf;
507
508         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
509
510         if (!match &&
511             (strict & RT6_LOOKUP_F_REACHABLE)) {
512                 struct rt6_info *next = rt0->dst.rt6_next;
513
514                 /* no entries matched; do round-robin */
515                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
516                         next = fn->leaf;
517
518                 if (next != rt0)
519                         fn->rr_ptr = next;
520         }
521
522         RT6_TRACE("%s() => %p\n",
523                   __func__, match);
524
525         net = dev_net(rt0->rt6i_dev);
526         return match ? match : net->ipv6.ip6_null_entry;
527 }
528
529 #ifdef CONFIG_IPV6_ROUTE_INFO
530 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
531                   const struct in6_addr *gwaddr)
532 {
533         struct net *net = dev_net(dev);
534         struct route_info *rinfo = (struct route_info *) opt;
535         struct in6_addr prefix_buf, *prefix;
536         unsigned int pref;
537         unsigned long lifetime;
538         struct rt6_info *rt;
539
540         if (len < sizeof(struct route_info)) {
541                 return -EINVAL;
542         }
543
544         /* Sanity check for prefix_len and length */
545         if (rinfo->length > 3) {
546                 return -EINVAL;
547         } else if (rinfo->prefix_len > 128) {
548                 return -EINVAL;
549         } else if (rinfo->prefix_len > 64) {
550                 if (rinfo->length < 2) {
551                         return -EINVAL;
552                 }
553         } else if (rinfo->prefix_len > 0) {
554                 if (rinfo->length < 1) {
555                         return -EINVAL;
556                 }
557         }
558
559         pref = rinfo->route_pref;
560         if (pref == ICMPV6_ROUTER_PREF_INVALID)
561                 return -EINVAL;
562
563         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
564
565         if (rinfo->length == 3)
566                 prefix = (struct in6_addr *)rinfo->prefix;
567         else {
568                 /* this function is safe */
569                 ipv6_addr_prefix(&prefix_buf,
570                                  (struct in6_addr *)rinfo->prefix,
571                                  rinfo->prefix_len);
572                 prefix = &prefix_buf;
573         }
574
575         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
576                                 dev->ifindex);
577
578         if (rt && !lifetime) {
579                 ip6_del_rt(rt);
580                 rt = NULL;
581         }
582
583         if (!rt && lifetime)
584                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
585                                         pref);
586         else if (rt)
587                 rt->rt6i_flags = RTF_ROUTEINFO |
588                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
589
590         if (rt) {
591                 if (!addrconf_finite_timeout(lifetime)) {
592                         rt->rt6i_flags &= ~RTF_EXPIRES;
593                 } else {
594                         rt->rt6i_expires = jiffies + HZ * lifetime;
595                         rt->rt6i_flags |= RTF_EXPIRES;
596                 }
597                 dst_release(&rt->dst);
598         }
599         return 0;
600 }
601 #endif
602
/*
 * BACKTRACK(__net, saddr) - climb towards the tree root after a failed match.
 *
 * Not self-contained: it expands inside a lookup function and uses that
 * function's local variables `rt` and `fn` and its labels `restart` and
 * `out`.
 *
 * Does nothing unless `rt` is __net's ip6_null_entry.  Otherwise it loops:
 *   - jump to `out` if `fn` carries RTN_TL_ROOT;
 *   - step to the parent node, or, when the parent has a subtree other
 *     than `fn`, to the result of fib6_lookup() in that subtree keyed by
 *     saddr;
 *   - jump to `restart` as soon as the new `fn` has RTN_RTINFO set.
 */
603 #define BACKTRACK(__net, saddr)                 \
604 do { \
605         if (rt == __net->ipv6.ip6_null_entry) { \
606                 struct fib6_node *pn; \
607                 while (1) { \
608                         if (fn->fn_flags & RTN_TL_ROOT) \
609                                 goto out; \
610                         pn = fn->parent; \
611                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
612                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
613                         else \
614                                 fn = pn; \
615                         if (fn->fn_flags & RTN_RTINFO) \
616                                 goto restart; \
617                 } \
618         } \
619 } while(0)
620
/*
 * Per-table lookup callback used by rt6_lookup() via fib6_rule_lookup().
 *
 * Under the table's read lock: find the node for the flow's destination
 * and source, pick a route from its leaf list with rt6_device_match(), and
 * let BACKTRACK() retry on a less specific node while the result is the
 * null entry.  The returned route has been passed through dst_use()
 * before the lock is dropped.
 */
621 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
622                                              struct fib6_table *table,
623                                              struct flowi6 *fl6, int flags)
624 {
625         struct fib6_node *fn;
626         struct rt6_info *rt;
627
628         read_lock_bh(&table->tb6_lock);
629         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        /* BACKTRACK() jumps back here with fn moved to another node. */
630 restart:
631         rt = fn->leaf;
632         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
633         BACKTRACK(net, &fl6->saddr);
        /* Reached by falling through, or from BACKTRACK() at the tree root. */
634 out:
635         dst_use(&rt->dst, jiffies);
636         read_unlock_bh(&table->tb6_lock);
637         return rt;
638
639 }
640
641 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
642                             const struct in6_addr *saddr, int oif, int strict)
643 {
644         struct flowi6 fl6 = {
645                 .flowi6_oif = oif,
646                 .daddr = *daddr,
647         };
648         struct dst_entry *dst;
649         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
650
651         if (saddr) {
652                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
653                 flags |= RT6_LOOKUP_F_HAS_SADDR;
654         }
655
656         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
657         if (dst->error == 0)
658                 return (struct rt6_info *) dst;
659
660         dst_release(dst);
661
662         return NULL;
663 }
664
665 EXPORT_SYMBOL(rt6_lookup);
666
667 /* ip6_ins_rt is called with table->tb6_lock NOT held.
668    It takes a new route entry; if the addition fails for any reason,
669    the route is freed. In any case, if the caller does not hold it, it
670    may be destroyed.
671  */
672
673 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
674 {
675         int err;
676         struct fib6_table *table;
677
678         table = rt->rt6i_table;
679         write_lock_bh(&table->tb6_lock);
680         err = fib6_add(&table->tb6_root, rt, info);
681         write_unlock_bh(&table->tb6_lock);
682
683         return err;
684 }
685
686 int ip6_ins_rt(struct rt6_info *rt)
687 {
688         struct nl_info info = {
689                 .nl_net = dev_net(rt->rt6i_dev),
690         };
691         return __ip6_ins_rt(rt, &info);
692 }
693
/*
 * Build a host (/128) RTF_CACHE copy of @ort for destination @daddr and
 * attach a neighbour entry for its gateway.
 *
 * @ort:   route to copy
 * @daddr: destination the copy is for
 * @saddr: source address; with CONFIG_IPV6_SUBTREES it replaces the copy's
 *         source key when the copy has a non-zero source prefix length
 *
 * Returns the new route, or NULL when ip6_rt_copy() fails or no neighbour
 * entry can be obtained (in which case the copy is freed).
 */
694 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
695                                       const struct in6_addr *daddr,
696                                       const struct in6_addr *saddr)
697 {
698         struct rt6_info *rt;
699
700         /*
701          *      Clone the route.
702          */
703
704         rt = ip6_rt_copy(ort, daddr);
705
706         if (rt) {
707                 struct neighbour *neigh;
                /* One GC-and-retry is allowed, but only outside softirq context. */
708                 int attempts = !in_softirq();
709
                /* Non-gateway route: the destination itself is the next hop.
                 * It is flagged anycast when the copy is not yet a /128 and
                 * daddr equals the original route's prefix address.
                 */
710                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
711                         if (rt->rt6i_dst.plen != 128 &&
712                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
713                                 rt->rt6i_flags |= RTF_ANYCAST;
714                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
715                 }
716
717                 rt->rt6i_dst.plen = 128;
718                 rt->rt6i_flags |= RTF_CACHE;
719                 rt->dst.flags |= DST_HOST;
720
721 #ifdef CONFIG_IPV6_SUBTREES
722                 if (rt->rt6i_src.plen && saddr) {
723                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
724                         rt->rt6i_src.plen = 128;
725                 }
726 #endif
727
728         retry:
729                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
730                 if (IS_ERR(neigh)) {
731                         struct net *net = dev_net(rt->rt6i_dev);
732                         int saved_rt_min_interval =
733                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
734                         int saved_rt_elasticity =
735                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
736
                        /* Run the route GC once with elasticity 1 and no minimum
                         * interval, put the saved sysctl values back, then retry
                         * the neighbour lookup.
                         */
737                         if (attempts-- > 0) {
738                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
739                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
740
741                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
742
743                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
744                                         saved_rt_elasticity;
745                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
746                                         saved_rt_min_interval;
747                                 goto retry;
748                         }
749
                        /* Out of retries: warn (rate-limited) and drop the copy. */
750                         if (net_ratelimit())
751                                 printk(KERN_WARNING
752                                        "ipv6: Neighbour table overflow.\n");
753                         dst_free(&rt->dst);
754                         return NULL;
755                 }
756                 dst_set_neighbour(&rt->dst, neigh);
757
758         }
759
760         return rt;
761 }
762
763 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
764                                         const struct in6_addr *daddr)
765 {
766         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
767
768         if (rt) {
769                 rt->rt6i_dst.plen = 128;
770                 rt->rt6i_flags |= RTF_CACHE;
771                 rt->dst.flags |= DST_HOST;
772                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour(&ort->dst)));
773         }
774         return rt;
775 }
776
/*
 * Core per-table route lookup shared by the input and output paths.
 *
 * @net:   namespace
 * @table: FIB table to search
 * @oif:   interface index to match (the callers in this file pass the
 *         flow's iif for input and its oif for output)
 * @fl6:   flow being routed
 * @flags: RT6_LOOKUP_F_* flags; RT6_LOOKUP_F_IFACE is honoured here
 *
 * Selects a route with rt6_select().  A result that is neither the null
 * entry nor already an RTF_CACHE entry is turned into a per-destination
 * copy (rt6_alloc_cow() or rt6_alloc_clone()) which is inserted into the
 * table; a route that already has DST_HOST set is returned as is.
 *
 * Every return path has taken a reference on the returned route with
 * dst_hold() and updates its lastuse/__use counters.
 */
777 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
778                                       struct flowi6 *fl6, int flags)
779 {
780         struct fib6_node *fn;
781         struct rt6_info *rt, *nrt;
782         int strict = 0;
783         int attempts = 3;
784         int err;
        /* Require a reachable next hop on the first pass unless forwarding
         * is enabled in devconf_all.
         */
785         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
786
787         strict |= flags & RT6_LOOKUP_F_IFACE;
788
789 relookup:
790         read_lock_bh(&table->tb6_lock);
791
792 restart_2:
793         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
794
795 restart:
796         rt = rt6_select(fn, oif, strict | reachable);
797
798         BACKTRACK(net, &fl6->saddr);
799         if (rt == net->ipv6.ip6_null_entry ||
800             rt->rt6i_flags & RTF_CACHE)
801                 goto out;
802
        /* Pin the route, then drop the lock before building the copy. */
803         dst_hold(&rt->dst);
804         read_unlock_bh(&table->tb6_lock);
805
806         if (!dst_get_neighbour(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
807                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
808         else if (!(rt->dst.flags & DST_HOST))
809                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
810         else
811                 goto out2;
812
        /* Swap our reference from the original route to the copy, or to the
         * null entry if the copy could not be allocated.
         */
813         dst_release(&rt->dst);
814         rt = nrt ? : net->ipv6.ip6_null_entry;
815
816         dst_hold(&rt->dst);
817         if (nrt) {
818                 err = ip6_ins_rt(nrt);
819                 if (!err)
820                         goto out2;
821         }
822
        /* Allocation or insertion failed; give up after three tries in total. */
823         if (--attempts <= 0)
824                 goto out2;
825
826         /*
827          * Race condition! In the gap, when table->tb6_lock was
828          * released someone could insert this route.  Relookup.
829          */
830         dst_release(&rt->dst);
831         goto relookup;
832
833 out:
        /* First pass done with the reachability requirement: redo the lookup
         * once without it (the read lock is still held here).
         */
834         if (reachable) {
835                 reachable = 0;
836                 goto restart_2;
837         }
838         dst_hold(&rt->dst);
839         read_unlock_bh(&table->tb6_lock);
840 out2:
841         rt->dst.lastuse = jiffies;
842         rt->dst.__use++;
843
844         return rt;
845 }
846
847 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
848                                             struct flowi6 *fl6, int flags)
849 {
850         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
851 }
852
853 void ip6_route_input(struct sk_buff *skb)
854 {
855         const struct ipv6hdr *iph = ipv6_hdr(skb);
856         struct net *net = dev_net(skb->dev);
857         int flags = RT6_LOOKUP_F_HAS_SADDR;
858         struct flowi6 fl6 = {
859                 .flowi6_iif = skb->dev->ifindex,
860                 .daddr = iph->daddr,
861                 .saddr = iph->saddr,
862                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
863                 .flowi6_mark = skb->mark,
864                 .flowi6_proto = iph->nexthdr,
865         };
866
867         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
868                 flags |= RT6_LOOKUP_F_IFACE;
869
870         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
871 }
872
873 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
874                                              struct flowi6 *fl6, int flags)
875 {
876         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
877 }
878
879 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
880                                     struct flowi6 *fl6)
881 {
882         int flags = 0;
883
884         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
885                 flags |= RT6_LOOKUP_F_IFACE;
886
887         if (!ipv6_addr_any(&fl6->saddr))
888                 flags |= RT6_LOOKUP_F_HAS_SADDR;
889         else if (sk)
890                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
891
892         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
893 }
894
895 EXPORT_SYMBOL(ip6_route_output);
896
897 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
898 {
899         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
900         struct dst_entry *new = NULL;
901
902         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
903         if (rt) {
904                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
905
906                 new = &rt->dst;
907
908                 new->__use = 1;
909                 new->input = dst_discard;
910                 new->output = dst_discard;
911
912                 if (dst_metrics_read_only(&ort->dst))
913                         new->_metrics = ort->dst._metrics;
914                 else
915                         dst_copy_metrics(new, &ort->dst);
916                 rt->rt6i_idev = ort->rt6i_idev;
917                 if (rt->rt6i_idev)
918                         in6_dev_hold(rt->rt6i_idev);
919                 rt->rt6i_expires = 0;
920
921                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
922                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
923                 rt->rt6i_metric = 0;
924
925                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
926 #ifdef CONFIG_IPV6_SUBTREES
927                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
928 #endif
929
930                 dst_free(new);
931         }
932
933         dst_release(dst_orig);
934         return new ? new : ERR_PTR(-ENOMEM);
935 }
936
937 /*
938  *      Destination cache support functions
939  */
940
941 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
942 {
943         struct rt6_info *rt;
944
945         rt = (struct rt6_info *) dst;
946
947         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
948                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
949                         if (!rt->rt6i_peer)
950                                 rt6_bind_peer(rt, 0);
951                         rt->rt6i_peer_genid = rt6_peer_genid();
952                 }
953                 return dst;
954         }
955         return NULL;
956 }
957
958 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
959 {
960         struct rt6_info *rt = (struct rt6_info *) dst;
961
962         if (rt) {
963                 if (rt->rt6i_flags & RTF_CACHE) {
964                         if (rt6_check_expired(rt)) {
965                                 ip6_del_rt(rt);
966                                 dst = NULL;
967                         }
968                 } else {
969                         dst_release(dst);
970                         dst = NULL;
971                 }
972         }
973         return dst;
974 }
975
976 static void ip6_link_failure(struct sk_buff *skb)
977 {
978         struct rt6_info *rt;
979
980         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
981
982         rt = (struct rt6_info *) skb_dst(skb);
983         if (rt) {
984                 if (rt->rt6i_flags&RTF_CACHE) {
985                         dst_set_expires(&rt->dst, 0);
986                         rt->rt6i_flags |= RTF_EXPIRES;
987                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
988                         rt->rt6i_node->fn_sernum = -1;
989         }
990 }
991
992 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
993 {
994         struct rt6_info *rt6 = (struct rt6_info*)dst;
995
996         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
997                 rt6->rt6i_flags |= RTF_MODIFIED;
998                 if (mtu < IPV6_MIN_MTU) {
999                         u32 features = dst_metric(dst, RTAX_FEATURES);
1000                         mtu = IPV6_MIN_MTU;
1001                         features |= RTAX_FEATURE_ALLFRAG;
1002                         dst_metric_set(dst, RTAX_FEATURES, features);
1003                 }
1004                 dst_metric_set(dst, RTAX_MTU, mtu);
1005         }
1006 }
1007
1008 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1009 {
1010         struct net_device *dev = dst->dev;
1011         unsigned int mtu = dst_mtu(dst);
1012         struct net *net = dev_net(dev);
1013
1014         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1015
1016         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1017                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1018
1019         /*
1020          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1021          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1022          * IPV6_MAXPLEN is also valid and means: "any MSS,
1023          * rely only on pmtu discovery"
1024          */
1025         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1026                 mtu = IPV6_MAXPLEN;
1027         return mtu;
1028 }
1029
1030 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1031 {
1032         unsigned int mtu = IPV6_MIN_MTU;
1033         struct inet6_dev *idev;
1034
1035         rcu_read_lock();
1036         idev = __in6_dev_get(dst->dev);
1037         if (idev)
1038                 mtu = idev->cnf.mtu6;
1039         rcu_read_unlock();
1040
1041         return mtu;
1042 }
1043
1044 static struct dst_entry *icmp6_dst_gc_list;
1045 static DEFINE_SPINLOCK(icmp6_dst_lock);
1046
1047 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1048                                   struct neighbour *neigh,
1049                                   const struct in6_addr *addr)
1050 {
1051         struct rt6_info *rt;
1052         struct inet6_dev *idev = in6_dev_get(dev);
1053         struct net *net = dev_net(dev);
1054
1055         if (unlikely(idev == NULL))
1056                 return NULL;
1057
1058         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1059         if (unlikely(rt == NULL)) {
1060                 in6_dev_put(idev);
1061                 goto out;
1062         }
1063
1064         if (neigh)
1065                 neigh_hold(neigh);
1066         else {
1067                 neigh = ndisc_get_neigh(dev, addr);
1068                 if (IS_ERR(neigh))
1069                         neigh = NULL;
1070         }
1071
1072         rt->rt6i_idev     = idev;
1073         dst_set_neighbour(&rt->dst, neigh);
1074         atomic_set(&rt->dst.__refcnt, 1);
1075         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1076         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1077         rt->dst.output  = ip6_output;
1078
1079         spin_lock_bh(&icmp6_dst_lock);
1080         rt->dst.next = icmp6_dst_gc_list;
1081         icmp6_dst_gc_list = &rt->dst;
1082         spin_unlock_bh(&icmp6_dst_lock);
1083
1084         fib6_force_start_gc(net);
1085
1086 out:
1087         return &rt->dst;
1088 }
1089
1090 int icmp6_dst_gc(void)
1091 {
1092         struct dst_entry *dst, **pprev;
1093         int more = 0;
1094
1095         spin_lock_bh(&icmp6_dst_lock);
1096         pprev = &icmp6_dst_gc_list;
1097
1098         while ((dst = *pprev) != NULL) {
1099                 if (!atomic_read(&dst->__refcnt)) {
1100                         *pprev = dst->next;
1101                         dst_free(dst);
1102                 } else {
1103                         pprev = &dst->next;
1104                         ++more;
1105                 }
1106         }
1107
1108         spin_unlock_bh(&icmp6_dst_lock);
1109
1110         return more;
1111 }
1112
1113 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1114                             void *arg)
1115 {
1116         struct dst_entry *dst, **pprev;
1117
1118         spin_lock_bh(&icmp6_dst_lock);
1119         pprev = &icmp6_dst_gc_list;
1120         while ((dst = *pprev) != NULL) {
1121                 struct rt6_info *rt = (struct rt6_info *) dst;
1122                 if (func(rt, arg)) {
1123                         *pprev = dst->next;
1124                         dst_free(dst);
1125                 } else {
1126                         pprev = &dst->next;
1127                 }
1128         }
1129         spin_unlock_bh(&icmp6_dst_lock);
1130 }
1131
1132 static int ip6_dst_gc(struct dst_ops *ops)
1133 {
1134         unsigned long now = jiffies;
1135         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1136         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1137         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1138         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1139         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1140         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1141         int entries;
1142
1143         entries = dst_entries_get_fast(ops);
1144         if (time_after(rt_last_gc + rt_min_interval, now) &&
1145             entries <= rt_max_size)
1146                 goto out;
1147
1148         net->ipv6.ip6_rt_gc_expire++;
1149         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1150         net->ipv6.ip6_rt_last_gc = now;
1151         entries = dst_entries_get_slow(ops);
1152         if (entries < ops->gc_thresh)
1153                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1154 out:
1155         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1156         return entries > rt_max_size;
1157 }
1158
1159 /* Clean host part of a prefix. Not necessary in radix tree,
1160    but results in cleaner routing tables.
1161
1162    Remove it only when all the things will work!
1163  */
1164
1165 int ip6_dst_hoplimit(struct dst_entry *dst)
1166 {
1167         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1168         if (hoplimit == 0) {
1169                 struct net_device *dev = dst->dev;
1170                 struct inet6_dev *idev;
1171
1172                 rcu_read_lock();
1173                 idev = __in6_dev_get(dev);
1174                 if (idev)
1175                         hoplimit = idev->cnf.hop_limit;
1176                 else
1177                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1178                 rcu_read_unlock();
1179         }
1180         return hoplimit;
1181 }
1182 EXPORT_SYMBOL(ip6_dst_hoplimit);
1183
1184 /*
1185  *
1186  */
1187
1188 int ip6_route_add(struct fib6_config *cfg)
1189 {
1190         int err;
1191         struct net *net = cfg->fc_nlinfo.nl_net;
1192         struct rt6_info *rt = NULL;
1193         struct net_device *dev = NULL;
1194         struct inet6_dev *idev = NULL;
1195         struct fib6_table *table;
1196         int addr_type;
1197
1198         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1199                 return -EINVAL;
1200 #ifndef CONFIG_IPV6_SUBTREES
1201         if (cfg->fc_src_len)
1202                 return -EINVAL;
1203 #endif
1204         if (cfg->fc_ifindex) {
1205                 err = -ENODEV;
1206                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1207                 if (!dev)
1208                         goto out;
1209                 idev = in6_dev_get(dev);
1210                 if (!idev)
1211                         goto out;
1212         }
1213
1214         if (cfg->fc_metric == 0)
1215                 cfg->fc_metric = IP6_RT_PRIO_USER;
1216
1217         table = fib6_new_table(net, cfg->fc_table);
1218         if (table == NULL) {
1219                 err = -ENOBUFS;
1220                 goto out;
1221         }
1222
1223         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1224
1225         if (rt == NULL) {
1226                 err = -ENOMEM;
1227                 goto out;
1228         }
1229
1230         rt->dst.obsolete = -1;
1231         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1232                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1233                                 0;
1234
1235         if (cfg->fc_protocol == RTPROT_UNSPEC)
1236                 cfg->fc_protocol = RTPROT_BOOT;
1237         rt->rt6i_protocol = cfg->fc_protocol;
1238
1239         addr_type = ipv6_addr_type(&cfg->fc_dst);
1240
1241         if (addr_type & IPV6_ADDR_MULTICAST)
1242                 rt->dst.input = ip6_mc_input;
1243         else if (cfg->fc_flags & RTF_LOCAL)
1244                 rt->dst.input = ip6_input;
1245         else
1246                 rt->dst.input = ip6_forward;
1247
1248         rt->dst.output = ip6_output;
1249
1250         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1251         rt->rt6i_dst.plen = cfg->fc_dst_len;
1252         if (rt->rt6i_dst.plen == 128)
1253                rt->dst.flags |= DST_HOST;
1254
1255 #ifdef CONFIG_IPV6_SUBTREES
1256         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1257         rt->rt6i_src.plen = cfg->fc_src_len;
1258 #endif
1259
1260         rt->rt6i_metric = cfg->fc_metric;
1261
1262         /* We cannot add true routes via loopback here,
1263            they would result in kernel looping; promote them to reject routes
1264          */
1265         if ((cfg->fc_flags & RTF_REJECT) ||
1266             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1267                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1268                 /* hold loopback dev/idev if we haven't done so. */
1269                 if (dev != net->loopback_dev) {
1270                         if (dev) {
1271                                 dev_put(dev);
1272                                 in6_dev_put(idev);
1273                         }
1274                         dev = net->loopback_dev;
1275                         dev_hold(dev);
1276                         idev = in6_dev_get(dev);
1277                         if (!idev) {
1278                                 err = -ENODEV;
1279                                 goto out;
1280                         }
1281                 }
1282                 rt->dst.output = ip6_pkt_discard_out;
1283                 rt->dst.input = ip6_pkt_discard;
1284                 rt->dst.error = -ENETUNREACH;
1285                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1286                 goto install_route;
1287         }
1288
1289         if (cfg->fc_flags & RTF_GATEWAY) {
1290                 const struct in6_addr *gw_addr;
1291                 int gwa_type;
1292
1293                 gw_addr = &cfg->fc_gateway;
1294                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1295                 gwa_type = ipv6_addr_type(gw_addr);
1296
1297                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1298                         struct rt6_info *grt;
1299
1300                         /* IPv6 strictly inhibits using not link-local
1301                            addresses as nexthop address.
1302                            Otherwise, router will not able to send redirects.
1303                            It is very good, but in some (rare!) circumstances
1304                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1305                            some exceptions. --ANK
1306                          */
1307                         err = -EINVAL;
1308                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1309                                 goto out;
1310
1311                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1312
1313                         err = -EHOSTUNREACH;
1314                         if (grt == NULL)
1315                                 goto out;
1316                         if (dev) {
1317                                 if (dev != grt->rt6i_dev) {
1318                                         dst_release(&grt->dst);
1319                                         goto out;
1320                                 }
1321                         } else {
1322                                 dev = grt->rt6i_dev;
1323                                 idev = grt->rt6i_idev;
1324                                 dev_hold(dev);
1325                                 in6_dev_hold(grt->rt6i_idev);
1326                         }
1327                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1328                                 err = 0;
1329                         dst_release(&grt->dst);
1330
1331                         if (err)
1332                                 goto out;
1333                 }
1334                 err = -EINVAL;
1335                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1336                         goto out;
1337         }
1338
1339         err = -ENODEV;
1340         if (dev == NULL)
1341                 goto out;
1342
1343         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1344                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1345                         err = -EINVAL;
1346                         goto out;
1347                 }
1348                 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1349                 rt->rt6i_prefsrc.plen = 128;
1350         } else
1351                 rt->rt6i_prefsrc.plen = 0;
1352
1353         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1354                 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1355                 if (IS_ERR(n)) {
1356                         err = PTR_ERR(n);
1357                         goto out;
1358                 }
1359                 dst_set_neighbour(&rt->dst, n);
1360         }
1361
1362         rt->rt6i_flags = cfg->fc_flags;
1363
1364 install_route:
1365         if (cfg->fc_mx) {
1366                 struct nlattr *nla;
1367                 int remaining;
1368
1369                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1370                         int type = nla_type(nla);
1371
1372                         if (type) {
1373                                 if (type > RTAX_MAX) {
1374                                         err = -EINVAL;
1375                                         goto out;
1376                                 }
1377
1378                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1379                         }
1380                 }
1381         }
1382
1383         rt->dst.dev = dev;
1384         rt->rt6i_idev = idev;
1385         rt->rt6i_table = table;
1386
1387         cfg->fc_nlinfo.nl_net = dev_net(dev);
1388
1389         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1390
1391 out:
1392         if (dev)
1393                 dev_put(dev);
1394         if (idev)
1395                 in6_dev_put(idev);
1396         if (rt)
1397                 dst_free(&rt->dst);
1398         return err;
1399 }
1400
1401 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1402 {
1403         int err;
1404         struct fib6_table *table;
1405         struct net *net = dev_net(rt->rt6i_dev);
1406
1407         if (rt == net->ipv6.ip6_null_entry)
1408                 return -ENOENT;
1409
1410         table = rt->rt6i_table;
1411         write_lock_bh(&table->tb6_lock);
1412
1413         err = fib6_del(rt, info);
1414         dst_release(&rt->dst);
1415
1416         write_unlock_bh(&table->tb6_lock);
1417
1418         return err;
1419 }
1420
1421 int ip6_del_rt(struct rt6_info *rt)
1422 {
1423         struct nl_info info = {
1424                 .nl_net = dev_net(rt->rt6i_dev),
1425         };
1426         return __ip6_del_rt(rt, &info);
1427 }
1428
1429 static int ip6_route_del(struct fib6_config *cfg)
1430 {
1431         struct fib6_table *table;
1432         struct fib6_node *fn;
1433         struct rt6_info *rt;
1434         int err = -ESRCH;
1435
1436         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1437         if (table == NULL)
1438                 return err;
1439
1440         read_lock_bh(&table->tb6_lock);
1441
1442         fn = fib6_locate(&table->tb6_root,
1443                          &cfg->fc_dst, cfg->fc_dst_len,
1444                          &cfg->fc_src, cfg->fc_src_len);
1445
1446         if (fn) {
1447                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1448                         if (cfg->fc_ifindex &&
1449                             (rt->rt6i_dev == NULL ||
1450                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1451                                 continue;
1452                         if (cfg->fc_flags & RTF_GATEWAY &&
1453                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1454                                 continue;
1455                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1456                                 continue;
1457                         dst_hold(&rt->dst);
1458                         read_unlock_bh(&table->tb6_lock);
1459
1460                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1461                 }
1462         }
1463         read_unlock_bh(&table->tb6_lock);
1464
1465         return err;
1466 }
1467
1468 /*
1469  *      Handle redirects
1470  */
1471 struct ip6rd_flowi {
1472         struct flowi6 fl6;
1473         struct in6_addr gateway;
1474 };
1475
1476 static struct rt6_info *__ip6_route_redirect(struct net *net,
1477                                              struct fib6_table *table,
1478                                              struct flowi6 *fl6,
1479                                              int flags)
1480 {
1481         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1482         struct rt6_info *rt;
1483         struct fib6_node *fn;
1484
1485         /*
1486          * Get the "current" route for this destination and
1487          * check if the redirect has come from approriate router.
1488          *
1489          * RFC 2461 specifies that redirects should only be
1490          * accepted if they come from the nexthop to the target.
1491          * Due to the way the routes are chosen, this notion
1492          * is a bit fuzzy and one might need to check all possible
1493          * routes.
1494          */
1495
1496         read_lock_bh(&table->tb6_lock);
1497         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1498 restart:
1499         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1500                 /*
1501                  * Current route is on-link; redirect is always invalid.
1502                  *
1503                  * Seems, previous statement is not true. It could
1504                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1505                  * But then router serving it might decide, that we should
1506                  * know truth 8)8) --ANK (980726).
1507                  */
1508                 if (rt6_check_expired(rt))
1509                         continue;
1510                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1511                         continue;
1512                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1513                         continue;
1514                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1515                         continue;
1516                 break;
1517         }
1518
1519         if (!rt)
1520                 rt = net->ipv6.ip6_null_entry;
1521         BACKTRACK(net, &fl6->saddr);
1522 out:
1523         dst_hold(&rt->dst);
1524
1525         read_unlock_bh(&table->tb6_lock);
1526
1527         return rt;
1528 };
1529
1530 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1531                                            const struct in6_addr *src,
1532                                            const struct in6_addr *gateway,
1533                                            struct net_device *dev)
1534 {
1535         int flags = RT6_LOOKUP_F_HAS_SADDR;
1536         struct net *net = dev_net(dev);
1537         struct ip6rd_flowi rdfl = {
1538                 .fl6 = {
1539                         .flowi6_oif = dev->ifindex,
1540                         .daddr = *dest,
1541                         .saddr = *src,
1542                 },
1543         };
1544
1545         ipv6_addr_copy(&rdfl.gateway, gateway);
1546
1547         if (rt6_need_strict(dest))
1548                 flags |= RT6_LOOKUP_F_IFACE;
1549
1550         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1551                                                    flags, __ip6_route_redirect);
1552 }
1553
1554 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1555                   const struct in6_addr *saddr,
1556                   struct neighbour *neigh, u8 *lladdr, int on_link)
1557 {
1558         struct rt6_info *rt, *nrt = NULL;
1559         struct netevent_redirect netevent;
1560         struct net *net = dev_net(neigh->dev);
1561
1562         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1563
1564         if (rt == net->ipv6.ip6_null_entry) {
1565                 if (net_ratelimit())
1566                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1567                                "for redirect target\n");
1568                 goto out;
1569         }
1570
1571         /*
1572          *      We have finally decided to accept it.
1573          */
1574
1575         neigh_update(neigh, lladdr, NUD_STALE,
1576                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1577                      NEIGH_UPDATE_F_OVERRIDE|
1578                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1579                                      NEIGH_UPDATE_F_ISROUTER))
1580                      );
1581
1582         /*
1583          * Redirect received -> path was valid.
1584          * Look, redirects are sent only in response to data packets,
1585          * so that this nexthop apparently is reachable. --ANK
1586          */
1587         dst_confirm(&rt->dst);
1588
1589         /* Duplicate redirect: silently ignore. */
1590         if (neigh == dst_get_neighbour(&rt->dst))
1591                 goto out;
1592
1593         nrt = ip6_rt_copy(rt, dest);
1594         if (nrt == NULL)
1595                 goto out;
1596
1597         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1598         if (on_link)
1599                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1600
1601         nrt->rt6i_dst.plen = 128;
1602         nrt->dst.flags |= DST_HOST;
1603
1604         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1605         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1606
1607         if (ip6_ins_rt(nrt))
1608                 goto out;
1609
1610         netevent.old = &rt->dst;
1611         netevent.new = &nrt->dst;
1612         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1613
1614         if (rt->rt6i_flags&RTF_CACHE) {
1615                 ip6_del_rt(rt);
1616                 return;
1617         }
1618
1619 out:
1620         dst_release(&rt->dst);
1621 }
1622
1623 /*
1624  *      Handle ICMP "packet too big" messages
1625  *      i.e. Path MTU discovery
1626  */
1627
1628 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1629                              struct net *net, u32 pmtu, int ifindex)
1630 {
1631         struct rt6_info *rt, *nrt;
1632         int allfrag = 0;
1633 again:
1634         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1635         if (rt == NULL)
1636                 return;
1637
1638         if (rt6_check_expired(rt)) {
1639                 ip6_del_rt(rt);
1640                 goto again;
1641         }
1642
1643         if (pmtu >= dst_mtu(&rt->dst))
1644                 goto out;
1645
1646         if (pmtu < IPV6_MIN_MTU) {
1647                 /*
1648                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1649                  * MTU (1280) and a fragment header should always be included
1650                  * after a node receiving Too Big message reporting PMTU is
1651                  * less than the IPv6 Minimum Link MTU.
1652                  */
1653                 pmtu = IPV6_MIN_MTU;
1654                 allfrag = 1;
1655         }
1656
1657         /* New mtu received -> path was valid.
1658            They are sent only in response to data packets,
1659            so that this nexthop apparently is reachable. --ANK
1660          */
1661         dst_confirm(&rt->dst);
1662
1663         /* Host route. If it is static, it would be better
1664            not to override it, but add new one, so that
1665            when cache entry will expire old pmtu
1666            would return automatically.
1667          */
1668         if (rt->rt6i_flags & RTF_CACHE) {
1669                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1670                 if (allfrag) {
1671                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1672                         features |= RTAX_FEATURE_ALLFRAG;
1673                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1674                 }
1675                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1676                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1677                 goto out;
1678         }
1679
1680         /* Network route.
1681            Two cases are possible:
1682            1. It is connected route. Action: COW
1683            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1684          */
1685         if (!dst_get_neighbour(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1686                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1687         else
1688                 nrt = rt6_alloc_clone(rt, daddr);
1689
1690         if (nrt) {
1691                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1692                 if (allfrag) {
1693                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1694                         features |= RTAX_FEATURE_ALLFRAG;
1695                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1696                 }
1697
1698                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1699                  * happened within 5 mins, the recommended timer is 10 mins.
1700                  * Here this route expiration time is set to ip6_rt_mtu_expires
1701                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1702                  * and detecting PMTU increase will be automatically happened.
1703                  */
1704                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1705                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1706
1707                 ip6_ins_rt(nrt);
1708         }
1709 out:
1710         dst_release(&rt->dst);
1711 }
1712
1713 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1714                         struct net_device *dev, u32 pmtu)
1715 {
1716         struct net *net = dev_net(dev);
1717
1718         /*
1719          * RFC 1981 states that a node "MUST reduce the size of the packets it
1720          * is sending along the path" that caused the Packet Too Big message.
1721          * Since it's not possible in the general case to determine which
1722          * interface was used to send the original packet, we update the MTU
1723          * on the interface that will be used to send future packets. We also
1724          * update the MTU on the interface that received the Packet Too Big in
1725          * case the original packet was forced out that interface with
1726          * SO_BINDTODEVICE or similar. This is the next best thing to the
1727          * correct behaviour, which would be to update the MTU on all
1728          * interfaces.
1729          */
1730         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1731         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1732 }
1733
1734 /*
1735  *      Misc support functions
1736  */
1737
1738 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1739                                     const struct in6_addr *dest)
1740 {
1741         struct net *net = dev_net(ort->rt6i_dev);
1742         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1743                                             ort->dst.dev, 0);
1744
1745         if (rt) {
1746                 rt->dst.input = ort->dst.input;
1747                 rt->dst.output = ort->dst.output;
1748
1749                 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1750                 rt->rt6i_dst.plen = ort->rt6i_dst.plen;
1751                 dst_copy_metrics(&rt->dst, &ort->dst);
1752                 rt->dst.error = ort->dst.error;
1753                 rt->rt6i_idev = ort->rt6i_idev;
1754                 if (rt->rt6i_idev)
1755                         in6_dev_hold(rt->rt6i_idev);
1756                 rt->dst.lastuse = jiffies;
1757                 rt->rt6i_expires = 0;
1758
1759                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1760                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1761                 rt->rt6i_metric = 0;
1762
1763 #ifdef CONFIG_IPV6_SUBTREES
1764                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1765 #endif
1766                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1767                 rt->rt6i_table = ort->rt6i_table;
1768         }
1769         return rt;
1770 }
1771
1772 #ifdef CONFIG_IPV6_ROUTE_INFO
1773 static struct rt6_info *rt6_get_route_info(struct net *net,
1774                                            const struct in6_addr *prefix, int prefixlen,
1775                                            const struct in6_addr *gwaddr, int ifindex)
1776 {
1777         struct fib6_node *fn;
1778         struct rt6_info *rt = NULL;
1779         struct fib6_table *table;
1780
1781         table = fib6_get_table(net, RT6_TABLE_INFO);
1782         if (table == NULL)
1783                 return NULL;
1784
1785         write_lock_bh(&table->tb6_lock);
1786         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1787         if (!fn)
1788                 goto out;
1789
1790         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1791                 if (rt->rt6i_dev->ifindex != ifindex)
1792                         continue;
1793                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1794                         continue;
1795                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1796                         continue;
1797                 dst_hold(&rt->dst);
1798                 break;
1799         }
1800 out:
1801         write_unlock_bh(&table->tb6_lock);
1802         return rt;
1803 }
1804
1805 static struct rt6_info *rt6_add_route_info(struct net *net,
1806                                            const struct in6_addr *prefix, int prefixlen,
1807                                            const struct in6_addr *gwaddr, int ifindex,
1808                                            unsigned pref)
1809 {
1810         struct fib6_config cfg = {
1811                 .fc_table       = RT6_TABLE_INFO,
1812                 .fc_metric      = IP6_RT_PRIO_USER,
1813                 .fc_ifindex     = ifindex,
1814                 .fc_dst_len     = prefixlen,
1815                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1816                                   RTF_UP | RTF_PREF(pref),
1817                 .fc_nlinfo.pid = 0,
1818                 .fc_nlinfo.nlh = NULL,
1819                 .fc_nlinfo.nl_net = net,
1820         };
1821
1822         ipv6_addr_copy(&cfg.fc_dst, prefix);
1823         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1824
1825         /* We should treat it as a default route if prefix length is 0. */
1826         if (!prefixlen)
1827                 cfg.fc_flags |= RTF_DEFAULT;
1828
1829         ip6_route_add(&cfg);
1830
1831         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1832 }
1833 #endif
1834
1835 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1836 {
1837         struct rt6_info *rt;
1838         struct fib6_table *table;
1839
1840         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1841         if (table == NULL)
1842                 return NULL;
1843
1844         write_lock_bh(&table->tb6_lock);
1845         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1846                 if (dev == rt->rt6i_dev &&
1847                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1848                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1849                         break;
1850         }
1851         if (rt)
1852                 dst_hold(&rt->dst);
1853         write_unlock_bh(&table->tb6_lock);
1854         return rt;
1855 }
1856
1857 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1858                                      struct net_device *dev,
1859                                      unsigned int pref)
1860 {
1861         struct fib6_config cfg = {
1862                 .fc_table       = RT6_TABLE_DFLT,
1863                 .fc_metric      = IP6_RT_PRIO_USER,
1864                 .fc_ifindex     = dev->ifindex,
1865                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1866                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1867                 .fc_nlinfo.pid = 0,
1868                 .fc_nlinfo.nlh = NULL,
1869                 .fc_nlinfo.nl_net = dev_net(dev),
1870         };
1871
1872         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1873
1874         ip6_route_add(&cfg);
1875
1876         return rt6_get_dflt_router(gwaddr, dev);
1877 }
1878
1879 void rt6_purge_dflt_routers(struct net *net)
1880 {
1881         struct rt6_info *rt;
1882         struct fib6_table *table;
1883
1884         /* NOTE: Keep consistent with rt6_get_dflt_router */
1885         table = fib6_get_table(net, RT6_TABLE_DFLT);
1886         if (table == NULL)
1887                 return;
1888
1889 restart:
1890         read_lock_bh(&table->tb6_lock);
1891         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1892                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1893                         dst_hold(&rt->dst);
1894                         read_unlock_bh(&table->tb6_lock);
1895                         ip6_del_rt(rt);
1896                         goto restart;
1897                 }
1898         }
1899         read_unlock_bh(&table->tb6_lock);
1900 }
1901
1902 static void rtmsg_to_fib6_config(struct net *net,
1903                                  struct in6_rtmsg *rtmsg,
1904                                  struct fib6_config *cfg)
1905 {
1906         memset(cfg, 0, sizeof(*cfg));
1907
1908         cfg->fc_table = RT6_TABLE_MAIN;
1909         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1910         cfg->fc_metric = rtmsg->rtmsg_metric;
1911         cfg->fc_expires = rtmsg->rtmsg_info;
1912         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1913         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1914         cfg->fc_flags = rtmsg->rtmsg_flags;
1915
1916         cfg->fc_nlinfo.nl_net = net;
1917
1918         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1919         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1920         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1921 }
1922
1923 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1924 {
1925         struct fib6_config cfg;
1926         struct in6_rtmsg rtmsg;
1927         int err;
1928
1929         switch(cmd) {
1930         case SIOCADDRT:         /* Add a route */
1931         case SIOCDELRT:         /* Delete a route */
1932                 if (!capable(CAP_NET_ADMIN))
1933                         return -EPERM;
1934                 err = copy_from_user(&rtmsg, arg,
1935                                      sizeof(struct in6_rtmsg));
1936                 if (err)
1937                         return -EFAULT;
1938
1939                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1940
1941                 rtnl_lock();
1942                 switch (cmd) {
1943                 case SIOCADDRT:
1944                         err = ip6_route_add(&cfg);
1945                         break;
1946                 case SIOCDELRT:
1947                         err = ip6_route_del(&cfg);
1948                         break;
1949                 default:
1950                         err = -EINVAL;
1951                 }
1952                 rtnl_unlock();
1953
1954                 return err;
1955         }
1956
1957         return -EINVAL;
1958 }
1959
1960 /*
1961  *      Drop the packet on the floor
1962  */
1963
1964 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1965 {
1966         int type;
1967         struct dst_entry *dst = skb_dst(skb);
1968         switch (ipstats_mib_noroutes) {
1969         case IPSTATS_MIB_INNOROUTES:
1970                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1971                 if (type == IPV6_ADDR_ANY) {
1972                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1973                                       IPSTATS_MIB_INADDRERRORS);
1974                         break;
1975                 }
1976                 /* FALLTHROUGH */
1977         case IPSTATS_MIB_OUTNOROUTES:
1978                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1979                               ipstats_mib_noroutes);
1980                 break;
1981         }
1982         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1983         kfree_skb(skb);
1984         return 0;
1985 }
1986
1987 static int ip6_pkt_discard(struct sk_buff *skb)
1988 {
1989         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1990 }
1991
1992 static int ip6_pkt_discard_out(struct sk_buff *skb)
1993 {
1994         skb->dev = skb_dst(skb)->dev;
1995         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1996 }
1997
1998 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1999
2000 static int ip6_pkt_prohibit(struct sk_buff *skb)
2001 {
2002         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2003 }
2004
2005 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2006 {
2007         skb->dev = skb_dst(skb)->dev;
2008         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2009 }
2010
2011 #endif
2012
2013 /*
2014  *      Allocate a dst for local (unicast / anycast) address.
2015  */
2016
2017 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2018                                     const struct in6_addr *addr,
2019                                     int anycast)
2020 {
2021         struct net *net = dev_net(idev->dev);
2022         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2023                                             net->loopback_dev, 0);
2024         struct neighbour *neigh;
2025
2026         if (rt == NULL) {
2027                 if (net_ratelimit())
2028                         pr_warning("IPv6:  Maximum number of routes reached,"
2029                                    " consider increasing route/max_size.\n");
2030                 return ERR_PTR(-ENOMEM);
2031         }
2032
2033         in6_dev_hold(idev);
2034
2035         rt->dst.flags |= DST_HOST;
2036         rt->dst.input = ip6_input;
2037         rt->dst.output = ip6_output;
2038         rt->rt6i_idev = idev;
2039         rt->dst.obsolete = -1;
2040
2041         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2042         if (anycast)
2043                 rt->rt6i_flags |= RTF_ANYCAST;
2044         else
2045                 rt->rt6i_flags |= RTF_LOCAL;
2046         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2047         if (IS_ERR(neigh)) {
2048                 dst_free(&rt->dst);
2049
2050                 return ERR_CAST(neigh);
2051         }
2052         dst_set_neighbour(&rt->dst, neigh);
2053
2054         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2055         rt->rt6i_dst.plen = 128;
2056         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2057
2058         atomic_set(&rt->dst.__refcnt, 1);
2059
2060         return rt;
2061 }
2062
2063 int ip6_route_get_saddr(struct net *net,
2064                         struct rt6_info *rt,
2065                         const struct in6_addr *daddr,
2066                         unsigned int prefs,
2067                         struct in6_addr *saddr)
2068 {
2069         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2070         int err = 0;
2071         if (rt->rt6i_prefsrc.plen)
2072                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2073         else
2074                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2075                                          daddr, prefs, saddr);
2076         return err;
2077 }
2078
/*
 * remove deleted ip from prefsrc entries
 *
 * Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace whose ip6_null_entry is left alone */
	struct in6_addr *addr;	/* address compared with each route's prefsrc */
};
2085
2086 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2087 {
2088         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2089         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2090         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2091
2092         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2093             rt != net->ipv6.ip6_null_entry &&
2094             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2095                 /* remove prefsrc entry */
2096                 rt->rt6i_prefsrc.plen = 0;
2097         }
2098         return 0;
2099 }
2100
2101 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2102 {
2103         struct net *net = dev_net(ifp->idev->dev);
2104         struct arg_dev_net_ip adni = {
2105                 .dev = ifp->idev->dev,
2106                 .net = net,
2107                 .addr = &ifp->addr,
2108         };
2109         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2110 }
2111
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace whose ip6_null_entry is left alone */
};
2116
2117 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2118 {
2119         const struct arg_dev_net *adn = arg;
2120         const struct net_device *dev = adn->dev;
2121
2122         if ((rt->rt6i_dev == dev || dev == NULL) &&
2123             rt != adn->net->ipv6.ip6_null_entry) {
2124                 RT6_TRACE("deleted by ifdown %p\n", rt);
2125                 return -1;
2126         }
2127         return 0;
2128 }
2129
2130 void rt6_ifdown(struct net *net, struct net_device *dev)
2131 {
2132         struct arg_dev_net adn = {
2133                 .dev = dev,
2134                 .net = net,
2135         };
2136
2137         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2138         icmp6_clean_all(fib6_ifdown, &adn);
2139 }
2140
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU value */
};
2146
2147 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2148 {
2149         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2150         struct inet6_dev *idev;
2151
2152         /* In IPv6 pmtu discovery is not optional,
2153            so that RTAX_MTU lock cannot disable it.
2154            We still use this lock to block changes
2155            caused by addrconf/ndisc.
2156         */
2157
2158         idev = __in6_dev_get(arg->dev);
2159         if (idev == NULL)
2160                 return 0;
2161
2162         /* For administrative MTU increase, there is no way to discover
2163            IPv6 PMTU increase, so PMTU increase should be updated here.
2164            Since RFC 1981 doesn't include administrative MTU increase
2165            update PMTU increase is a MUST. (i.e. jumbo frame)
2166          */
2167         /*
2168            If new MTU is less than route PMTU, this new MTU will be the
2169            lowest MTU in the path, update the route PMTU to reflect PMTU
2170            decreases; if new MTU is greater than route PMTU, and the
2171            old MTU is the lowest MTU in the path, update the route PMTU
2172            to reflect the increase. In this case if the other nodes' MTU
2173            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2174            PMTU discouvery.
2175          */
2176         if (rt->rt6i_dev == arg->dev &&
2177             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2178             (dst_mtu(&rt->dst) >= arg->mtu ||
2179              (dst_mtu(&rt->dst) < arg->mtu &&
2180               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2181                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2182         }
2183         return 0;
2184 }
2185
2186 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2187 {
2188         struct rt6_mtu_change_arg arg = {
2189                 .dev = dev,
2190                 .mtu = mtu,
2191         };
2192
2193         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2194 }
2195
2196 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2197         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2198         [RTA_OIF]               = { .type = NLA_U32 },
2199         [RTA_IIF]               = { .type = NLA_U32 },
2200         [RTA_PRIORITY]          = { .type = NLA_U32 },
2201         [RTA_METRICS]           = { .type = NLA_NESTED },
2202 };
2203
2204 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2205                               struct fib6_config *cfg)
2206 {
2207         struct rtmsg *rtm;
2208         struct nlattr *tb[RTA_MAX+1];
2209         int err;
2210
2211         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2212         if (err < 0)
2213                 goto errout;
2214
2215         err = -EINVAL;
2216         rtm = nlmsg_data(nlh);
2217         memset(cfg, 0, sizeof(*cfg));
2218
2219         cfg->fc_table = rtm->rtm_table;
2220         cfg->fc_dst_len = rtm->rtm_dst_len;
2221         cfg->fc_src_len = rtm->rtm_src_len;
2222         cfg->fc_flags = RTF_UP;
2223         cfg->fc_protocol = rtm->rtm_protocol;
2224
2225         if (rtm->rtm_type == RTN_UNREACHABLE)
2226                 cfg->fc_flags |= RTF_REJECT;
2227
2228         if (rtm->rtm_type == RTN_LOCAL)
2229                 cfg->fc_flags |= RTF_LOCAL;
2230
2231         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2232         cfg->fc_nlinfo.nlh = nlh;
2233         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2234
2235         if (tb[RTA_GATEWAY]) {
2236                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2237                 cfg->fc_flags |= RTF_GATEWAY;
2238         }
2239
2240         if (tb[RTA_DST]) {
2241                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2242
2243                 if (nla_len(tb[RTA_DST]) < plen)
2244                         goto errout;
2245
2246                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2247         }
2248
2249         if (tb[RTA_SRC]) {
2250                 int plen = (rtm->rtm_src_len + 7) >> 3;
2251
2252                 if (nla_len(tb[RTA_SRC]) < plen)
2253                         goto errout;
2254
2255                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2256         }
2257
2258         if (tb[RTA_PREFSRC])
2259                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2260
2261         if (tb[RTA_OIF])
2262                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2263
2264         if (tb[RTA_PRIORITY])
2265                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2266
2267         if (tb[RTA_METRICS]) {
2268                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2269                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2270         }
2271
2272         if (tb[RTA_TABLE])
2273                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2274
2275         err = 0;
2276 errout:
2277         return err;
2278 }
2279
2280 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2281 {
2282         struct fib6_config cfg;
2283         int err;
2284
2285         err = rtm_to_fib6_config(skb, nlh, &cfg);
2286         if (err < 0)
2287                 return err;
2288
2289         return ip6_route_del(&cfg);
2290 }
2291
2292 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2293 {
2294         struct fib6_config cfg;
2295         int err;
2296
2297         err = rtm_to_fib6_config(skb, nlh, &cfg);
2298         if (err < 0)
2299                 return err;
2300
2301         return ip6_route_add(&cfg);
2302 }
2303
2304 static inline size_t rt6_nlmsg_size(void)
2305 {
2306         return NLMSG_ALIGN(sizeof(struct rtmsg))
2307                + nla_total_size(16) /* RTA_SRC */
2308                + nla_total_size(16) /* RTA_DST */
2309                + nla_total_size(16) /* RTA_GATEWAY */
2310                + nla_total_size(16) /* RTA_PREFSRC */
2311                + nla_total_size(4) /* RTA_TABLE */
2312                + nla_total_size(4) /* RTA_IIF */
2313                + nla_total_size(4) /* RTA_OIF */
2314                + nla_total_size(4) /* RTA_PRIORITY */
2315                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2316                + nla_total_size(sizeof(struct rta_cacheinfo));
2317 }
2318
/*
 *	Append one rtnetlink route message of type @type describing @rt
 *	to @skb.
 *
 *	@net:	 namespace, passed on to ip6mr_get_route() and
 *		 ip6_route_get_saddr()
 *	@dst:	 if non-NULL, reported as RTA_DST with a prefix length of 128
 *		 instead of the route's own destination key
 *	@src:	 same for RTA_SRC (only with CONFIG_IPV6_SUBTREES)
 *	@iif:	 if non-zero, reported as RTA_IIF; with CONFIG_IPV6_MROUTE and
 *		 a multicast route destination ip6mr_get_route() is used
 *		 instead
 *	@pid, @seq, @flags: netlink header fields for nlmsg_put()
 *	@prefix: if non-zero, only routes flagged RTF_PREFIX_RT are reported
 *	@nowait: passed on to ip6mr_get_route()
 *
 *	Returns 1 when the route is skipped because of @prefix, -EMSGSIZE
 *	when the message could not be built, 0 when ip6mr_get_route()
 *	returned 0 with !@nowait, and otherwise the value of nlmsg_end().
 *
 *	The NLA_PUT*() macros contain no visible error handling; they rely
 *	on the nla_put_failure label below, which cancels the message.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	/* the table id goes both into the header and into RTA_TABLE */
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	NLA_PUT_U32(skb, RTA_TABLE, table);
	/* route type: reject > local flag > loopback device > unicast */
	if (rt->rt6i_flags&RTF_REJECT)
		rtm->rtm_type = RTN_UNREACHABLE;
	else if (rt->rt6i_flags&RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	/* the stored protocol is overridden for flagged routes */
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags&RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF)
		rtm->rtm_protocol = RTPROT_KERNEL;
	else if (rt->rt6i_flags&RTF_DEFAULT)
		rtm->rtm_protocol = RTPROT_RA;

	if (rt->rt6i_flags&RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit @dst is reported as a full /128 */
	if (dst) {
		NLA_PUT(skb, RTA_DST, 16, dst);
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		NLA_PUT(skb, RTA_SRC, 16, src);
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len)
		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					/* with nowait only -EMSGSIZE is fatal */
					if (err == -EMSGSIZE)
						goto nla_put_failure;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, iif);
	} else if (dst) {
		/* no input interface: report the source address for @dst */
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
	}

	/*
	 * NOTE(review): when iif == 0, dst != NULL and the route has a
	 * prefsrc, the branch above has already emitted RTA_PREFSRC, so
	 * the attribute appears twice - confirm this is acceptable.
	 */
	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
		NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
	}

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	/* the gateway is taken from the neighbour entry's key */
	if (dst_get_neighbour(&rt->dst))
		NLA_PUT(skb, RTA_GATEWAY, 16, &dst_get_neighbour(&rt->dst)->primary_key);

	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);

	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);

	/* remaining lifetime in jiffies, clamped to INT_MAX; 0 = no expiry */
	if (!(rt->rt6i_flags & RTF_EXPIRES))
		expires = 0;
	else if (rt->rt6i_expires - jiffies < INT_MAX)
		expires = rt->rt6i_expires - jiffies;
	else
		expires = INT_MAX;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
			       expires, rt->dst.error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	/* undo the partially built message */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2442
2443 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2444 {
2445         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2446         int prefix;
2447
2448         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2449                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2450                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2451         } else
2452                 prefix = 0;
2453
2454         return rt6_fill_node(arg->net,
2455                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2456                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2457                      prefix, 0, NLM_F_MULTI);
2458 }
2459
2460 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2461 {
2462         struct net *net = sock_net(in_skb->sk);
2463         struct nlattr *tb[RTA_MAX+1];
2464         struct rt6_info *rt;
2465         struct sk_buff *skb;
2466         struct rtmsg *rtm;
2467         struct flowi6 fl6;
2468         int err, iif = 0;
2469
2470         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2471         if (err < 0)
2472                 goto errout;
2473
2474         err = -EINVAL;
2475         memset(&fl6, 0, sizeof(fl6));
2476
2477         if (tb[RTA_SRC]) {
2478                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2479                         goto errout;
2480
2481                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2482         }
2483
2484         if (tb[RTA_DST]) {
2485                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2486                         goto errout;
2487
2488                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2489         }
2490
2491         if (tb[RTA_IIF])
2492                 iif = nla_get_u32(tb[RTA_IIF]);
2493
2494         if (tb[RTA_OIF])
2495                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2496
2497         if (iif) {
2498                 struct net_device *dev;
2499                 dev = __dev_get_by_index(net, iif);
2500                 if (!dev) {
2501                         err = -ENODEV;
2502                         goto errout;
2503                 }
2504         }
2505
2506         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2507         if (skb == NULL) {
2508                 err = -ENOBUFS;
2509                 goto errout;
2510         }
2511
2512         /* Reserve room for dummy headers, this skb can pass
2513            through good chunk of routing engine.
2514          */
2515         skb_reset_mac_header(skb);
2516         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2517
2518         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2519         skb_dst_set(skb, &rt->dst);
2520
2521         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2522                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2523                             nlh->nlmsg_seq, 0, 0, 0);
2524         if (err < 0) {
2525                 kfree_skb(skb);
2526                 goto errout;
2527         }
2528
2529         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2530 errout:
2531         return err;
2532 }
2533
2534 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2535 {
2536         struct sk_buff *skb;
2537         struct net *net = info->nl_net;
2538         u32 seq;
2539         int err;
2540
2541         err = -ENOBUFS;
2542         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2543
2544         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2545         if (skb == NULL)
2546                 goto errout;
2547
2548         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2549                                 event, info->pid, seq, 0, 0, 0);
2550         if (err < 0) {
2551                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2552                 WARN_ON(err == -EMSGSIZE);
2553                 kfree_skb(skb);
2554                 goto errout;
2555         }
2556         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2557                     info->nlh, gfp_any());
2558         return;
2559 errout:
2560         if (err < 0)
2561                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2562 }
2563
2564 static int ip6_route_dev_notify(struct notifier_block *this,
2565                                 unsigned long event, void *data)
2566 {
2567         struct net_device *dev = (struct net_device *)data;
2568         struct net *net = dev_net(dev);
2569
2570         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2571                 net->ipv6.ip6_null_entry->dst.dev = dev;
2572                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2573 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2574                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2575                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2576                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2577                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2578 #endif
2579         }
2580
2581         return NOTIFY_OK;
2582 }
2583
2584 /*
2585  *      /proc
2586  */
2587
2588 #ifdef CONFIG_PROC_FS
2589
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): no user of this structure is visible in this part of the
 * file (the handlers below write through seq_file) -- confirm whether it
 * is still needed.
 */
struct rt6_proc_arg {
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2598
2599 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2600 {
2601         struct seq_file *m = p_arg;
2602         struct neighbour *n;
2603
2604         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2605
2606 #ifdef CONFIG_IPV6_SUBTREES
2607         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2608 #else
2609         seq_puts(m, "00000000000000000000000000000000 00 ");
2610 #endif
2611         n = dst_get_neighbour(&rt->dst);
2612         if (n) {
2613                 seq_printf(m, "%pi6", n->primary_key);
2614         } else {
2615                 seq_puts(m, "00000000000000000000000000000000");
2616         }
2617         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2618                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2619                    rt->dst.__use, rt->rt6i_flags,
2620                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2621         return 0;
2622 }
2623
2624 static int ipv6_route_show(struct seq_file *m, void *v)
2625 {
2626         struct net *net = (struct net *)m->private;
2627         fib6_clean_all(net, rt6_info_route, 0, m);
2628         return 0;
2629 }
2630
2631 static int ipv6_route_open(struct inode *inode, struct file *file)
2632 {
2633         return single_open_net(inode, file, ipv6_route_show);
2634 }
2635
2636 static const struct file_operations ipv6_route_proc_fops = {
2637         .owner          = THIS_MODULE,
2638         .open           = ipv6_route_open,
2639         .read           = seq_read,
2640         .llseek         = seq_lseek,
2641         .release        = single_release_net,
2642 };
2643
2644 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2645 {
2646         struct net *net = (struct net *)seq->private;
2647         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2648                    net->ipv6.rt6_stats->fib_nodes,
2649                    net->ipv6.rt6_stats->fib_route_nodes,
2650                    net->ipv6.rt6_stats->fib_rt_alloc,
2651                    net->ipv6.rt6_stats->fib_rt_entries,
2652                    net->ipv6.rt6_stats->fib_rt_cache,
2653                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2654                    net->ipv6.rt6_stats->fib_discarded_routes);
2655
2656         return 0;
2657 }
2658
2659 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2660 {
2661         return single_open_net(inode, file, rt6_stats_seq_show);
2662 }
2663
2664 static const struct file_operations rt6_stats_seq_fops = {
2665         .owner   = THIS_MODULE,
2666         .open    = rt6_stats_seq_open,
2667         .read    = seq_read,
2668         .llseek  = seq_lseek,
2669         .release = single_release_net,
2670 };
2671 #endif  /* CONFIG_PROC_FS */
2672
2673 #ifdef CONFIG_SYSCTL
2674
2675 static
2676 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2677                               void __user *buffer, size_t *lenp, loff_t *ppos)
2678 {
2679         struct net *net;
2680         int delay;
2681         if (!write)
2682                 return -EINVAL;
2683
2684         net = (struct net *)ctl->extra1;
2685         delay = net->ipv6.sysctl.flush_delay;
2686         proc_dointvec(ctl, write, buffer, lenp, ppos);
2687         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2688         return 0;
2689 }
2690
2691 ctl_table ipv6_route_table_template[] = {
2692         {
2693                 .procname       =       "flush",
2694                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2695                 .maxlen         =       sizeof(int),
2696                 .mode           =       0200,
2697                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2698         },
2699         {
2700                 .procname       =       "gc_thresh",
2701                 .data           =       &ip6_dst_ops_template.gc_thresh,
2702                 .maxlen         =       sizeof(int),
2703                 .mode           =       0644,
2704                 .proc_handler   =       proc_dointvec,
2705         },
2706         {
2707                 .procname       =       "max_size",
2708                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2709                 .maxlen         =       sizeof(int),
2710                 .mode           =       0644,
2711                 .proc_handler   =       proc_dointvec,
2712         },
2713         {
2714                 .procname       =       "gc_min_interval",
2715                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2716                 .maxlen         =       sizeof(int),
2717                 .mode           =       0644,
2718                 .proc_handler   =       proc_dointvec_jiffies,
2719         },
2720         {
2721                 .procname       =       "gc_timeout",
2722                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2723                 .maxlen         =       sizeof(int),
2724                 .mode           =       0644,
2725                 .proc_handler   =       proc_dointvec_jiffies,
2726         },
2727         {
2728                 .procname       =       "gc_interval",
2729                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2730                 .maxlen         =       sizeof(int),
2731                 .mode           =       0644,
2732                 .proc_handler   =       proc_dointvec_jiffies,
2733         },
2734         {
2735                 .procname       =       "gc_elasticity",
2736                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2737                 .maxlen         =       sizeof(int),
2738                 .mode           =       0644,
2739                 .proc_handler   =       proc_dointvec,
2740         },
2741         {
2742                 .procname       =       "mtu_expires",
2743                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2744                 .maxlen         =       sizeof(int),
2745                 .mode           =       0644,
2746                 .proc_handler   =       proc_dointvec_jiffies,
2747         },
2748         {
2749                 .procname       =       "min_adv_mss",
2750                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2751                 .maxlen         =       sizeof(int),
2752                 .mode           =       0644,
2753                 .proc_handler   =       proc_dointvec,
2754         },
2755         {
2756                 .procname       =       "gc_min_interval_ms",
2757                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2758                 .maxlen         =       sizeof(int),
2759                 .mode           =       0644,
2760                 .proc_handler   =       proc_dointvec_ms_jiffies,
2761         },
2762         { }
2763 };
2764
2765 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2766 {
2767         struct ctl_table *table;
2768
2769         table = kmemdup(ipv6_route_table_template,
2770                         sizeof(ipv6_route_table_template),
2771                         GFP_KERNEL);
2772
2773         if (table) {
2774                 table[0].data = &net->ipv6.sysctl.flush_delay;
2775                 table[0].extra1 = net;
2776                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2777                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2778                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2779                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2780                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2781                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2782                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2783                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2784                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2785         }
2786
2787         return table;
2788 }
2789 #endif
2790
2791 static int __net_init ip6_route_net_init(struct net *net)
2792 {
2793         int ret = -ENOMEM;
2794
2795         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2796                sizeof(net->ipv6.ip6_dst_ops));
2797
2798         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2799                 goto out_ip6_dst_ops;
2800
2801         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2802                                            sizeof(*net->ipv6.ip6_null_entry),
2803                                            GFP_KERNEL);
2804         if (!net->ipv6.ip6_null_entry)
2805                 goto out_ip6_dst_entries;
2806         net->ipv6.ip6_null_entry->dst.path =
2807                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2808         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2809         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2810                          ip6_template_metrics, true);
2811
2812 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2813         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2814                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2815                                                GFP_KERNEL);
2816         if (!net->ipv6.ip6_prohibit_entry)
2817                 goto out_ip6_null_entry;
2818         net->ipv6.ip6_prohibit_entry->dst.path =
2819                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2820         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2821         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2822                          ip6_template_metrics, true);
2823
2824         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2825                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2826                                                GFP_KERNEL);
2827         if (!net->ipv6.ip6_blk_hole_entry)
2828                 goto out_ip6_prohibit_entry;
2829         net->ipv6.ip6_blk_hole_entry->dst.path =
2830                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2831         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2832         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2833                          ip6_template_metrics, true);
2834 #endif
2835
2836         net->ipv6.sysctl.flush_delay = 0;
2837         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2838         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2839         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2840         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2841         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2842         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2843         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2844
2845 #ifdef CONFIG_PROC_FS
2846         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2847         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2848 #endif
2849         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2850
2851         ret = 0;
2852 out:
2853         return ret;
2854
2855 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2856 out_ip6_prohibit_entry:
2857         kfree(net->ipv6.ip6_prohibit_entry);
2858 out_ip6_null_entry:
2859         kfree(net->ipv6.ip6_null_entry);
2860 #endif
2861 out_ip6_dst_entries:
2862         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2863 out_ip6_dst_ops:
2864         goto out;
2865 }
2866
2867 static void __net_exit ip6_route_net_exit(struct net *net)
2868 {
2869 #ifdef CONFIG_PROC_FS
2870         proc_net_remove(net, "ipv6_route");
2871         proc_net_remove(net, "rt6_stats");
2872 #endif
2873         kfree(net->ipv6.ip6_null_entry);
2874 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2875         kfree(net->ipv6.ip6_prohibit_entry);
2876         kfree(net->ipv6.ip6_blk_hole_entry);
2877 #endif
2878         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2879 }
2880
2881 static struct pernet_operations ip6_route_net_ops = {
2882         .init = ip6_route_net_init,
2883         .exit = ip6_route_net_exit,
2884 };
2885
2886 static struct notifier_block ip6_route_dev_notifier = {
2887         .notifier_call = ip6_route_dev_notify,
2888         .priority = 0,
2889 };
2890
2891 int __init ip6_route_init(void)
2892 {
2893         int ret;
2894
2895         ret = -ENOMEM;
2896         ip6_dst_ops_template.kmem_cachep =
2897                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2898                                   SLAB_HWCACHE_ALIGN, NULL);
2899         if (!ip6_dst_ops_template.kmem_cachep)
2900                 goto out;
2901
2902         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2903         if (ret)
2904                 goto out_kmem_cache;
2905
2906         ret = register_pernet_subsys(&ip6_route_net_ops);
2907         if (ret)
2908                 goto out_dst_entries;
2909
2910         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2911
2912         /* Registering of the loopback is done before this portion of code,
2913          * the loopback reference in rt6_info will not be taken, do it
2914          * manually for init_net */
2915         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2916         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2917   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2918         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2919         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2920         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2921         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2922   #endif
2923         ret = fib6_init();
2924         if (ret)
2925                 goto out_register_subsys;
2926
2927         ret = xfrm6_init();
2928         if (ret)
2929                 goto out_fib6_init;
2930
2931         ret = fib6_rules_init();
2932         if (ret)
2933                 goto xfrm6_init;
2934
2935         ret = -ENOBUFS;
2936         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2937             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2938             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2939                 goto fib6_rules_init;
2940
2941         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2942         if (ret)
2943                 goto fib6_rules_init;
2944
2945 out:
2946         return ret;
2947
2948 fib6_rules_init:
2949         fib6_rules_cleanup();
2950 xfrm6_init:
2951         xfrm6_fini();
2952 out_fib6_init:
2953         fib6_gc_cleanup();
2954 out_register_subsys:
2955         unregister_pernet_subsys(&ip6_route_net_ops);
2956 out_dst_entries:
2957         dst_entries_destroy(&ip6_dst_blackhole_ops);
2958 out_kmem_cache:
2959         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2960         goto out;
2961 }
2962
2963 void ip6_route_cleanup(void)
2964 {
2965         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2966         fib6_rules_cleanup();
2967         xfrm6_fini();
2968         fib6_gc_cleanup();
2969         unregister_pernet_subsys(&ip6_route_net_ops);
2970         dst_entries_destroy(&ip6_dst_blackhole_ops);
2971         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2972 }