Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG()
 * and RT6_TRACE() expand to printk() calls; below that RDBG() expands
 * to nothing and RT6_TRACE() to an empty statement.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74
75 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
76                                     const struct in6_addr *dest);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
79 static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void             ip6_dst_destroy(struct dst_entry *);
82 static void             ip6_dst_ifdown(struct dst_entry *,
83                                        struct net_device *dev, int how);
84 static int               ip6_dst_gc(struct dst_ops *ops);
85
86 static int              ip6_pkt_discard(struct sk_buff *skb);
87 static int              ip6_pkt_discard_out(struct sk_buff *skb);
88 static void             ip6_link_failure(struct sk_buff *skb);
89 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
90
91 #ifdef CONFIG_IPV6_ROUTE_INFO
92 static struct rt6_info *rt6_add_route_info(struct net *net,
93                                            const struct in6_addr *prefix, int prefixlen,
94                                            const struct in6_addr *gwaddr, int ifindex,
95                                            unsigned pref);
96 static struct rt6_info *rt6_get_route_info(struct net *net,
97                                            const struct in6_addr *prefix, int prefixlen,
98                                            const struct in6_addr *gwaddr, int ifindex);
99 #endif
100
101 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
102 {
103         struct rt6_info *rt = (struct rt6_info *) dst;
104         struct inet_peer *peer;
105         u32 *p = NULL;
106
107         if (!rt->rt6i_peer)
108                 rt6_bind_peer(rt, 1);
109
110         peer = rt->rt6i_peer;
111         if (peer) {
112                 u32 *old_p = __DST_METRICS_PTR(old);
113                 unsigned long prev, new;
114
115                 p = peer->metrics;
116                 if (inet_metrics_new(peer))
117                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
118
119                 new = (unsigned long) p;
120                 prev = cmpxchg(&dst->_metrics, old, new);
121
122                 if (prev != old) {
123                         p = __DST_METRICS_PTR(prev);
124                         if (prev & DST_METRICS_READ_ONLY)
125                                 p = NULL;
126                 }
127         }
128         return p;
129 }
130
131 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
132 {
133         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
134 }
135
136 static struct dst_ops ip6_dst_ops_template = {
137         .family                 =       AF_INET6,
138         .protocol               =       cpu_to_be16(ETH_P_IPV6),
139         .gc                     =       ip6_dst_gc,
140         .gc_thresh              =       1024,
141         .check                  =       ip6_dst_check,
142         .default_advmss         =       ip6_default_advmss,
143         .default_mtu            =       ip6_default_mtu,
144         .cow_metrics            =       ipv6_cow_metrics,
145         .destroy                =       ip6_dst_destroy,
146         .ifdown                 =       ip6_dst_ifdown,
147         .negative_advice        =       ip6_negative_advice,
148         .link_failure           =       ip6_link_failure,
149         .update_pmtu            =       ip6_rt_update_pmtu,
150         .local_out              =       __ip6_local_out,
151         .neigh_lookup           =       ip6_neigh_lookup,
152 };
153
/* default_mtu hook of ip6_dst_blackhole_ops: always reports 0. */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
	const unsigned int mtu = 0;

	return mtu;
}
158
159 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
160 {
161 }
162
163 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
164                                          unsigned long old)
165 {
166         return NULL;
167 }
168
169 static struct dst_ops ip6_dst_blackhole_ops = {
170         .family                 =       AF_INET6,
171         .protocol               =       cpu_to_be16(ETH_P_IPV6),
172         .destroy                =       ip6_dst_destroy,
173         .check                  =       ip6_dst_check,
174         .default_mtu            =       ip6_blackhole_default_mtu,
175         .default_advmss         =       ip6_default_advmss,
176         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
177         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
178         .neigh_lookup           =       ip6_neigh_lookup,
179 };
180
181 static const u32 ip6_template_metrics[RTAX_MAX] = {
182         [RTAX_HOPLIMIT - 1] = 255,
183 };
184
185 static struct rt6_info ip6_null_entry_template = {
186         .dst = {
187                 .__refcnt       = ATOMIC_INIT(1),
188                 .__use          = 1,
189                 .obsolete       = -1,
190                 .error          = -ENETUNREACH,
191                 .input          = ip6_pkt_discard,
192                 .output         = ip6_pkt_discard_out,
193         },
194         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
195         .rt6i_protocol  = RTPROT_KERNEL,
196         .rt6i_metric    = ~(u32) 0,
197         .rt6i_ref       = ATOMIC_INIT(1),
198 };
199
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route entry: same shape as the null entry
 * template, but it carries -EACCES and uses the ip6_pkt_prohibit*
 * handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.dst = {
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
		.error		= -EACCES,
		.obsolete	= -1,
		.__use		= 1,
		.__refcnt	= ATOMIC_INIT(1),
	},
};

/*
 * Template for the "blackhole" route entry: carries -EINVAL and hands
 * packets to dst_discard in both directions.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.dst = {
		.input		= dst_discard,
		.output		= dst_discard,
		.error		= -EINVAL,
		.obsolete	= -1,
		.__use		= 1,
		.__refcnt	= ATOMIC_INIT(1),
	},
};

#endif
236
237 /* allocate dst with ip6_dst_ops */
238 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
239                                              struct net_device *dev,
240                                              int flags)
241 {
242         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
243
244         memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
245
246         return rt;
247 }
248
249 static void ip6_dst_destroy(struct dst_entry *dst)
250 {
251         struct rt6_info *rt = (struct rt6_info *)dst;
252         struct inet6_dev *idev = rt->rt6i_idev;
253         struct inet_peer *peer = rt->rt6i_peer;
254
255         if (idev != NULL) {
256                 rt->rt6i_idev = NULL;
257                 in6_dev_put(idev);
258         }
259         if (peer) {
260                 rt->rt6i_peer = NULL;
261                 inet_putpeer(peer);
262         }
263 }
264
265 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
266
267 static u32 rt6_peer_genid(void)
268 {
269         return atomic_read(&__rt6_peer_genid);
270 }
271
272 void rt6_bind_peer(struct rt6_info *rt, int create)
273 {
274         struct inet_peer *peer;
275
276         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
277         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
278                 inet_putpeer(peer);
279         else
280                 rt->rt6i_peer_genid = rt6_peer_genid();
281 }
282
283 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
284                            int how)
285 {
286         struct rt6_info *rt = (struct rt6_info *)dst;
287         struct inet6_dev *idev = rt->rt6i_idev;
288         struct net_device *loopback_dev =
289                 dev_net(dev)->loopback_dev;
290
291         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
292                 struct inet6_dev *loopback_idev =
293                         in6_dev_get(loopback_dev);
294                 if (loopback_idev != NULL) {
295                         rt->rt6i_idev = loopback_idev;
296                         in6_dev_put(idev);
297                 }
298         }
299 }
300
301 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
302 {
303         return (rt->rt6i_flags & RTF_EXPIRES) &&
304                 time_after(jiffies, rt->rt6i_expires);
305 }
306
307 static inline int rt6_need_strict(const struct in6_addr *daddr)
308 {
309         return ipv6_addr_type(daddr) &
310                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
311 }
312
313 /*
314  *      Route lookup. Any table->tb6_lock is implied.
315  */
316
317 static inline struct rt6_info *rt6_device_match(struct net *net,
318                                                     struct rt6_info *rt,
319                                                     const struct in6_addr *saddr,
320                                                     int oif,
321                                                     int flags)
322 {
323         struct rt6_info *local = NULL;
324         struct rt6_info *sprt;
325
326         if (!oif && ipv6_addr_any(saddr))
327                 goto out;
328
329         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
330                 struct net_device *dev = sprt->rt6i_dev;
331
332                 if (oif) {
333                         if (dev->ifindex == oif)
334                                 return sprt;
335                         if (dev->flags & IFF_LOOPBACK) {
336                                 if (sprt->rt6i_idev == NULL ||
337                                     sprt->rt6i_idev->dev->ifindex != oif) {
338                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
339                                                 continue;
340                                         if (local && (!oif ||
341                                                       local->rt6i_idev->dev->ifindex == oif))
342                                                 continue;
343                                 }
344                                 local = sprt;
345                         }
346                 } else {
347                         if (ipv6_chk_addr(net, saddr, dev,
348                                           flags & RT6_LOOKUP_F_IFACE))
349                                 return sprt;
350                 }
351         }
352
353         if (oif) {
354                 if (local)
355                         return local;
356
357                 if (flags & RT6_LOOKUP_F_IFACE)
358                         return net->ipv6.ip6_null_entry;
359         }
360 out:
361         return rt;
362 }
363
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation towards the
 * neighbour attached to @rt when that neighbour is not in a NUD_VALID
 * state and more than rtr_probe_interval jiffies have passed since it
 * was last updated.
 *
 * @rt may be NULL, in which case nothing happens.
 *
 * neigh->updated is modified here, so the neighbour's lock is taken for
 * writing.  Holding only the read side would let several callers pass
 * the time check at once and each send a probe, defeating the rate
 * limit.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		/* Stamp before unlocking so concurrent callers back off. */
		neigh->updated = jiffies;
		write_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
403
404 /*
405  * Default Router Selection (RFC 2461 6.3.6)
406  */
407 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
408 {
409         struct net_device *dev = rt->rt6i_dev;
410         if (!oif || dev->ifindex == oif)
411                 return 2;
412         if ((dev->flags & IFF_LOOPBACK) &&
413             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
414                 return 1;
415         return 0;
416 }
417
418 static inline int rt6_check_neigh(struct rt6_info *rt)
419 {
420         struct neighbour *neigh;
421         int m;
422
423         rcu_read_lock();
424         neigh = dst_get_neighbour(&rt->dst);
425         if (rt->rt6i_flags & RTF_NONEXTHOP ||
426             !(rt->rt6i_flags & RTF_GATEWAY))
427                 m = 1;
428         else if (neigh) {
429                 read_lock_bh(&neigh->lock);
430                 if (neigh->nud_state & NUD_VALID)
431                         m = 2;
432 #ifdef CONFIG_IPV6_ROUTER_PREF
433                 else if (neigh->nud_state & NUD_FAILED)
434                         m = 0;
435 #endif
436                 else
437                         m = 1;
438                 read_unlock_bh(&neigh->lock);
439         } else
440                 m = 0;
441         rcu_read_unlock();
442         return m;
443 }
444
445 static int rt6_score_route(struct rt6_info *rt, int oif,
446                            int strict)
447 {
448         int m, n;
449
450         m = rt6_check_dev(rt, oif);
451         if (!m && (strict & RT6_LOOKUP_F_IFACE))
452                 return -1;
453 #ifdef CONFIG_IPV6_ROUTER_PREF
454         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
455 #endif
456         n = rt6_check_neigh(rt);
457         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
458                 return -1;
459         return m;
460 }
461
462 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
463                                    int *mpri, struct rt6_info *match)
464 {
465         int m;
466
467         if (rt6_check_expired(rt))
468                 goto out;
469
470         m = rt6_score_route(rt, oif, strict);
471         if (m < 0)
472                 goto out;
473
474         if (m > *mpri) {
475                 if (strict & RT6_LOOKUP_F_REACHABLE)
476                         rt6_probe(match);
477                 *mpri = m;
478                 match = rt;
479         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
480                 rt6_probe(rt);
481         }
482
483 out:
484         return match;
485 }
486
487 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
488                                      struct rt6_info *rr_head,
489                                      u32 metric, int oif, int strict)
490 {
491         struct rt6_info *rt, *match;
492         int mpri = -1;
493
494         match = NULL;
495         for (rt = rr_head; rt && rt->rt6i_metric == metric;
496              rt = rt->dst.rt6_next)
497                 match = find_match(rt, oif, strict, &mpri, match);
498         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
499              rt = rt->dst.rt6_next)
500                 match = find_match(rt, oif, strict, &mpri, match);
501
502         return match;
503 }
504
505 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
506 {
507         struct rt6_info *match, *rt0;
508         struct net *net;
509
510         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
511                   __func__, fn->leaf, oif);
512
513         rt0 = fn->rr_ptr;
514         if (!rt0)
515                 fn->rr_ptr = rt0 = fn->leaf;
516
517         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
518
519         if (!match &&
520             (strict & RT6_LOOKUP_F_REACHABLE)) {
521                 struct rt6_info *next = rt0->dst.rt6_next;
522
523                 /* no entries matched; do round-robin */
524                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
525                         next = fn->leaf;
526
527                 if (next != rt0)
528                         fn->rr_ptr = next;
529         }
530
531         RT6_TRACE("%s() => %p\n",
532                   __func__, match);
533
534         net = dev_net(rt0->rt6i_dev);
535         return match ? match : net->ipv6.ip6_null_entry;
536 }
537
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route information option received on @dev.
 *
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address used as gateway for the resulting route
 *
 * After validating the option, the matching route is looked up and then
 * deleted (lifetime 0), created (no route yet, non-zero lifetime) or
 * updated with the new preference.  The expiry of a surviving route is
 * refreshed from the lifetime.
 *
 * Returns 0 on success and -EINVAL for a malformed option or an invalid
 * preference value.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	else if (lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
611
/*
 * Backtracking step shared by the policy lookup functions.
 *
 * Relies on the enclosing function providing the locals 'rt' and 'fn'
 * and the labels 'out' and 'restart'.  When 'rt' is the namespace's null
 * entry, the macro climbs from 'fn' towards the root: it jumps to 'out'
 * on reaching a node flagged RTN_TL_ROOT, and to 'restart' as soon as it
 * reaches a node flagged RTN_RTINFO.  If the parent has a subtree other
 * than 'fn', that subtree is searched with @saddr instead of moving to
 * the parent itself.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == __net->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (!FIB6_SUBTREE(pn) || FIB6_SUBTREE(pn) == fn) \
				fn = pn; \
			else \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
629
630 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
631                                              struct fib6_table *table,
632                                              struct flowi6 *fl6, int flags)
633 {
634         struct fib6_node *fn;
635         struct rt6_info *rt;
636
637         read_lock_bh(&table->tb6_lock);
638         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
639 restart:
640         rt = fn->leaf;
641         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
642         BACKTRACK(net, &fl6->saddr);
643 out:
644         dst_use(&rt->dst, jiffies);
645         read_unlock_bh(&table->tb6_lock);
646         return rt;
647
648 }
649
650 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
651                             const struct in6_addr *saddr, int oif, int strict)
652 {
653         struct flowi6 fl6 = {
654                 .flowi6_oif = oif,
655                 .daddr = *daddr,
656         };
657         struct dst_entry *dst;
658         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
659
660         if (saddr) {
661                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
662                 flags |= RT6_LOOKUP_F_HAS_SADDR;
663         }
664
665         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
666         if (dst->error == 0)
667                 return (struct rt6_info *) dst;
668
669         dst_release(dst);
670
671         return NULL;
672 }
673
674 EXPORT_SYMBOL(rt6_lookup);
675
676 /* ip6_ins_rt is called with FREE table->tb6_lock.
677    It takes new route entry, the addition fails by any reason the
678    route is freed. In any case, if caller does not hold it, it may
679    be destroyed.
680  */
681
682 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
683 {
684         int err;
685         struct fib6_table *table;
686
687         table = rt->rt6i_table;
688         write_lock_bh(&table->tb6_lock);
689         err = fib6_add(&table->tb6_root, rt, info);
690         write_unlock_bh(&table->tb6_lock);
691
692         return err;
693 }
694
695 int ip6_ins_rt(struct rt6_info *rt)
696 {
697         struct nl_info info = {
698                 .nl_net = dev_net(rt->rt6i_dev),
699         };
700         return __ip6_ins_rt(rt, &info);
701 }
702
703 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
704                                       const struct in6_addr *daddr,
705                                       const struct in6_addr *saddr)
706 {
707         struct rt6_info *rt;
708
709         /*
710          *      Clone the route.
711          */
712
713         rt = ip6_rt_copy(ort, daddr);
714
715         if (rt) {
716                 struct neighbour *neigh;
717                 int attempts = !in_softirq();
718
719                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
720                         if (rt->rt6i_dst.plen != 128 &&
721                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
722                                 rt->rt6i_flags |= RTF_ANYCAST;
723                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
724                 }
725
726                 rt->rt6i_dst.plen = 128;
727                 rt->rt6i_flags |= RTF_CACHE;
728                 rt->dst.flags |= DST_HOST;
729
730 #ifdef CONFIG_IPV6_SUBTREES
731                 if (rt->rt6i_src.plen && saddr) {
732                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
733                         rt->rt6i_src.plen = 128;
734                 }
735 #endif
736
737         retry:
738                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
739                 if (IS_ERR(neigh)) {
740                         struct net *net = dev_net(rt->rt6i_dev);
741                         int saved_rt_min_interval =
742                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
743                         int saved_rt_elasticity =
744                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
745
746                         if (attempts-- > 0) {
747                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
748                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
749
750                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
751
752                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
753                                         saved_rt_elasticity;
754                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
755                                         saved_rt_min_interval;
756                                 goto retry;
757                         }
758
759                         if (net_ratelimit())
760                                 printk(KERN_WARNING
761                                        "ipv6: Neighbour table overflow.\n");
762                         dst_free(&rt->dst);
763                         return NULL;
764                 }
765                 dst_set_neighbour(&rt->dst, neigh);
766
767         }
768
769         return rt;
770 }
771
772 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
773                                         const struct in6_addr *daddr)
774 {
775         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
776
777         if (rt) {
778                 rt->rt6i_dst.plen = 128;
779                 rt->rt6i_flags |= RTF_CACHE;
780                 rt->dst.flags |= DST_HOST;
781                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
782         }
783         return rt;
784 }
785
786 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
787                                       struct flowi6 *fl6, int flags)
788 {
789         struct fib6_node *fn;
790         struct rt6_info *rt, *nrt;
791         int strict = 0;
792         int attempts = 3;
793         int err;
794         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
795
796         strict |= flags & RT6_LOOKUP_F_IFACE;
797
798 relookup:
799         read_lock_bh(&table->tb6_lock);
800
801 restart_2:
802         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
803
804 restart:
805         rt = rt6_select(fn, oif, strict | reachable);
806
807         BACKTRACK(net, &fl6->saddr);
808         if (rt == net->ipv6.ip6_null_entry ||
809             rt->rt6i_flags & RTF_CACHE)
810                 goto out;
811
812         dst_hold(&rt->dst);
813         read_unlock_bh(&table->tb6_lock);
814
815         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
816                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
817         else if (!(rt->dst.flags & DST_HOST))
818                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
819         else
820                 goto out2;
821
822         dst_release(&rt->dst);
823         rt = nrt ? : net->ipv6.ip6_null_entry;
824
825         dst_hold(&rt->dst);
826         if (nrt) {
827                 err = ip6_ins_rt(nrt);
828                 if (!err)
829                         goto out2;
830         }
831
832         if (--attempts <= 0)
833                 goto out2;
834
835         /*
836          * Race condition! In the gap, when table->tb6_lock was
837          * released someone could insert this route.  Relookup.
838          */
839         dst_release(&rt->dst);
840         goto relookup;
841
842 out:
843         if (reachable) {
844                 reachable = 0;
845                 goto restart_2;
846         }
847         dst_hold(&rt->dst);
848         read_unlock_bh(&table->tb6_lock);
849 out2:
850         rt->dst.lastuse = jiffies;
851         rt->dst.__use++;
852
853         return rt;
854 }
855
856 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
857                                             struct flowi6 *fl6, int flags)
858 {
859         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
860 }
861
862 void ip6_route_input(struct sk_buff *skb)
863 {
864         const struct ipv6hdr *iph = ipv6_hdr(skb);
865         struct net *net = dev_net(skb->dev);
866         int flags = RT6_LOOKUP_F_HAS_SADDR;
867         struct flowi6 fl6 = {
868                 .flowi6_iif = skb->dev->ifindex,
869                 .daddr = iph->daddr,
870                 .saddr = iph->saddr,
871                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
872                 .flowi6_mark = skb->mark,
873                 .flowi6_proto = iph->nexthdr,
874         };
875
876         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
877                 flags |= RT6_LOOKUP_F_IFACE;
878
879         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
880 }
881
882 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
883                                              struct flowi6 *fl6, int flags)
884 {
885         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
886 }
887
888 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
889                                     struct flowi6 *fl6)
890 {
891         int flags = 0;
892
893         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
894                 flags |= RT6_LOOKUP_F_IFACE;
895
896         if (!ipv6_addr_any(&fl6->saddr))
897                 flags |= RT6_LOOKUP_F_HAS_SADDR;
898         else if (sk)
899                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
900
901         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
902 }
903
904 EXPORT_SYMBOL(ip6_route_output);
905
906 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
907 {
908         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
909         struct dst_entry *new = NULL;
910
911         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
912         if (rt) {
913                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
914
915                 new = &rt->dst;
916
917                 new->__use = 1;
918                 new->input = dst_discard;
919                 new->output = dst_discard;
920
921                 if (dst_metrics_read_only(&ort->dst))
922                         new->_metrics = ort->dst._metrics;
923                 else
924                         dst_copy_metrics(new, &ort->dst);
925                 rt->rt6i_idev = ort->rt6i_idev;
926                 if (rt->rt6i_idev)
927                         in6_dev_hold(rt->rt6i_idev);
928                 rt->rt6i_expires = 0;
929
930                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
931                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
932                 rt->rt6i_metric = 0;
933
934                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
935 #ifdef CONFIG_IPV6_SUBTREES
936                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
937 #endif
938
939                 dst_free(new);
940         }
941
942         dst_release(dst_orig);
943         return new ? new : ERR_PTR(-ENOMEM);
944 }
945
946 /*
947  *      Destination cache support functions
948  */
949
950 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
951 {
952         struct rt6_info *rt;
953
954         rt = (struct rt6_info *) dst;
955
956         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
957                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
958                         if (!rt->rt6i_peer)
959                                 rt6_bind_peer(rt, 0);
960                         rt->rt6i_peer_genid = rt6_peer_genid();
961                 }
962                 return dst;
963         }
964         return NULL;
965 }
966
967 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
968 {
969         struct rt6_info *rt = (struct rt6_info *) dst;
970
971         if (rt) {
972                 if (rt->rt6i_flags & RTF_CACHE) {
973                         if (rt6_check_expired(rt)) {
974                                 ip6_del_rt(rt);
975                                 dst = NULL;
976                         }
977                 } else {
978                         dst_release(dst);
979                         dst = NULL;
980                 }
981         }
982         return dst;
983 }
984
985 static void ip6_link_failure(struct sk_buff *skb)
986 {
987         struct rt6_info *rt;
988
989         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
990
991         rt = (struct rt6_info *) skb_dst(skb);
992         if (rt) {
993                 if (rt->rt6i_flags&RTF_CACHE) {
994                         dst_set_expires(&rt->dst, 0);
995                         rt->rt6i_flags |= RTF_EXPIRES;
996                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
997                         rt->rt6i_node->fn_sernum = -1;
998         }
999 }
1000
1001 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1002 {
1003         struct rt6_info *rt6 = (struct rt6_info*)dst;
1004
1005         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1006                 rt6->rt6i_flags |= RTF_MODIFIED;
1007                 if (mtu < IPV6_MIN_MTU) {
1008                         u32 features = dst_metric(dst, RTAX_FEATURES);
1009                         mtu = IPV6_MIN_MTU;
1010                         features |= RTAX_FEATURE_ALLFRAG;
1011                         dst_metric_set(dst, RTAX_FEATURES, features);
1012                 }
1013                 dst_metric_set(dst, RTAX_MTU, mtu);
1014         }
1015 }
1016
1017 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1018 {
1019         struct net_device *dev = dst->dev;
1020         unsigned int mtu = dst_mtu(dst);
1021         struct net *net = dev_net(dev);
1022
1023         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1024
1025         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1026                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1027
1028         /*
1029          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1030          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1031          * IPV6_MAXPLEN is also valid and means: "any MSS,
1032          * rely only on pmtu discovery"
1033          */
1034         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1035                 mtu = IPV6_MAXPLEN;
1036         return mtu;
1037 }
1038
1039 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1040 {
1041         unsigned int mtu = IPV6_MIN_MTU;
1042         struct inet6_dev *idev;
1043
1044         rcu_read_lock();
1045         idev = __in6_dev_get(dst->dev);
1046         if (idev)
1047                 mtu = idev->cnf.mtu6;
1048         rcu_read_unlock();
1049
1050         return mtu;
1051 }
1052
1053 static struct dst_entry *icmp6_dst_gc_list;
1054 static DEFINE_SPINLOCK(icmp6_dst_lock);
1055
1056 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1057                                   struct neighbour *neigh,
1058                                   const struct in6_addr *addr)
1059 {
1060         struct rt6_info *rt;
1061         struct inet6_dev *idev = in6_dev_get(dev);
1062         struct net *net = dev_net(dev);
1063
1064         if (unlikely(idev == NULL))
1065                 return NULL;
1066
1067         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1068         if (unlikely(rt == NULL)) {
1069                 in6_dev_put(idev);
1070                 goto out;
1071         }
1072
1073         if (neigh)
1074                 neigh_hold(neigh);
1075         else {
1076                 neigh = ndisc_get_neigh(dev, addr);
1077                 if (IS_ERR(neigh))
1078                         neigh = NULL;
1079         }
1080
1081         rt->rt6i_idev     = idev;
1082         dst_set_neighbour(&rt->dst, neigh);
1083         atomic_set(&rt->dst.__refcnt, 1);
1084         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1085         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1086         rt->dst.output  = ip6_output;
1087
1088         spin_lock_bh(&icmp6_dst_lock);
1089         rt->dst.next = icmp6_dst_gc_list;
1090         icmp6_dst_gc_list = &rt->dst;
1091         spin_unlock_bh(&icmp6_dst_lock);
1092
1093         fib6_force_start_gc(net);
1094
1095 out:
1096         return &rt->dst;
1097 }
1098
1099 int icmp6_dst_gc(void)
1100 {
1101         struct dst_entry *dst, **pprev;
1102         int more = 0;
1103
1104         spin_lock_bh(&icmp6_dst_lock);
1105         pprev = &icmp6_dst_gc_list;
1106
1107         while ((dst = *pprev) != NULL) {
1108                 if (!atomic_read(&dst->__refcnt)) {
1109                         *pprev = dst->next;
1110                         dst_free(dst);
1111                 } else {
1112                         pprev = &dst->next;
1113                         ++more;
1114                 }
1115         }
1116
1117         spin_unlock_bh(&icmp6_dst_lock);
1118
1119         return more;
1120 }
1121
1122 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1123                             void *arg)
1124 {
1125         struct dst_entry *dst, **pprev;
1126
1127         spin_lock_bh(&icmp6_dst_lock);
1128         pprev = &icmp6_dst_gc_list;
1129         while ((dst = *pprev) != NULL) {
1130                 struct rt6_info *rt = (struct rt6_info *) dst;
1131                 if (func(rt, arg)) {
1132                         *pprev = dst->next;
1133                         dst_free(dst);
1134                 } else {
1135                         pprev = &dst->next;
1136                 }
1137         }
1138         spin_unlock_bh(&icmp6_dst_lock);
1139 }
1140
1141 static int ip6_dst_gc(struct dst_ops *ops)
1142 {
1143         unsigned long now = jiffies;
1144         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1145         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1146         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1147         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1148         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1149         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1150         int entries;
1151
1152         entries = dst_entries_get_fast(ops);
1153         if (time_after(rt_last_gc + rt_min_interval, now) &&
1154             entries <= rt_max_size)
1155                 goto out;
1156
1157         net->ipv6.ip6_rt_gc_expire++;
1158         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1159         net->ipv6.ip6_rt_last_gc = now;
1160         entries = dst_entries_get_slow(ops);
1161         if (entries < ops->gc_thresh)
1162                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1163 out:
1164         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1165         return entries > rt_max_size;
1166 }
1167
1168 /* Clean host part of a prefix. Not necessary in radix tree,
1169    but results in cleaner routing tables.
1170
1171    Remove it only when all the things will work!
1172  */
1173
1174 int ip6_dst_hoplimit(struct dst_entry *dst)
1175 {
1176         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1177         if (hoplimit == 0) {
1178                 struct net_device *dev = dst->dev;
1179                 struct inet6_dev *idev;
1180
1181                 rcu_read_lock();
1182                 idev = __in6_dev_get(dev);
1183                 if (idev)
1184                         hoplimit = idev->cnf.hop_limit;
1185                 else
1186                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1187                 rcu_read_unlock();
1188         }
1189         return hoplimit;
1190 }
1191 EXPORT_SYMBOL(ip6_dst_hoplimit);
1192
1193 /*
1194  *
1195  */
1196
1197 int ip6_route_add(struct fib6_config *cfg)
1198 {
1199         int err;
1200         struct net *net = cfg->fc_nlinfo.nl_net;
1201         struct rt6_info *rt = NULL;
1202         struct net_device *dev = NULL;
1203         struct inet6_dev *idev = NULL;
1204         struct fib6_table *table;
1205         int addr_type;
1206
1207         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1208                 return -EINVAL;
1209 #ifndef CONFIG_IPV6_SUBTREES
1210         if (cfg->fc_src_len)
1211                 return -EINVAL;
1212 #endif
1213         if (cfg->fc_ifindex) {
1214                 err = -ENODEV;
1215                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1216                 if (!dev)
1217                         goto out;
1218                 idev = in6_dev_get(dev);
1219                 if (!idev)
1220                         goto out;
1221         }
1222
1223         if (cfg->fc_metric == 0)
1224                 cfg->fc_metric = IP6_RT_PRIO_USER;
1225
1226         table = fib6_new_table(net, cfg->fc_table);
1227         if (table == NULL) {
1228                 err = -ENOBUFS;
1229                 goto out;
1230         }
1231
1232         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1233
1234         if (rt == NULL) {
1235                 err = -ENOMEM;
1236                 goto out;
1237         }
1238
1239         rt->dst.obsolete = -1;
1240         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1241                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1242                                 0;
1243
1244         if (cfg->fc_protocol == RTPROT_UNSPEC)
1245                 cfg->fc_protocol = RTPROT_BOOT;
1246         rt->rt6i_protocol = cfg->fc_protocol;
1247
1248         addr_type = ipv6_addr_type(&cfg->fc_dst);
1249
1250         if (addr_type & IPV6_ADDR_MULTICAST)
1251                 rt->dst.input = ip6_mc_input;
1252         else if (cfg->fc_flags & RTF_LOCAL)
1253                 rt->dst.input = ip6_input;
1254         else
1255                 rt->dst.input = ip6_forward;
1256
1257         rt->dst.output = ip6_output;
1258
1259         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1260         rt->rt6i_dst.plen = cfg->fc_dst_len;
1261         if (rt->rt6i_dst.plen == 128)
1262                rt->dst.flags |= DST_HOST;
1263
1264 #ifdef CONFIG_IPV6_SUBTREES
1265         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1266         rt->rt6i_src.plen = cfg->fc_src_len;
1267 #endif
1268
1269         rt->rt6i_metric = cfg->fc_metric;
1270
1271         /* We cannot add true routes via loopback here,
1272            they would result in kernel looping; promote them to reject routes
1273          */
1274         if ((cfg->fc_flags & RTF_REJECT) ||
1275             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1276                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1277                 /* hold loopback dev/idev if we haven't done so. */
1278                 if (dev != net->loopback_dev) {
1279                         if (dev) {
1280                                 dev_put(dev);
1281                                 in6_dev_put(idev);
1282                         }
1283                         dev = net->loopback_dev;
1284                         dev_hold(dev);
1285                         idev = in6_dev_get(dev);
1286                         if (!idev) {
1287                                 err = -ENODEV;
1288                                 goto out;
1289                         }
1290                 }
1291                 rt->dst.output = ip6_pkt_discard_out;
1292                 rt->dst.input = ip6_pkt_discard;
1293                 rt->dst.error = -ENETUNREACH;
1294                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1295                 goto install_route;
1296         }
1297
1298         if (cfg->fc_flags & RTF_GATEWAY) {
1299                 const struct in6_addr *gw_addr;
1300                 int gwa_type;
1301
1302                 gw_addr = &cfg->fc_gateway;
1303                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1304                 gwa_type = ipv6_addr_type(gw_addr);
1305
1306                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1307                         struct rt6_info *grt;
1308
1309                         /* IPv6 strictly inhibits using not link-local
1310                            addresses as nexthop address.
1311                            Otherwise, router will not able to send redirects.
1312                            It is very good, but in some (rare!) circumstances
1313                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1314                            some exceptions. --ANK
1315                          */
1316                         err = -EINVAL;
1317                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1318                                 goto out;
1319
1320                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1321
1322                         err = -EHOSTUNREACH;
1323                         if (grt == NULL)
1324                                 goto out;
1325                         if (dev) {
1326                                 if (dev != grt->rt6i_dev) {
1327                                         dst_release(&grt->dst);
1328                                         goto out;
1329                                 }
1330                         } else {
1331                                 dev = grt->rt6i_dev;
1332                                 idev = grt->rt6i_idev;
1333                                 dev_hold(dev);
1334                                 in6_dev_hold(grt->rt6i_idev);
1335                         }
1336                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1337                                 err = 0;
1338                         dst_release(&grt->dst);
1339
1340                         if (err)
1341                                 goto out;
1342                 }
1343                 err = -EINVAL;
1344                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1345                         goto out;
1346         }
1347
1348         err = -ENODEV;
1349         if (dev == NULL)
1350                 goto out;
1351
1352         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1353                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1354                         err = -EINVAL;
1355                         goto out;
1356                 }
1357                 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1358                 rt->rt6i_prefsrc.plen = 128;
1359         } else
1360                 rt->rt6i_prefsrc.plen = 0;
1361
1362         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1363                 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1364                 if (IS_ERR(n)) {
1365                         err = PTR_ERR(n);
1366                         goto out;
1367                 }
1368                 dst_set_neighbour(&rt->dst, n);
1369         }
1370
1371         rt->rt6i_flags = cfg->fc_flags;
1372
1373 install_route:
1374         if (cfg->fc_mx) {
1375                 struct nlattr *nla;
1376                 int remaining;
1377
1378                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1379                         int type = nla_type(nla);
1380
1381                         if (type) {
1382                                 if (type > RTAX_MAX) {
1383                                         err = -EINVAL;
1384                                         goto out;
1385                                 }
1386
1387                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1388                         }
1389                 }
1390         }
1391
1392         rt->dst.dev = dev;
1393         rt->rt6i_idev = idev;
1394         rt->rt6i_table = table;
1395
1396         cfg->fc_nlinfo.nl_net = dev_net(dev);
1397
1398         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1399
1400 out:
1401         if (dev)
1402                 dev_put(dev);
1403         if (idev)
1404                 in6_dev_put(idev);
1405         if (rt)
1406                 dst_free(&rt->dst);
1407         return err;
1408 }
1409
1410 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1411 {
1412         int err;
1413         struct fib6_table *table;
1414         struct net *net = dev_net(rt->rt6i_dev);
1415
1416         if (rt == net->ipv6.ip6_null_entry)
1417                 return -ENOENT;
1418
1419         table = rt->rt6i_table;
1420         write_lock_bh(&table->tb6_lock);
1421
1422         err = fib6_del(rt, info);
1423         dst_release(&rt->dst);
1424
1425         write_unlock_bh(&table->tb6_lock);
1426
1427         return err;
1428 }
1429
1430 int ip6_del_rt(struct rt6_info *rt)
1431 {
1432         struct nl_info info = {
1433                 .nl_net = dev_net(rt->rt6i_dev),
1434         };
1435         return __ip6_del_rt(rt, &info);
1436 }
1437
1438 static int ip6_route_del(struct fib6_config *cfg)
1439 {
1440         struct fib6_table *table;
1441         struct fib6_node *fn;
1442         struct rt6_info *rt;
1443         int err = -ESRCH;
1444
1445         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1446         if (table == NULL)
1447                 return err;
1448
1449         read_lock_bh(&table->tb6_lock);
1450
1451         fn = fib6_locate(&table->tb6_root,
1452                          &cfg->fc_dst, cfg->fc_dst_len,
1453                          &cfg->fc_src, cfg->fc_src_len);
1454
1455         if (fn) {
1456                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1457                         if (cfg->fc_ifindex &&
1458                             (rt->rt6i_dev == NULL ||
1459                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1460                                 continue;
1461                         if (cfg->fc_flags & RTF_GATEWAY &&
1462                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1463                                 continue;
1464                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1465                                 continue;
1466                         dst_hold(&rt->dst);
1467                         read_unlock_bh(&table->tb6_lock);
1468
1469                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1470                 }
1471         }
1472         read_unlock_bh(&table->tb6_lock);
1473
1474         return err;
1475 }
1476
1477 /*
1478  *      Handle redirects
1479  */
1480 struct ip6rd_flowi {
1481         struct flowi6 fl6;
1482         struct in6_addr gateway;
1483 };
1484
1485 static struct rt6_info *__ip6_route_redirect(struct net *net,
1486                                              struct fib6_table *table,
1487                                              struct flowi6 *fl6,
1488                                              int flags)
1489 {
1490         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1491         struct rt6_info *rt;
1492         struct fib6_node *fn;
1493
1494         /*
1495          * Get the "current" route for this destination and
1496          * check if the redirect has come from approriate router.
1497          *
1498          * RFC 2461 specifies that redirects should only be
1499          * accepted if they come from the nexthop to the target.
1500          * Due to the way the routes are chosen, this notion
1501          * is a bit fuzzy and one might need to check all possible
1502          * routes.
1503          */
1504
1505         read_lock_bh(&table->tb6_lock);
1506         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1507 restart:
1508         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1509                 /*
1510                  * Current route is on-link; redirect is always invalid.
1511                  *
1512                  * Seems, previous statement is not true. It could
1513                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1514                  * But then router serving it might decide, that we should
1515                  * know truth 8)8) --ANK (980726).
1516                  */
1517                 if (rt6_check_expired(rt))
1518                         continue;
1519                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1520                         continue;
1521                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1522                         continue;
1523                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1524                         continue;
1525                 break;
1526         }
1527
1528         if (!rt)
1529                 rt = net->ipv6.ip6_null_entry;
1530         BACKTRACK(net, &fl6->saddr);
1531 out:
1532         dst_hold(&rt->dst);
1533
1534         read_unlock_bh(&table->tb6_lock);
1535
1536         return rt;
1537 };
1538
1539 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1540                                            const struct in6_addr *src,
1541                                            const struct in6_addr *gateway,
1542                                            struct net_device *dev)
1543 {
1544         int flags = RT6_LOOKUP_F_HAS_SADDR;
1545         struct net *net = dev_net(dev);
1546         struct ip6rd_flowi rdfl = {
1547                 .fl6 = {
1548                         .flowi6_oif = dev->ifindex,
1549                         .daddr = *dest,
1550                         .saddr = *src,
1551                 },
1552         };
1553
1554         ipv6_addr_copy(&rdfl.gateway, gateway);
1555
1556         if (rt6_need_strict(dest))
1557                 flags |= RT6_LOOKUP_F_IFACE;
1558
1559         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1560                                                    flags, __ip6_route_redirect);
1561 }
1562
1563 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1564                   const struct in6_addr *saddr,
1565                   struct neighbour *neigh, u8 *lladdr, int on_link)
1566 {
1567         struct rt6_info *rt, *nrt = NULL;
1568         struct netevent_redirect netevent;
1569         struct net *net = dev_net(neigh->dev);
1570
1571         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1572
1573         if (rt == net->ipv6.ip6_null_entry) {
1574                 if (net_ratelimit())
1575                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1576                                "for redirect target\n");
1577                 goto out;
1578         }
1579
1580         /*
1581          *      We have finally decided to accept it.
1582          */
1583
1584         neigh_update(neigh, lladdr, NUD_STALE,
1585                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1586                      NEIGH_UPDATE_F_OVERRIDE|
1587                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1588                                      NEIGH_UPDATE_F_ISROUTER))
1589                      );
1590
1591         /*
1592          * Redirect received -> path was valid.
1593          * Look, redirects are sent only in response to data packets,
1594          * so that this nexthop apparently is reachable. --ANK
1595          */
1596         dst_confirm(&rt->dst);
1597
1598         /* Duplicate redirect: silently ignore. */
1599         if (neigh == dst_get_neighbour_raw(&rt->dst))
1600                 goto out;
1601
1602         nrt = ip6_rt_copy(rt, dest);
1603         if (nrt == NULL)
1604                 goto out;
1605
1606         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1607         if (on_link)
1608                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1609
1610         nrt->rt6i_dst.plen = 128;
1611         nrt->dst.flags |= DST_HOST;
1612
1613         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1614         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1615
1616         if (ip6_ins_rt(nrt))
1617                 goto out;
1618
1619         netevent.old = &rt->dst;
1620         netevent.new = &nrt->dst;
1621         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1622
1623         if (rt->rt6i_flags&RTF_CACHE) {
1624                 ip6_del_rt(rt);
1625                 return;
1626         }
1627
1628 out:
1629         dst_release(&rt->dst);
1630 }
1631
1632 /*
1633  *      Handle ICMP "packet too big" messages
1634  *      i.e. Path MTU discovery
1635  */
1636
1637 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1638                              struct net *net, u32 pmtu, int ifindex)
1639 {
1640         struct rt6_info *rt, *nrt;
1641         int allfrag = 0;
1642 again:
1643         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1644         if (rt == NULL)
1645                 return;
1646
1647         if (rt6_check_expired(rt)) {
1648                 ip6_del_rt(rt);
1649                 goto again;
1650         }
1651
1652         if (pmtu >= dst_mtu(&rt->dst))
1653                 goto out;
1654
1655         if (pmtu < IPV6_MIN_MTU) {
1656                 /*
1657                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1658                  * MTU (1280) and a fragment header should always be included
1659                  * after a node receiving Too Big message reporting PMTU is
1660                  * less than the IPv6 Minimum Link MTU.
1661                  */
1662                 pmtu = IPV6_MIN_MTU;
1663                 allfrag = 1;
1664         }
1665
1666         /* New mtu received -> path was valid.
1667            They are sent only in response to data packets,
1668            so that this nexthop apparently is reachable. --ANK
1669          */
1670         dst_confirm(&rt->dst);
1671
1672         /* Host route. If it is static, it would be better
1673            not to override it, but add new one, so that
1674            when cache entry will expire old pmtu
1675            would return automatically.
1676          */
1677         if (rt->rt6i_flags & RTF_CACHE) {
1678                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1679                 if (allfrag) {
1680                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1681                         features |= RTAX_FEATURE_ALLFRAG;
1682                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1683                 }
1684                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1685                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1686                 goto out;
1687         }
1688
1689         /* Network route.
1690            Two cases are possible:
1691            1. It is connected route. Action: COW
1692            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1693          */
1694         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1695                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1696         else
1697                 nrt = rt6_alloc_clone(rt, daddr);
1698
1699         if (nrt) {
1700                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1701                 if (allfrag) {
1702                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1703                         features |= RTAX_FEATURE_ALLFRAG;
1704                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1705                 }
1706
1707                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1708                  * happened within 5 mins, the recommended timer is 10 mins.
1709                  * Here this route expiration time is set to ip6_rt_mtu_expires
1710                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1711                  * and detecting PMTU increase will be automatically happened.
1712                  */
1713                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1714                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1715
1716                 ip6_ins_rt(nrt);
1717         }
1718 out:
1719         dst_release(&rt->dst);
1720 }
1721
1722 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1723                         struct net_device *dev, u32 pmtu)
1724 {
1725         struct net *net = dev_net(dev);
1726
1727         /*
1728          * RFC 1981 states that a node "MUST reduce the size of the packets it
1729          * is sending along the path" that caused the Packet Too Big message.
1730          * Since it's not possible in the general case to determine which
1731          * interface was used to send the original packet, we update the MTU
1732          * on the interface that will be used to send future packets. We also
1733          * update the MTU on the interface that received the Packet Too Big in
1734          * case the original packet was forced out that interface with
1735          * SO_BINDTODEVICE or similar. This is the next best thing to the
1736          * correct behaviour, which would be to update the MTU on all
1737          * interfaces.
1738          */
1739         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1740         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1741 }
1742
1743 /*
1744  *      Misc support functions
1745  */
1746
1747 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1748                                     const struct in6_addr *dest)
1749 {
1750         struct net *net = dev_net(ort->rt6i_dev);
1751         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1752                                             ort->dst.dev, 0);
1753
1754         if (rt) {
1755                 rt->dst.input = ort->dst.input;
1756                 rt->dst.output = ort->dst.output;
1757
1758                 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1759                 rt->rt6i_dst.plen = ort->rt6i_dst.plen;
1760                 dst_copy_metrics(&rt->dst, &ort->dst);
1761                 rt->dst.error = ort->dst.error;
1762                 rt->rt6i_idev = ort->rt6i_idev;
1763                 if (rt->rt6i_idev)
1764                         in6_dev_hold(rt->rt6i_idev);
1765                 rt->dst.lastuse = jiffies;
1766                 rt->rt6i_expires = 0;
1767
1768                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1769                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1770                 rt->rt6i_metric = 0;
1771
1772 #ifdef CONFIG_IPV6_SUBTREES
1773                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1774 #endif
1775                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1776                 rt->rt6i_table = ort->rt6i_table;
1777         }
1778         return rt;
1779 }
1780
1781 #ifdef CONFIG_IPV6_ROUTE_INFO
1782 static struct rt6_info *rt6_get_route_info(struct net *net,
1783                                            const struct in6_addr *prefix, int prefixlen,
1784                                            const struct in6_addr *gwaddr, int ifindex)
1785 {
1786         struct fib6_node *fn;
1787         struct rt6_info *rt = NULL;
1788         struct fib6_table *table;
1789
1790         table = fib6_get_table(net, RT6_TABLE_INFO);
1791         if (table == NULL)
1792                 return NULL;
1793
1794         write_lock_bh(&table->tb6_lock);
1795         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1796         if (!fn)
1797                 goto out;
1798
1799         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1800                 if (rt->rt6i_dev->ifindex != ifindex)
1801                         continue;
1802                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1803                         continue;
1804                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1805                         continue;
1806                 dst_hold(&rt->dst);
1807                 break;
1808         }
1809 out:
1810         write_unlock_bh(&table->tb6_lock);
1811         return rt;
1812 }
1813
1814 static struct rt6_info *rt6_add_route_info(struct net *net,
1815                                            const struct in6_addr *prefix, int prefixlen,
1816                                            const struct in6_addr *gwaddr, int ifindex,
1817                                            unsigned pref)
1818 {
1819         struct fib6_config cfg = {
1820                 .fc_table       = RT6_TABLE_INFO,
1821                 .fc_metric      = IP6_RT_PRIO_USER,
1822                 .fc_ifindex     = ifindex,
1823                 .fc_dst_len     = prefixlen,
1824                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1825                                   RTF_UP | RTF_PREF(pref),
1826                 .fc_nlinfo.pid = 0,
1827                 .fc_nlinfo.nlh = NULL,
1828                 .fc_nlinfo.nl_net = net,
1829         };
1830
1831         ipv6_addr_copy(&cfg.fc_dst, prefix);
1832         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1833
1834         /* We should treat it as a default route if prefix length is 0. */
1835         if (!prefixlen)
1836                 cfg.fc_flags |= RTF_DEFAULT;
1837
1838         ip6_route_add(&cfg);
1839
1840         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1841 }
1842 #endif
1843
1844 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1845 {
1846         struct rt6_info *rt;
1847         struct fib6_table *table;
1848
1849         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1850         if (table == NULL)
1851                 return NULL;
1852
1853         write_lock_bh(&table->tb6_lock);
1854         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1855                 if (dev == rt->rt6i_dev &&
1856                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1857                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1858                         break;
1859         }
1860         if (rt)
1861                 dst_hold(&rt->dst);
1862         write_unlock_bh(&table->tb6_lock);
1863         return rt;
1864 }
1865
1866 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1867                                      struct net_device *dev,
1868                                      unsigned int pref)
1869 {
1870         struct fib6_config cfg = {
1871                 .fc_table       = RT6_TABLE_DFLT,
1872                 .fc_metric      = IP6_RT_PRIO_USER,
1873                 .fc_ifindex     = dev->ifindex,
1874                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1875                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1876                 .fc_nlinfo.pid = 0,
1877                 .fc_nlinfo.nlh = NULL,
1878                 .fc_nlinfo.nl_net = dev_net(dev),
1879         };
1880
1881         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1882
1883         ip6_route_add(&cfg);
1884
1885         return rt6_get_dflt_router(gwaddr, dev);
1886 }
1887
1888 void rt6_purge_dflt_routers(struct net *net)
1889 {
1890         struct rt6_info *rt;
1891         struct fib6_table *table;
1892
1893         /* NOTE: Keep consistent with rt6_get_dflt_router */
1894         table = fib6_get_table(net, RT6_TABLE_DFLT);
1895         if (table == NULL)
1896                 return;
1897
1898 restart:
1899         read_lock_bh(&table->tb6_lock);
1900         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1901                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1902                         dst_hold(&rt->dst);
1903                         read_unlock_bh(&table->tb6_lock);
1904                         ip6_del_rt(rt);
1905                         goto restart;
1906                 }
1907         }
1908         read_unlock_bh(&table->tb6_lock);
1909 }
1910
1911 static void rtmsg_to_fib6_config(struct net *net,
1912                                  struct in6_rtmsg *rtmsg,
1913                                  struct fib6_config *cfg)
1914 {
1915         memset(cfg, 0, sizeof(*cfg));
1916
1917         cfg->fc_table = RT6_TABLE_MAIN;
1918         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1919         cfg->fc_metric = rtmsg->rtmsg_metric;
1920         cfg->fc_expires = rtmsg->rtmsg_info;
1921         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1922         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1923         cfg->fc_flags = rtmsg->rtmsg_flags;
1924
1925         cfg->fc_nlinfo.nl_net = net;
1926
1927         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1928         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1929         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1930 }
1931
1932 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1933 {
1934         struct fib6_config cfg;
1935         struct in6_rtmsg rtmsg;
1936         int err;
1937
1938         switch(cmd) {
1939         case SIOCADDRT:         /* Add a route */
1940         case SIOCDELRT:         /* Delete a route */
1941                 if (!capable(CAP_NET_ADMIN))
1942                         return -EPERM;
1943                 err = copy_from_user(&rtmsg, arg,
1944                                      sizeof(struct in6_rtmsg));
1945                 if (err)
1946                         return -EFAULT;
1947
1948                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1949
1950                 rtnl_lock();
1951                 switch (cmd) {
1952                 case SIOCADDRT:
1953                         err = ip6_route_add(&cfg);
1954                         break;
1955                 case SIOCDELRT:
1956                         err = ip6_route_del(&cfg);
1957                         break;
1958                 default:
1959                         err = -EINVAL;
1960                 }
1961                 rtnl_unlock();
1962
1963                 return err;
1964         }
1965
1966         return -EINVAL;
1967 }
1968
1969 /*
1970  *      Drop the packet on the floor
1971  */
1972
1973 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1974 {
1975         int type;
1976         struct dst_entry *dst = skb_dst(skb);
1977         switch (ipstats_mib_noroutes) {
1978         case IPSTATS_MIB_INNOROUTES:
1979                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1980                 if (type == IPV6_ADDR_ANY) {
1981                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1982                                       IPSTATS_MIB_INADDRERRORS);
1983                         break;
1984                 }
1985                 /* FALLTHROUGH */
1986         case IPSTATS_MIB_OUTNOROUTES:
1987                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1988                               ipstats_mib_noroutes);
1989                 break;
1990         }
1991         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1992         kfree_skb(skb);
1993         return 0;
1994 }
1995
1996 static int ip6_pkt_discard(struct sk_buff *skb)
1997 {
1998         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1999 }
2000
2001 static int ip6_pkt_discard_out(struct sk_buff *skb)
2002 {
2003         skb->dev = skb_dst(skb)->dev;
2004         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2005 }
2006
2007 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2008
2009 static int ip6_pkt_prohibit(struct sk_buff *skb)
2010 {
2011         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2012 }
2013
2014 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2015 {
2016         skb->dev = skb_dst(skb)->dev;
2017         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2018 }
2019
2020 #endif
2021
2022 /*
2023  *      Allocate a dst for local (unicast / anycast) address.
2024  */
2025
2026 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2027                                     const struct in6_addr *addr,
2028                                     int anycast)
2029 {
2030         struct net *net = dev_net(idev->dev);
2031         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2032                                             net->loopback_dev, 0);
2033         struct neighbour *neigh;
2034
2035         if (rt == NULL) {
2036                 if (net_ratelimit())
2037                         pr_warning("IPv6:  Maximum number of routes reached,"
2038                                    " consider increasing route/max_size.\n");
2039                 return ERR_PTR(-ENOMEM);
2040         }
2041
2042         in6_dev_hold(idev);
2043
2044         rt->dst.flags |= DST_HOST;
2045         rt->dst.input = ip6_input;
2046         rt->dst.output = ip6_output;
2047         rt->rt6i_idev = idev;
2048         rt->dst.obsolete = -1;
2049
2050         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2051         if (anycast)
2052                 rt->rt6i_flags |= RTF_ANYCAST;
2053         else
2054                 rt->rt6i_flags |= RTF_LOCAL;
2055         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2056         if (IS_ERR(neigh)) {
2057                 dst_free(&rt->dst);
2058
2059                 return ERR_CAST(neigh);
2060         }
2061         dst_set_neighbour(&rt->dst, neigh);
2062
2063         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2064         rt->rt6i_dst.plen = 128;
2065         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2066
2067         atomic_set(&rt->dst.__refcnt, 1);
2068
2069         return rt;
2070 }
2071
2072 int ip6_route_get_saddr(struct net *net,
2073                         struct rt6_info *rt,
2074                         const struct in6_addr *daddr,
2075                         unsigned int prefs,
2076                         struct in6_addr *saddr)
2077 {
2078         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2079         int err = 0;
2080         if (rt->rt6i_prefsrc.plen)
2081                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2082         else
2083                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2084                                          daddr, prefs, saddr);
2085         return err;
2086 }
2087
/*
 * remove deleted ip from prefsrc entries
 *
 * Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace whose null entry is skipped */
	struct in6_addr *addr;		/* address compared with each prefsrc */
};
2094
2095 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2096 {
2097         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2098         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2099         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2100
2101         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2102             rt != net->ipv6.ip6_null_entry &&
2103             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2104                 /* remove prefsrc entry */
2105                 rt->rt6i_prefsrc.plen = 0;
2106         }
2107         return 0;
2108 }
2109
2110 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2111 {
2112         struct net *net = dev_net(ifp->idev->dev);
2113         struct arg_dev_net_ip adni = {
2114                 .dev = ifp->idev->dev,
2115                 .net = net,
2116                 .addr = &ifp->addr,
2117         };
2118         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2119 }
2120
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace whose null entry is kept */
};
2125
2126 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2127 {
2128         const struct arg_dev_net *adn = arg;
2129         const struct net_device *dev = adn->dev;
2130
2131         if ((rt->rt6i_dev == dev || dev == NULL) &&
2132             rt != adn->net->ipv6.ip6_null_entry) {
2133                 RT6_TRACE("deleted by ifdown %p\n", rt);
2134                 return -1;
2135         }
2136         return 0;
2137 }
2138
2139 void rt6_ifdown(struct net *net, struct net_device *dev)
2140 {
2141         struct arg_dev_net adn = {
2142                 .dev = dev,
2143                 .net = net,
2144         };
2145
2146         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2147         icmp6_clean_all(fib6_ifdown, &adn);
2148 }
2149
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose routes are updated */
	unsigned mtu;			/* MTU value applied to those routes */
};
2155
2156 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2157 {
2158         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2159         struct inet6_dev *idev;
2160
2161         /* In IPv6 pmtu discovery is not optional,
2162            so that RTAX_MTU lock cannot disable it.
2163            We still use this lock to block changes
2164            caused by addrconf/ndisc.
2165         */
2166
2167         idev = __in6_dev_get(arg->dev);
2168         if (idev == NULL)
2169                 return 0;
2170
2171         /* For administrative MTU increase, there is no way to discover
2172            IPv6 PMTU increase, so PMTU increase should be updated here.
2173            Since RFC 1981 doesn't include administrative MTU increase
2174            update PMTU increase is a MUST. (i.e. jumbo frame)
2175          */
2176         /*
2177            If new MTU is less than route PMTU, this new MTU will be the
2178            lowest MTU in the path, update the route PMTU to reflect PMTU
2179            decreases; if new MTU is greater than route PMTU, and the
2180            old MTU is the lowest MTU in the path, update the route PMTU
2181            to reflect the increase. In this case if the other nodes' MTU
2182            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2183            PMTU discouvery.
2184          */
2185         if (rt->rt6i_dev == arg->dev &&
2186             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2187             (dst_mtu(&rt->dst) >= arg->mtu ||
2188              (dst_mtu(&rt->dst) < arg->mtu &&
2189               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2190                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2191         }
2192         return 0;
2193 }
2194
2195 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2196 {
2197         struct rt6_mtu_change_arg arg = {
2198                 .dev = dev,
2199                 .mtu = mtu,
2200         };
2201
2202         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2203 }
2204
2205 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2206         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2207         [RTA_OIF]               = { .type = NLA_U32 },
2208         [RTA_IIF]               = { .type = NLA_U32 },
2209         [RTA_PRIORITY]          = { .type = NLA_U32 },
2210         [RTA_METRICS]           = { .type = NLA_NESTED },
2211 };
2212
2213 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2214                               struct fib6_config *cfg)
2215 {
2216         struct rtmsg *rtm;
2217         struct nlattr *tb[RTA_MAX+1];
2218         int err;
2219
2220         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2221         if (err < 0)
2222                 goto errout;
2223
2224         err = -EINVAL;
2225         rtm = nlmsg_data(nlh);
2226         memset(cfg, 0, sizeof(*cfg));
2227
2228         cfg->fc_table = rtm->rtm_table;
2229         cfg->fc_dst_len = rtm->rtm_dst_len;
2230         cfg->fc_src_len = rtm->rtm_src_len;
2231         cfg->fc_flags = RTF_UP;
2232         cfg->fc_protocol = rtm->rtm_protocol;
2233
2234         if (rtm->rtm_type == RTN_UNREACHABLE)
2235                 cfg->fc_flags |= RTF_REJECT;
2236
2237         if (rtm->rtm_type == RTN_LOCAL)
2238                 cfg->fc_flags |= RTF_LOCAL;
2239
2240         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2241         cfg->fc_nlinfo.nlh = nlh;
2242         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2243
2244         if (tb[RTA_GATEWAY]) {
2245                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2246                 cfg->fc_flags |= RTF_GATEWAY;
2247         }
2248
2249         if (tb[RTA_DST]) {
2250                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2251
2252                 if (nla_len(tb[RTA_DST]) < plen)
2253                         goto errout;
2254
2255                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2256         }
2257
2258         if (tb[RTA_SRC]) {
2259                 int plen = (rtm->rtm_src_len + 7) >> 3;
2260
2261                 if (nla_len(tb[RTA_SRC]) < plen)
2262                         goto errout;
2263
2264                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2265         }
2266
2267         if (tb[RTA_PREFSRC])
2268                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2269
2270         if (tb[RTA_OIF])
2271                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2272
2273         if (tb[RTA_PRIORITY])
2274                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2275
2276         if (tb[RTA_METRICS]) {
2277                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2278                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2279         }
2280
2281         if (tb[RTA_TABLE])
2282                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2283
2284         err = 0;
2285 errout:
2286         return err;
2287 }
2288
2289 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2290 {
2291         struct fib6_config cfg;
2292         int err;
2293
2294         err = rtm_to_fib6_config(skb, nlh, &cfg);
2295         if (err < 0)
2296                 return err;
2297
2298         return ip6_route_del(&cfg);
2299 }
2300
2301 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2302 {
2303         struct fib6_config cfg;
2304         int err;
2305
2306         err = rtm_to_fib6_config(skb, nlh, &cfg);
2307         if (err < 0)
2308                 return err;
2309
2310         return ip6_route_add(&cfg);
2311 }
2312
2313 static inline size_t rt6_nlmsg_size(void)
2314 {
2315         return NLMSG_ALIGN(sizeof(struct rtmsg))
2316                + nla_total_size(16) /* RTA_SRC */
2317                + nla_total_size(16) /* RTA_DST */
2318                + nla_total_size(16) /* RTA_GATEWAY */
2319                + nla_total_size(16) /* RTA_PREFSRC */
2320                + nla_total_size(4) /* RTA_TABLE */
2321                + nla_total_size(4) /* RTA_IIF */
2322                + nla_total_size(4) /* RTA_OIF */
2323                + nla_total_size(4) /* RTA_PRIORITY */
2324                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2325                + nla_total_size(sizeof(struct rta_cacheinfo));
2326 }
2327
2328 static int rt6_fill_node(struct net *net,
2329                          struct sk_buff *skb, struct rt6_info *rt,
2330                          struct in6_addr *dst, struct in6_addr *src,
2331                          int iif, int type, u32 pid, u32 seq,
2332                          int prefix, int nowait, unsigned int flags)
2333 {
2334         struct rtmsg *rtm;
2335         struct nlmsghdr *nlh;
2336         long expires;
2337         u32 table;
2338         struct neighbour *n;
2339
2340         if (prefix) {   /* user wants prefix routes only */
2341                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2342                         /* success since this is not a prefix route */
2343                         return 1;
2344                 }
2345         }
2346
2347         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2348         if (nlh == NULL)
2349                 return -EMSGSIZE;
2350
2351         rtm = nlmsg_data(nlh);
2352         rtm->rtm_family = AF_INET6;
2353         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2354         rtm->rtm_src_len = rt->rt6i_src.plen;
2355         rtm->rtm_tos = 0;
2356         if (rt->rt6i_table)
2357                 table = rt->rt6i_table->tb6_id;
2358         else
2359                 table = RT6_TABLE_UNSPEC;
2360         rtm->rtm_table = table;
2361         NLA_PUT_U32(skb, RTA_TABLE, table);
2362         if (rt->rt6i_flags&RTF_REJECT)
2363                 rtm->rtm_type = RTN_UNREACHABLE;
2364         else if (rt->rt6i_flags&RTF_LOCAL)
2365                 rtm->rtm_type = RTN_LOCAL;
2366         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2367                 rtm->rtm_type = RTN_LOCAL;
2368         else
2369                 rtm->rtm_type = RTN_UNICAST;
2370         rtm->rtm_flags = 0;
2371         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2372         rtm->rtm_protocol = rt->rt6i_protocol;
2373         if (rt->rt6i_flags&RTF_DYNAMIC)
2374                 rtm->rtm_protocol = RTPROT_REDIRECT;
2375         else if (rt->rt6i_flags & RTF_ADDRCONF)
2376                 rtm->rtm_protocol = RTPROT_KERNEL;
2377         else if (rt->rt6i_flags&RTF_DEFAULT)
2378                 rtm->rtm_protocol = RTPROT_RA;
2379
2380         if (rt->rt6i_flags&RTF_CACHE)
2381                 rtm->rtm_flags |= RTM_F_CLONED;
2382
2383         if (dst) {
2384                 NLA_PUT(skb, RTA_DST, 16, dst);
2385                 rtm->rtm_dst_len = 128;
2386         } else if (rtm->rtm_dst_len)
2387                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2388 #ifdef CONFIG_IPV6_SUBTREES
2389         if (src) {
2390                 NLA_PUT(skb, RTA_SRC, 16, src);
2391                 rtm->rtm_src_len = 128;
2392         } else if (rtm->rtm_src_len)
2393                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2394 #endif
2395         if (iif) {
2396 #ifdef CONFIG_IPV6_MROUTE
2397                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2398                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2399                         if (err <= 0) {
2400                                 if (!nowait) {
2401                                         if (err == 0)
2402                                                 return 0;
2403                                         goto nla_put_failure;
2404                                 } else {
2405                                         if (err == -EMSGSIZE)
2406                                                 goto nla_put_failure;
2407                                 }
2408                         }
2409                 } else
2410 #endif
2411                         NLA_PUT_U32(skb, RTA_IIF, iif);
2412         } else if (dst) {
2413                 struct in6_addr saddr_buf;
2414                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2415                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2416         }
2417
2418         if (rt->rt6i_prefsrc.plen) {
2419                 struct in6_addr saddr_buf;
2420                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2421                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2422         }
2423
2424         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2425                 goto nla_put_failure;
2426
2427         rcu_read_lock();
2428         n = dst_get_neighbour(&rt->dst);
2429         if (n)
2430                 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2431         rcu_read_unlock();
2432
2433         if (rt->dst.dev)
2434                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2435
2436         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2437
2438         if (!(rt->rt6i_flags & RTF_EXPIRES))
2439                 expires = 0;
2440         else if (rt->rt6i_expires - jiffies < INT_MAX)
2441                 expires = rt->rt6i_expires - jiffies;
2442         else
2443                 expires = INT_MAX;
2444
2445         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2446                                expires, rt->dst.error) < 0)
2447                 goto nla_put_failure;
2448
2449         return nlmsg_end(skb, nlh);
2450
2451 nla_put_failure:
2452         nlmsg_cancel(skb, nlh);
2453         return -EMSGSIZE;
2454 }
2455
2456 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2457 {
2458         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2459         int prefix;
2460
2461         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2462                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2463                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2464         } else
2465                 prefix = 0;
2466
2467         return rt6_fill_node(arg->net,
2468                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2469                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2470                      prefix, 0, NLM_F_MULTI);
2471 }
2472
2473 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2474 {
2475         struct net *net = sock_net(in_skb->sk);
2476         struct nlattr *tb[RTA_MAX+1];
2477         struct rt6_info *rt;
2478         struct sk_buff *skb;
2479         struct rtmsg *rtm;
2480         struct flowi6 fl6;
2481         int err, iif = 0;
2482
2483         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2484         if (err < 0)
2485                 goto errout;
2486
2487         err = -EINVAL;
2488         memset(&fl6, 0, sizeof(fl6));
2489
2490         if (tb[RTA_SRC]) {
2491                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2492                         goto errout;
2493
2494                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2495         }
2496
2497         if (tb[RTA_DST]) {
2498                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2499                         goto errout;
2500
2501                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2502         }
2503
2504         if (tb[RTA_IIF])
2505                 iif = nla_get_u32(tb[RTA_IIF]);
2506
2507         if (tb[RTA_OIF])
2508                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2509
2510         if (iif) {
2511                 struct net_device *dev;
2512                 dev = __dev_get_by_index(net, iif);
2513                 if (!dev) {
2514                         err = -ENODEV;
2515                         goto errout;
2516                 }
2517         }
2518
2519         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2520         if (skb == NULL) {
2521                 err = -ENOBUFS;
2522                 goto errout;
2523         }
2524
2525         /* Reserve room for dummy headers, this skb can pass
2526            through good chunk of routing engine.
2527          */
2528         skb_reset_mac_header(skb);
2529         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2530
2531         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2532         skb_dst_set(skb, &rt->dst);
2533
2534         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2535                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2536                             nlh->nlmsg_seq, 0, 0, 0);
2537         if (err < 0) {
2538                 kfree_skb(skb);
2539                 goto errout;
2540         }
2541
2542         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2543 errout:
2544         return err;
2545 }
2546
2547 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2548 {
2549         struct sk_buff *skb;
2550         struct net *net = info->nl_net;
2551         u32 seq;
2552         int err;
2553
2554         err = -ENOBUFS;
2555         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2556
2557         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2558         if (skb == NULL)
2559                 goto errout;
2560
2561         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2562                                 event, info->pid, seq, 0, 0, 0);
2563         if (err < 0) {
2564                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2565                 WARN_ON(err == -EMSGSIZE);
2566                 kfree_skb(skb);
2567                 goto errout;
2568         }
2569         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2570                     info->nlh, gfp_any());
2571         return;
2572 errout:
2573         if (err < 0)
2574                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2575 }
2576
2577 static int ip6_route_dev_notify(struct notifier_block *this,
2578                                 unsigned long event, void *data)
2579 {
2580         struct net_device *dev = (struct net_device *)data;
2581         struct net *net = dev_net(dev);
2582
2583         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2584                 net->ipv6.ip6_null_entry->dst.dev = dev;
2585                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2586 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2587                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2588                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2589                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2590                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2591 #endif
2592         }
2593
2594         return NOTIFY_OK;
2595 }
2596
2597 /*
2598  *      /proc
2599  */
2600
2601 #ifdef CONFIG_PROC_FS
2602
/*
 *	Buffer/offset bookkeeping record.
 *
 *	NOTE(review): the /proc handlers in this section write straight to
 *	a seq_file and do not reference this structure; it may be a
 *	leftover -- confirm against the rest of the tree before removing.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2611
2612 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2613 {
2614         struct seq_file *m = p_arg;
2615         struct neighbour *n;
2616
2617         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2618
2619 #ifdef CONFIG_IPV6_SUBTREES
2620         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2621 #else
2622         seq_puts(m, "00000000000000000000000000000000 00 ");
2623 #endif
2624         rcu_read_lock();
2625         n = dst_get_neighbour(&rt->dst);
2626         if (n) {
2627                 seq_printf(m, "%pi6", n->primary_key);
2628         } else {
2629                 seq_puts(m, "00000000000000000000000000000000");
2630         }
2631         rcu_read_unlock();
2632         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2633                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2634                    rt->dst.__use, rt->rt6i_flags,
2635                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2636         return 0;
2637 }
2638
2639 static int ipv6_route_show(struct seq_file *m, void *v)
2640 {
2641         struct net *net = (struct net *)m->private;
2642         fib6_clean_all(net, rt6_info_route, 0, m);
2643         return 0;
2644 }
2645
2646 static int ipv6_route_open(struct inode *inode, struct file *file)
2647 {
2648         return single_open_net(inode, file, ipv6_route_show);
2649 }
2650
2651 static const struct file_operations ipv6_route_proc_fops = {
2652         .owner          = THIS_MODULE,
2653         .open           = ipv6_route_open,
2654         .read           = seq_read,
2655         .llseek         = seq_lseek,
2656         .release        = single_release_net,
2657 };
2658
2659 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2660 {
2661         struct net *net = (struct net *)seq->private;
2662         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2663                    net->ipv6.rt6_stats->fib_nodes,
2664                    net->ipv6.rt6_stats->fib_route_nodes,
2665                    net->ipv6.rt6_stats->fib_rt_alloc,
2666                    net->ipv6.rt6_stats->fib_rt_entries,
2667                    net->ipv6.rt6_stats->fib_rt_cache,
2668                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2669                    net->ipv6.rt6_stats->fib_discarded_routes);
2670
2671         return 0;
2672 }
2673
2674 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2675 {
2676         return single_open_net(inode, file, rt6_stats_seq_show);
2677 }
2678
2679 static const struct file_operations rt6_stats_seq_fops = {
2680         .owner   = THIS_MODULE,
2681         .open    = rt6_stats_seq_open,
2682         .read    = seq_read,
2683         .llseek  = seq_lseek,
2684         .release = single_release_net,
2685 };
2686 #endif  /* CONFIG_PROC_FS */
2687
2688 #ifdef CONFIG_SYSCTL
2689
2690 static
2691 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2692                               void __user *buffer, size_t *lenp, loff_t *ppos)
2693 {
2694         struct net *net;
2695         int delay;
2696         if (!write)
2697                 return -EINVAL;
2698
2699         net = (struct net *)ctl->extra1;
2700         delay = net->ipv6.sysctl.flush_delay;
2701         proc_dointvec(ctl, write, buffer, lenp, ppos);
2702         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2703         return 0;
2704 }
2705
2706 ctl_table ipv6_route_table_template[] = {
2707         {
2708                 .procname       =       "flush",
2709                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2710                 .maxlen         =       sizeof(int),
2711                 .mode           =       0200,
2712                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2713         },
2714         {
2715                 .procname       =       "gc_thresh",
2716                 .data           =       &ip6_dst_ops_template.gc_thresh,
2717                 .maxlen         =       sizeof(int),
2718                 .mode           =       0644,
2719                 .proc_handler   =       proc_dointvec,
2720         },
2721         {
2722                 .procname       =       "max_size",
2723                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2724                 .maxlen         =       sizeof(int),
2725                 .mode           =       0644,
2726                 .proc_handler   =       proc_dointvec,
2727         },
2728         {
2729                 .procname       =       "gc_min_interval",
2730                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2731                 .maxlen         =       sizeof(int),
2732                 .mode           =       0644,
2733                 .proc_handler   =       proc_dointvec_jiffies,
2734         },
2735         {
2736                 .procname       =       "gc_timeout",
2737                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2738                 .maxlen         =       sizeof(int),
2739                 .mode           =       0644,
2740                 .proc_handler   =       proc_dointvec_jiffies,
2741         },
2742         {
2743                 .procname       =       "gc_interval",
2744                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2745                 .maxlen         =       sizeof(int),
2746                 .mode           =       0644,
2747                 .proc_handler   =       proc_dointvec_jiffies,
2748         },
2749         {
2750                 .procname       =       "gc_elasticity",
2751                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2752                 .maxlen         =       sizeof(int),
2753                 .mode           =       0644,
2754                 .proc_handler   =       proc_dointvec,
2755         },
2756         {
2757                 .procname       =       "mtu_expires",
2758                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2759                 .maxlen         =       sizeof(int),
2760                 .mode           =       0644,
2761                 .proc_handler   =       proc_dointvec_jiffies,
2762         },
2763         {
2764                 .procname       =       "min_adv_mss",
2765                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2766                 .maxlen         =       sizeof(int),
2767                 .mode           =       0644,
2768                 .proc_handler   =       proc_dointvec,
2769         },
2770         {
2771                 .procname       =       "gc_min_interval_ms",
2772                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2773                 .maxlen         =       sizeof(int),
2774                 .mode           =       0644,
2775                 .proc_handler   =       proc_dointvec_ms_jiffies,
2776         },
2777         { }
2778 };
2779
2780 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2781 {
2782         struct ctl_table *table;
2783
2784         table = kmemdup(ipv6_route_table_template,
2785                         sizeof(ipv6_route_table_template),
2786                         GFP_KERNEL);
2787
2788         if (table) {
2789                 table[0].data = &net->ipv6.sysctl.flush_delay;
2790                 table[0].extra1 = net;
2791                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2792                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2793                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2794                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2795                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2796                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2797                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2798                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2799                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2800         }
2801
2802         return table;
2803 }
2804 #endif
2805
2806 static int __net_init ip6_route_net_init(struct net *net)
2807 {
2808         int ret = -ENOMEM;
2809
2810         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2811                sizeof(net->ipv6.ip6_dst_ops));
2812
2813         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2814                 goto out_ip6_dst_ops;
2815
2816         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2817                                            sizeof(*net->ipv6.ip6_null_entry),
2818                                            GFP_KERNEL);
2819         if (!net->ipv6.ip6_null_entry)
2820                 goto out_ip6_dst_entries;
2821         net->ipv6.ip6_null_entry->dst.path =
2822                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2823         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2824         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2825                          ip6_template_metrics, true);
2826
2827 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2828         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2829                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2830                                                GFP_KERNEL);
2831         if (!net->ipv6.ip6_prohibit_entry)
2832                 goto out_ip6_null_entry;
2833         net->ipv6.ip6_prohibit_entry->dst.path =
2834                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2835         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2836         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2837                          ip6_template_metrics, true);
2838
2839         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2840                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2841                                                GFP_KERNEL);
2842         if (!net->ipv6.ip6_blk_hole_entry)
2843                 goto out_ip6_prohibit_entry;
2844         net->ipv6.ip6_blk_hole_entry->dst.path =
2845                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2846         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2847         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2848                          ip6_template_metrics, true);
2849 #endif
2850
2851         net->ipv6.sysctl.flush_delay = 0;
2852         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2853         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2854         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2855         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2856         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2857         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2858         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2859
2860 #ifdef CONFIG_PROC_FS
2861         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2862         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2863 #endif
2864         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2865
2866         ret = 0;
2867 out:
2868         return ret;
2869
2870 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2871 out_ip6_prohibit_entry:
2872         kfree(net->ipv6.ip6_prohibit_entry);
2873 out_ip6_null_entry:
2874         kfree(net->ipv6.ip6_null_entry);
2875 #endif
2876 out_ip6_dst_entries:
2877         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2878 out_ip6_dst_ops:
2879         goto out;
2880 }
2881
2882 static void __net_exit ip6_route_net_exit(struct net *net)
2883 {
2884 #ifdef CONFIG_PROC_FS
2885         proc_net_remove(net, "ipv6_route");
2886         proc_net_remove(net, "rt6_stats");
2887 #endif
2888         kfree(net->ipv6.ip6_null_entry);
2889 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2890         kfree(net->ipv6.ip6_prohibit_entry);
2891         kfree(net->ipv6.ip6_blk_hole_entry);
2892 #endif
2893         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2894 }
2895
2896 static struct pernet_operations ip6_route_net_ops = {
2897         .init = ip6_route_net_init,
2898         .exit = ip6_route_net_exit,
2899 };
2900
2901 static struct notifier_block ip6_route_dev_notifier = {
2902         .notifier_call = ip6_route_dev_notify,
2903         .priority = 0,
2904 };
2905
2906 int __init ip6_route_init(void)
2907 {
2908         int ret;
2909
2910         ret = -ENOMEM;
2911         ip6_dst_ops_template.kmem_cachep =
2912                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2913                                   SLAB_HWCACHE_ALIGN, NULL);
2914         if (!ip6_dst_ops_template.kmem_cachep)
2915                 goto out;
2916
2917         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2918         if (ret)
2919                 goto out_kmem_cache;
2920
2921         ret = register_pernet_subsys(&ip6_route_net_ops);
2922         if (ret)
2923                 goto out_dst_entries;
2924
2925         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2926
2927         /* Registering of the loopback is done before this portion of code,
2928          * the loopback reference in rt6_info will not be taken, do it
2929          * manually for init_net */
2930         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2931         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2932   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2933         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2934         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2935         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2936         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2937   #endif
2938         ret = fib6_init();
2939         if (ret)
2940                 goto out_register_subsys;
2941
2942         ret = xfrm6_init();
2943         if (ret)
2944                 goto out_fib6_init;
2945
2946         ret = fib6_rules_init();
2947         if (ret)
2948                 goto xfrm6_init;
2949
2950         ret = -ENOBUFS;
2951         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2952             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2953             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2954                 goto fib6_rules_init;
2955
2956         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2957         if (ret)
2958                 goto fib6_rules_init;
2959
2960 out:
2961         return ret;
2962
2963 fib6_rules_init:
2964         fib6_rules_cleanup();
2965 xfrm6_init:
2966         xfrm6_fini();
2967 out_fib6_init:
2968         fib6_gc_cleanup();
2969 out_register_subsys:
2970         unregister_pernet_subsys(&ip6_route_net_ops);
2971 out_dst_entries:
2972         dst_entries_destroy(&ip6_dst_blackhole_ops);
2973 out_kmem_cache:
2974         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2975         goto out;
2976 }
2977
2978 void ip6_route_cleanup(void)
2979 {
2980         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2981         fib6_rules_cleanup();
2982         xfrm6_fini();
2983         fib6_gc_cleanup();
2984         unregister_pernet_subsys(&ip6_route_net_ops);
2985         dst_entries_destroy(&ip6_dst_blackhole_ops);
2986         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2987 }