ipv6: release reference of ip6_null_entry's dst entry in __ip6_del_rt
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG()
 * and RT6_TRACE() expand to printk() calls; below that both expand to
 * nothing that generates code.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
75
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77                                     const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int      ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void             ip6_dst_destroy(struct dst_entry *);
83 static void             ip6_dst_ifdown(struct dst_entry *,
84                                        struct net_device *dev, int how);
85 static int               ip6_dst_gc(struct dst_ops *ops);
86
87 static int              ip6_pkt_discard(struct sk_buff *skb);
88 static int              ip6_pkt_discard_out(struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94                                            const struct in6_addr *prefix, int prefixlen,
95                                            const struct in6_addr *gwaddr, int ifindex,
96                                            unsigned pref);
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98                                            const struct in6_addr *prefix, int prefixlen,
99                                            const struct in6_addr *gwaddr, int ifindex);
100 #endif
101
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 {
104         struct rt6_info *rt = (struct rt6_info *) dst;
105         struct inet_peer *peer;
106         u32 *p = NULL;
107
108         if (!(rt->dst.flags & DST_HOST))
109                 return NULL;
110
111         if (!rt->rt6i_peer)
112                 rt6_bind_peer(rt, 1);
113
114         peer = rt->rt6i_peer;
115         if (peer) {
116                 u32 *old_p = __DST_METRICS_PTR(old);
117                 unsigned long prev, new;
118
119                 p = peer->metrics;
120                 if (inet_metrics_new(peer))
121                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122
123                 new = (unsigned long) p;
124                 prev = cmpxchg(&dst->_metrics, old, new);
125
126                 if (prev != old) {
127                         p = __DST_METRICS_PTR(prev);
128                         if (prev & DST_METRICS_READ_ONLY)
129                                 p = NULL;
130                 }
131         }
132         return p;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
138 }
139
140 static struct dst_ops ip6_dst_ops_template = {
141         .family                 =       AF_INET6,
142         .protocol               =       cpu_to_be16(ETH_P_IPV6),
143         .gc                     =       ip6_dst_gc,
144         .gc_thresh              =       1024,
145         .check                  =       ip6_dst_check,
146         .default_advmss         =       ip6_default_advmss,
147         .mtu                    =       ip6_mtu,
148         .cow_metrics            =       ipv6_cow_metrics,
149         .destroy                =       ip6_dst_destroy,
150         .ifdown                 =       ip6_dst_ifdown,
151         .negative_advice        =       ip6_negative_advice,
152         .link_failure           =       ip6_link_failure,
153         .update_pmtu            =       ip6_rt_update_pmtu,
154         .local_out              =       __ip6_local_out,
155         .neigh_lookup           =       ip6_neigh_lookup,
156 };
157
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159 {
160         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
161
162         return mtu ? : dst->dev->mtu;
163 }
164
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 {
167 }
168
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
170                                          unsigned long old)
171 {
172         return NULL;
173 }
174
175 static struct dst_ops ip6_dst_blackhole_ops = {
176         .family                 =       AF_INET6,
177         .protocol               =       cpu_to_be16(ETH_P_IPV6),
178         .destroy                =       ip6_dst_destroy,
179         .check                  =       ip6_dst_check,
180         .mtu                    =       ip6_blackhole_mtu,
181         .default_advmss         =       ip6_default_advmss,
182         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
183         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
184         .neigh_lookup           =       ip6_neigh_lookup,
185 };
186
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188         [RTAX_HOPLIMIT - 1] = 255,
189 };
190
191 static struct rt6_info ip6_null_entry_template = {
192         .dst = {
193                 .__refcnt       = ATOMIC_INIT(1),
194                 .__use          = 1,
195                 .obsolete       = -1,
196                 .error          = -ENETUNREACH,
197                 .input          = ip6_pkt_discard,
198                 .output         = ip6_pkt_discard_out,
199         },
200         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
201         .rt6i_protocol  = RTPROT_KERNEL,
202         .rt6i_metric    = ~(u32) 0,
203         .rt6i_ref       = ATOMIC_INIT(1),
204 };
205
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" reject route: dst.error is -EACCES and
 * packets go to the ip6_pkt_prohibit handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
        .dst = {
                .__refcnt = ATOMIC_INIT(1),
                .__use    = 1,
                .obsolete = -1,
                .error    = -EACCES,
                .input    = ip6_pkt_prohibit,
                .output   = ip6_pkt_prohibit_out,
        },
};

/*
 * Template for the "blackhole" reject route: dst.error is -EINVAL and
 * packets in both directions go to dst_discard.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
        .dst = {
                .__refcnt = ATOMIC_INIT(1),
                .__use    = 1,
                .obsolete = -1,
                .error    = -EINVAL,
                .input    = dst_discard,
                .output   = dst_discard,
        },
};

#endif
242
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245                                              struct net_device *dev,
246                                              int flags)
247 {
248         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249
250         if (rt != NULL)
251                 memset(&rt->rt6i_table, 0,
252                         sizeof(*rt) - sizeof(struct dst_entry));
253
254         return rt;
255 }
256
257 static void ip6_dst_destroy(struct dst_entry *dst)
258 {
259         struct rt6_info *rt = (struct rt6_info *)dst;
260         struct inet6_dev *idev = rt->rt6i_idev;
261         struct inet_peer *peer = rt->rt6i_peer;
262
263         if (!(rt->dst.flags & DST_HOST))
264                 dst_destroy_metrics_generic(dst);
265
266         if (idev != NULL) {
267                 rt->rt6i_idev = NULL;
268                 in6_dev_put(idev);
269         }
270         if (peer) {
271                 rt->rt6i_peer = NULL;
272                 inet_putpeer(peer);
273         }
274 }
275
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
277
278 static u32 rt6_peer_genid(void)
279 {
280         return atomic_read(&__rt6_peer_genid);
281 }
282
283 void rt6_bind_peer(struct rt6_info *rt, int create)
284 {
285         struct inet_peer *peer;
286
287         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
289                 inet_putpeer(peer);
290         else
291                 rt->rt6i_peer_genid = rt6_peer_genid();
292 }
293
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
295                            int how)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299         struct net_device *loopback_dev =
300                 dev_net(dev)->loopback_dev;
301
302         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303                 struct inet6_dev *loopback_idev =
304                         in6_dev_get(loopback_dev);
305                 if (loopback_idev != NULL) {
306                         rt->rt6i_idev = loopback_idev;
307                         in6_dev_put(idev);
308                 }
309         }
310 }
311
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
313 {
314         return (rt->rt6i_flags & RTF_EXPIRES) &&
315                 time_after(jiffies, rt->rt6i_expires);
316 }
317
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
319 {
320         return ipv6_addr_type(daddr) &
321                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 }
323
324 /*
325  *      Route lookup. Any table->tb6_lock is implied.
326  */
327
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329                                                     struct rt6_info *rt,
330                                                     const struct in6_addr *saddr,
331                                                     int oif,
332                                                     int flags)
333 {
334         struct rt6_info *local = NULL;
335         struct rt6_info *sprt;
336
337         if (!oif && ipv6_addr_any(saddr))
338                 goto out;
339
340         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341                 struct net_device *dev = sprt->rt6i_dev;
342
343                 if (oif) {
344                         if (dev->ifindex == oif)
345                                 return sprt;
346                         if (dev->flags & IFF_LOOPBACK) {
347                                 if (sprt->rt6i_idev == NULL ||
348                                     sprt->rt6i_idev->dev->ifindex != oif) {
349                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
350                                                 continue;
351                                         if (local && (!oif ||
352                                                       local->rt6i_idev->dev->ifindex == oif))
353                                                 continue;
354                                 }
355                                 local = sprt;
356                         }
357                 } else {
358                         if (ipv6_chk_addr(net, saddr, dev,
359                                           flags & RT6_LOOKUP_F_IFACE))
360                                 return sprt;
361                 }
362         }
363
364         if (oif) {
365                 if (local)
366                         return local;
367
368                 if (flags & RT6_LOOKUP_F_IFACE)
369                         return net->ipv6.ip6_null_entry;
370         }
371 out:
372         return rt;
373 }
374
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - Router Reachability Probing for @rt's neighbour.
 * @rt: route to probe; may be NULL, in which case nothing happens
 *
 * If the route's neighbour entry is not in a NUD_VALID state and more
 * than rtr_probe_interval has passed since neigh->updated, a Neighbour
 * Solicitation is sent to the neighbour's solicited-node multicast
 * address.
 *
 * Router Reachability Probes MUST be rate-limited.  The rate-limit
 * check and the store to neigh->updated are therefore done under the
 * write side of neigh->lock.  With only the read side held, concurrent
 * callers could all pass the check and each send a probe, and
 * neigh->updated would be modified without exclusion.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;
        struct in6_addr mcaddr;
        struct in6_addr *target;
        int send_probe = 0;

        rcu_read_lock();
        neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
        /* Cheap unlocked pre-check; repeated under the lock below. */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                goto out;

        write_lock_bh(&neigh->lock);
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                neigh->updated = jiffies;
                send_probe = 1;
        }
        write_unlock_bh(&neigh->lock);

        /* Send outside the lock, as before. */
        if (send_probe) {
                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
        }
out:
        rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
414
415 /*
416  * Default Router Selection (RFC 2461 6.3.6)
417  */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 {
420         struct net_device *dev = rt->rt6i_dev;
421         if (!oif || dev->ifindex == oif)
422                 return 2;
423         if ((dev->flags & IFF_LOOPBACK) &&
424             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425                 return 1;
426         return 0;
427 }
428
429 static inline int rt6_check_neigh(struct rt6_info *rt)
430 {
431         struct neighbour *neigh;
432         int m;
433
434         rcu_read_lock();
435         neigh = dst_get_neighbour(&rt->dst);
436         if (rt->rt6i_flags & RTF_NONEXTHOP ||
437             !(rt->rt6i_flags & RTF_GATEWAY))
438                 m = 1;
439         else if (neigh) {
440                 read_lock_bh(&neigh->lock);
441                 if (neigh->nud_state & NUD_VALID)
442                         m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444                 else if (neigh->nud_state & NUD_FAILED)
445                         m = 0;
446 #endif
447                 else
448                         m = 1;
449                 read_unlock_bh(&neigh->lock);
450         } else
451                 m = 0;
452         rcu_read_unlock();
453         return m;
454 }
455
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457                            int strict)
458 {
459         int m, n;
460
461         m = rt6_check_dev(rt, oif);
462         if (!m && (strict & RT6_LOOKUP_F_IFACE))
463                 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467         n = rt6_check_neigh(rt);
468         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469                 return -1;
470         return m;
471 }
472
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474                                    int *mpri, struct rt6_info *match)
475 {
476         int m;
477
478         if (rt6_check_expired(rt))
479                 goto out;
480
481         m = rt6_score_route(rt, oif, strict);
482         if (m < 0)
483                 goto out;
484
485         if (m > *mpri) {
486                 if (strict & RT6_LOOKUP_F_REACHABLE)
487                         rt6_probe(match);
488                 *mpri = m;
489                 match = rt;
490         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491                 rt6_probe(rt);
492         }
493
494 out:
495         return match;
496 }
497
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499                                      struct rt6_info *rr_head,
500                                      u32 metric, int oif, int strict)
501 {
502         struct rt6_info *rt, *match;
503         int mpri = -1;
504
505         match = NULL;
506         for (rt = rr_head; rt && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510              rt = rt->dst.rt6_next)
511                 match = find_match(rt, oif, strict, &mpri, match);
512
513         return match;
514 }
515
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
517 {
518         struct rt6_info *match, *rt0;
519         struct net *net;
520
521         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522                   __func__, fn->leaf, oif);
523
524         rt0 = fn->rr_ptr;
525         if (!rt0)
526                 fn->rr_ptr = rt0 = fn->leaf;
527
528         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
529
530         if (!match &&
531             (strict & RT6_LOOKUP_F_REACHABLE)) {
532                 struct rt6_info *next = rt0->dst.rt6_next;
533
534                 /* no entries matched; do round-robin */
535                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536                         next = fn->leaf;
537
538                 if (next != rt0)
539                         fn->rr_ptr = next;
540         }
541
542         RT6_TRACE("%s() => %p\n",
543                   __func__, match);
544
545         net = dev_net(rt0->rt6i_dev);
546         return match ? match : net->ipv6.ip6_null_entry;
547 }
548
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a received route information option.
 * @dev:    device the option arrived on
 * @opt:    option data, interpreted as struct route_info
 * @len:    length of @opt in bytes
 * @gwaddr: address of the advertising router
 *
 * After validation the matching route is looked up.  A zero lifetime
 * deletes an existing route.  A non-zero lifetime adds a missing route
 * or refreshes the preference bits of an existing one, then sets or
 * clears its expiry.
 *
 * Returns 0, or -EINVAL for a malformed option.  A failed route
 * addition is not reported.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /*
         * Sanity check for prefix_len and length: longer prefixes need
         * a larger option.
         */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3) {
                prefix = (struct in6_addr *)rinfo->prefix;
        } else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        if (rt) {
                if (!lifetime) {
                        /* zero lifetime withdraws the route */
                        ip6_del_rt(rt);
                        return 0;
                }
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
        } else {
                if (!lifetime)
                        return 0;
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                        dev->ifindex, pref);
                if (!rt)
                        return 0;
        }

        if (addrconf_finite_timeout(lifetime)) {
                rt->rt6i_expires = jiffies + HZ * lifetime;
                rt->rt6i_flags |= RTF_EXPIRES;
        } else {
                rt->rt6i_flags &= ~RTF_EXPIRES;
        }
        dst_release(&rt->dst);

        return 0;
}
#endif
622
/*
 * BACKTRACK(__net, saddr) - retry a failed lookup higher up the tree.
 *
 * Not self-contained: it reads and writes the caller's locals `rt` and
 * `fn`, and jumps to the caller's labels `out` and `restart`.
 *
 * Does nothing unless `rt` is __net's ip6_null_entry.  Otherwise it
 * walks upward from `fn`:
 *   - a node flagged RTN_TL_ROOT ends the walk with `goto out`;
 *   - otherwise `fn` becomes its parent, or, when the parent has a
 *     subtree other than `fn`, the result of looking up saddr there;
 *   - the first node flagged RTN_RTINFO ends the walk with
 *     `goto restart`.
 */
623 #define BACKTRACK(__net, saddr)                 \
624 do { \
625         if (rt == __net->ipv6.ip6_null_entry) { \
626                 struct fib6_node *pn; \
627                 while (1) { \
628                         if (fn->fn_flags & RTN_TL_ROOT) \
629                                 goto out; \
630                         pn = fn->parent; \
631                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
632                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
633                         else \
634                                 fn = pn; \
635                         if (fn->fn_flags & RTN_RTINFO) \
636                                 goto restart; \
637                 } \
638         } \
639 } while(0)
640
/*
 * ip6_pol_route_lookup - look up fl6 in one table; the lookup callback
 * rt6_lookup() passes to fib6_rule_lookup().
 *
 * Runs under the read side of table->tb6_lock.  The chosen route, which
 * may be the null entry, is passed through dst_use() before the lock
 * is dropped and is then returned.
 */
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642                                              struct fib6_table *table,
643                                              struct flowi6 *fl6, int flags)
644 {
645         struct fib6_node *fn;
646         struct rt6_info *rt;
647
648         read_lock_bh(&table->tb6_lock);
649         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        /* BACKTRACK() jumps back here after moving fn up the tree. */
650 restart:
651         rt = fn->leaf;
652         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
        /* Uses the locals rt and fn and the labels restart and out. */
653         BACKTRACK(net, &fl6->saddr);
654 out:
655         dst_use(&rt->dst, jiffies);
656         read_unlock_bh(&table->tb6_lock);
657         return rt;
658
659 }
660
661 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
662                             const struct in6_addr *saddr, int oif, int strict)
663 {
664         struct flowi6 fl6 = {
665                 .flowi6_oif = oif,
666                 .daddr = *daddr,
667         };
668         struct dst_entry *dst;
669         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
670
671         if (saddr) {
672                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
673                 flags |= RT6_LOOKUP_F_HAS_SADDR;
674         }
675
676         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
677         if (dst->error == 0)
678                 return (struct rt6_info *) dst;
679
680         dst_release(dst);
681
682         return NULL;
683 }
684
685 EXPORT_SYMBOL(rt6_lookup);
686
687 /* ip6_ins_rt is called with table->tb6_lock NOT held.
688    It takes a new route entry; if the addition fails for any reason,
689    the route is freed. In either case, unless the caller holds its own
690    reference, the route may be destroyed.
691  */
692
693 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
694 {
695         int err;
696         struct fib6_table *table;
697
698         table = rt->rt6i_table;
699         write_lock_bh(&table->tb6_lock);
700         err = fib6_add(&table->tb6_root, rt, info);
701         write_unlock_bh(&table->tb6_lock);
702
703         return err;
704 }
705
706 int ip6_ins_rt(struct rt6_info *rt)
707 {
708         struct nl_info info = {
709                 .nl_net = dev_net(rt->rt6i_dev),
710         };
711         return __ip6_ins_rt(rt, &info);
712 }
713
714 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
715                                       const struct in6_addr *daddr,
716                                       const struct in6_addr *saddr)
717 {
718         struct rt6_info *rt;
719
720         /*
721          *      Clone the route.
722          */
723
724         rt = ip6_rt_copy(ort, daddr);
725
726         if (rt) {
727                 struct neighbour *neigh;
728                 int attempts = !in_softirq();
729
730                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
731                         if (ort->rt6i_dst.plen != 128 &&
732                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
733                                 rt->rt6i_flags |= RTF_ANYCAST;
734                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
735                 }
736
737                 rt->rt6i_flags |= RTF_CACHE;
738
739 #ifdef CONFIG_IPV6_SUBTREES
740                 if (rt->rt6i_src.plen && saddr) {
741                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
742                         rt->rt6i_src.plen = 128;
743                 }
744 #endif
745
746         retry:
747                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
748                 if (IS_ERR(neigh)) {
749                         struct net *net = dev_net(rt->rt6i_dev);
750                         int saved_rt_min_interval =
751                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
752                         int saved_rt_elasticity =
753                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
754
755                         if (attempts-- > 0) {
756                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
757                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
758
759                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
760
761                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
762                                         saved_rt_elasticity;
763                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
764                                         saved_rt_min_interval;
765                                 goto retry;
766                         }
767
768                         if (net_ratelimit())
769                                 printk(KERN_WARNING
770                                        "ipv6: Neighbour table overflow.\n");
771                         dst_free(&rt->dst);
772                         return NULL;
773                 }
774                 dst_set_neighbour(&rt->dst, neigh);
775
776         }
777
778         return rt;
779 }
780
781 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
782                                         const struct in6_addr *daddr)
783 {
784         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
785
786         if (rt) {
787                 rt->rt6i_flags |= RTF_CACHE;
788                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
789         }
790         return rt;
791 }
792
/*
 * ip6_pol_route - select a route for fl6 in one table, creating a
 * cached clone of it when needed.
 * @net:   namespace
 * @table: table to search
 * @oif:   interface index to match (callers pass the flow's iif or oif)
 * @fl6:   flow being routed
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is used here
 *
 * Unless forwarding is enabled, the first pass asks rt6_select() for
 * reachable routers only.  If that ends at `out`, the lookup is
 * repeated once without that restriction.
 *
 * A selected route that is neither the null entry nor already flagged
 * RTF_CACHE is copied with rt6_alloc_cow() or rt6_alloc_clone() and
 * the copy is inserted into the table.  Routes flagged DST_HOST that
 * do not take the cow path are returned as they are.  If the insert
 * fails, the whole lookup is repeated while `attempts` lasts.
 *
 * Every return path has taken a dst_hold() on the returned entry, which
 * may be the namespace's null entry.
 */
793 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
794                                       struct flowi6 *fl6, int flags)
795 {
796         struct fib6_node *fn;
797         struct rt6_info *rt, *nrt;
798         int strict = 0;
799         int attempts = 3;
800         int err;
            /* Reachability is only demanded when forwarding is off. */
801         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
802
803         strict |= flags & RT6_LOOKUP_F_IFACE;
804
805 relookup:
806         read_lock_bh(&table->tb6_lock);
807
            /* Re-entered from `out` (lock still held) once `reachable` is cleared. */
808 restart_2:
809         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
810
            /* BACKTRACK() jumps back here after moving fn up the tree. */
811 restart:
812         rt = rt6_select(fn, oif, strict | reachable);
813
            /* Uses the locals rt and fn and the labels restart and out. */
814         BACKTRACK(net, &fl6->saddr);
815         if (rt == net->ipv6.ip6_null_entry ||
816             rt->rt6i_flags & RTF_CACHE)
817                 goto out;
818
            /* Pin rt before the lock is dropped for the allocation below. */
819         dst_hold(&rt->dst);
820         read_unlock_bh(&table->tb6_lock);
821
822         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
823                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
824         else if (!(rt->dst.flags & DST_HOST))
825                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
826         else
827                 goto out2;
828
            /* Swap the pinned original for the copy, or the null entry if allocation failed. */
829         dst_release(&rt->dst);
830         rt = nrt ? : net->ipv6.ip6_null_entry;
831
832         dst_hold(&rt->dst);
833         if (nrt) {
834                 err = ip6_ins_rt(nrt);
835                 if (!err)
836                         goto out2;
837         }
838
            /* Out of retries: return rt with the reference taken above. */
839         if (--attempts <= 0)
840                 goto out2;
841
842         /*
843          * Race condition! In the gap, when table->tb6_lock was
844          * released someone could insert this route.  Relookup.
845          */
846         dst_release(&rt->dst);
847         goto relookup;
848
849 out:
            /* First pass asked for reachable routers only; retry once without. */
850         if (reachable) {
851                 reachable = 0;
852                 goto restart_2;
853         }
854         dst_hold(&rt->dst);
855         read_unlock_bh(&table->tb6_lock);
856 out2:
857         rt->dst.lastuse = jiffies;
858         rt->dst.__use++;
859
860         return rt;
861 }
862
863 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
864                                             struct flowi6 *fl6, int flags)
865 {
866         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
867 }
868
869 void ip6_route_input(struct sk_buff *skb)
870 {
871         const struct ipv6hdr *iph = ipv6_hdr(skb);
872         struct net *net = dev_net(skb->dev);
873         int flags = RT6_LOOKUP_F_HAS_SADDR;
874         struct flowi6 fl6 = {
875                 .flowi6_iif = skb->dev->ifindex,
876                 .daddr = iph->daddr,
877                 .saddr = iph->saddr,
878                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
879                 .flowi6_mark = skb->mark,
880                 .flowi6_proto = iph->nexthdr,
881         };
882
883         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
884                 flags |= RT6_LOOKUP_F_IFACE;
885
886         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
887 }
888
889 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
890                                              struct flowi6 *fl6, int flags)
891 {
892         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
893 }
894
895 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
896                                     struct flowi6 *fl6)
897 {
898         int flags = 0;
899
900         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
901                 flags |= RT6_LOOKUP_F_IFACE;
902
903         if (!ipv6_addr_any(&fl6->saddr))
904                 flags |= RT6_LOOKUP_F_HAS_SADDR;
905         else if (sk)
906                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
907
908         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
909 }
910
911 EXPORT_SYMBOL(ip6_route_output);
912
913 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
914 {
915         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
916         struct dst_entry *new = NULL;
917
918         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
919         if (rt) {
920                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
921
922                 new = &rt->dst;
923
924                 new->__use = 1;
925                 new->input = dst_discard;
926                 new->output = dst_discard;
927
928                 if (dst_metrics_read_only(&ort->dst))
929                         new->_metrics = ort->dst._metrics;
930                 else
931                         dst_copy_metrics(new, &ort->dst);
932                 rt->rt6i_idev = ort->rt6i_idev;
933                 if (rt->rt6i_idev)
934                         in6_dev_hold(rt->rt6i_idev);
935                 rt->rt6i_expires = 0;
936
937                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
938                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
939                 rt->rt6i_metric = 0;
940
941                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
942 #ifdef CONFIG_IPV6_SUBTREES
943                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
944 #endif
945
946                 dst_free(new);
947         }
948
949         dst_release(dst_orig);
950         return new ? new : ERR_PTR(-ENOMEM);
951 }
952
953 /*
954  *      Destination cache support functions
955  */
956
957 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
958 {
959         struct rt6_info *rt;
960
961         rt = (struct rt6_info *) dst;
962
963         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
964                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
965                         if (!rt->rt6i_peer)
966                                 rt6_bind_peer(rt, 0);
967                         rt->rt6i_peer_genid = rt6_peer_genid();
968                 }
969                 return dst;
970         }
971         return NULL;
972 }
973
974 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
975 {
976         struct rt6_info *rt = (struct rt6_info *) dst;
977
978         if (rt) {
979                 if (rt->rt6i_flags & RTF_CACHE) {
980                         if (rt6_check_expired(rt)) {
981                                 ip6_del_rt(rt);
982                                 dst = NULL;
983                         }
984                 } else {
985                         dst_release(dst);
986                         dst = NULL;
987                 }
988         }
989         return dst;
990 }
991
992 static void ip6_link_failure(struct sk_buff *skb)
993 {
994         struct rt6_info *rt;
995
996         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
997
998         rt = (struct rt6_info *) skb_dst(skb);
999         if (rt) {
1000                 if (rt->rt6i_flags&RTF_CACHE) {
1001                         dst_set_expires(&rt->dst, 0);
1002                         rt->rt6i_flags |= RTF_EXPIRES;
1003                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1004                         rt->rt6i_node->fn_sernum = -1;
1005         }
1006 }
1007
1008 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1009 {
1010         struct rt6_info *rt6 = (struct rt6_info*)dst;
1011
1012         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1013                 rt6->rt6i_flags |= RTF_MODIFIED;
1014                 if (mtu < IPV6_MIN_MTU) {
1015                         u32 features = dst_metric(dst, RTAX_FEATURES);
1016                         mtu = IPV6_MIN_MTU;
1017                         features |= RTAX_FEATURE_ALLFRAG;
1018                         dst_metric_set(dst, RTAX_FEATURES, features);
1019                 }
1020                 dst_metric_set(dst, RTAX_MTU, mtu);
1021         }
1022 }
1023
1024 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1025 {
1026         struct net_device *dev = dst->dev;
1027         unsigned int mtu = dst_mtu(dst);
1028         struct net *net = dev_net(dev);
1029
1030         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1031
1032         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1033                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1034
1035         /*
1036          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1037          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1038          * IPV6_MAXPLEN is also valid and means: "any MSS,
1039          * rely only on pmtu discovery"
1040          */
1041         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1042                 mtu = IPV6_MAXPLEN;
1043         return mtu;
1044 }
1045
1046 static unsigned int ip6_mtu(const struct dst_entry *dst)
1047 {
1048         struct inet6_dev *idev;
1049         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1050
1051         if (mtu)
1052                 return mtu;
1053
1054         mtu = IPV6_MIN_MTU;
1055
1056         rcu_read_lock();
1057         idev = __in6_dev_get(dst->dev);
1058         if (idev)
1059                 mtu = idev->cnf.mtu6;
1060         rcu_read_unlock();
1061
1062         return mtu;
1063 }
1064
/*
 * Singly linked list (chained through dst.next) of the entries created by
 * icmp6_dst_alloc().  icmp6_dst_gc() and icmp6_clean_all() unlink and free
 * entries from it.
 */
static struct dst_entry *icmp6_dst_gc_list;
/* Taken (BH-disabling) around every traversal or update of the list above. */
static DEFINE_SPINLOCK(icmp6_dst_lock);
1067
1068 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1069                                   struct neighbour *neigh,
1070                                   const struct in6_addr *addr)
1071 {
1072         struct rt6_info *rt;
1073         struct inet6_dev *idev = in6_dev_get(dev);
1074         struct net *net = dev_net(dev);
1075
1076         if (unlikely(idev == NULL))
1077                 return NULL;
1078
1079         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1080         if (unlikely(rt == NULL)) {
1081                 in6_dev_put(idev);
1082                 goto out;
1083         }
1084
1085         if (neigh)
1086                 neigh_hold(neigh);
1087         else {
1088                 neigh = ndisc_get_neigh(dev, addr);
1089                 if (IS_ERR(neigh))
1090                         neigh = NULL;
1091         }
1092
1093         rt->dst.flags |= DST_HOST;
1094         rt->dst.output  = ip6_output;
1095         dst_set_neighbour(&rt->dst, neigh);
1096         atomic_set(&rt->dst.__refcnt, 1);
1097         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1098         rt->rt6i_dst.plen = 128;
1099         rt->rt6i_idev     = idev;
1100         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1101
1102         spin_lock_bh(&icmp6_dst_lock);
1103         rt->dst.next = icmp6_dst_gc_list;
1104         icmp6_dst_gc_list = &rt->dst;
1105         spin_unlock_bh(&icmp6_dst_lock);
1106
1107         fib6_force_start_gc(net);
1108
1109 out:
1110         return &rt->dst;
1111 }
1112
1113 int icmp6_dst_gc(void)
1114 {
1115         struct dst_entry *dst, **pprev;
1116         int more = 0;
1117
1118         spin_lock_bh(&icmp6_dst_lock);
1119         pprev = &icmp6_dst_gc_list;
1120
1121         while ((dst = *pprev) != NULL) {
1122                 if (!atomic_read(&dst->__refcnt)) {
1123                         *pprev = dst->next;
1124                         dst_free(dst);
1125                 } else {
1126                         pprev = &dst->next;
1127                         ++more;
1128                 }
1129         }
1130
1131         spin_unlock_bh(&icmp6_dst_lock);
1132
1133         return more;
1134 }
1135
1136 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1137                             void *arg)
1138 {
1139         struct dst_entry *dst, **pprev;
1140
1141         spin_lock_bh(&icmp6_dst_lock);
1142         pprev = &icmp6_dst_gc_list;
1143         while ((dst = *pprev) != NULL) {
1144                 struct rt6_info *rt = (struct rt6_info *) dst;
1145                 if (func(rt, arg)) {
1146                         *pprev = dst->next;
1147                         dst_free(dst);
1148                 } else {
1149                         pprev = &dst->next;
1150                 }
1151         }
1152         spin_unlock_bh(&icmp6_dst_lock);
1153 }
1154
1155 static int ip6_dst_gc(struct dst_ops *ops)
1156 {
1157         unsigned long now = jiffies;
1158         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1159         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1160         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1161         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1162         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1163         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1164         int entries;
1165
1166         entries = dst_entries_get_fast(ops);
1167         if (time_after(rt_last_gc + rt_min_interval, now) &&
1168             entries <= rt_max_size)
1169                 goto out;
1170
1171         net->ipv6.ip6_rt_gc_expire++;
1172         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1173         net->ipv6.ip6_rt_last_gc = now;
1174         entries = dst_entries_get_slow(ops);
1175         if (entries < ops->gc_thresh)
1176                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1177 out:
1178         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1179         return entries > rt_max_size;
1180 }
1181
1182 /* Clean host part of a prefix. Not necessary in radix tree,
1183    but results in cleaner routing tables.
1184
1185    Remove it only when all the things will work!
1186  */
1187
1188 int ip6_dst_hoplimit(struct dst_entry *dst)
1189 {
1190         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1191         if (hoplimit == 0) {
1192                 struct net_device *dev = dst->dev;
1193                 struct inet6_dev *idev;
1194
1195                 rcu_read_lock();
1196                 idev = __in6_dev_get(dev);
1197                 if (idev)
1198                         hoplimit = idev->cnf.hop_limit;
1199                 else
1200                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1201                 rcu_read_unlock();
1202         }
1203         return hoplimit;
1204 }
1205 EXPORT_SYMBOL(ip6_dst_hoplimit);
1206
1207 /*
1208  *
1209  */
1210
/*
 * Create a route from @cfg and insert it into the FIB.
 *
 * Validation and defaults:
 *  - prefix lengths above 128 are rejected with -EINVAL; without
 *    CONFIG_IPV6_SUBTREES any source prefix length is rejected too;
 *  - a zero metric becomes IP6_RT_PRIO_USER and an unspecified protocol
 *    becomes RTPROT_BOOT (both are written back into @cfg);
 *  - if cfg->fc_ifindex is set, the device and its inet6_dev must exist
 *    (-ENODEV otherwise) and references are held on both.
 *
 * Reject routes (RTF_REJECT, or a non-local route through a loopback
 * device to a non-loopback destination) are bound to the namespace's
 * loopback device and get discard handlers.
 *
 * On success the device/inet6_dev references are handed over to the route
 * and the result of __ip6_ins_rt() is returned.  On failure the references
 * are dropped, the route (if already allocated) is freed and a negative
 * errno is returned.
 */
int ip6_route_add(struct fib6_config *cfg)
{
        int err;
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct rt6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;

        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
                return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
        if (cfg->fc_src_len)
                return -EINVAL;
#endif
        if (cfg->fc_ifindex) {
                /* err is preset so both lookups below can simply bail out */
                err = -ENODEV;
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_metric == 0)
                cfg->fc_metric = IP6_RT_PRIO_USER;

        table = fib6_new_table(net, cfg->fc_table);
        if (table == NULL) {
                err = -ENOBUFS;
                goto out;
        }

        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);

        if (rt == NULL) {
                err = -ENOMEM;
                goto out;
        }

        rt->dst.obsolete = -1;
        /* fc_expires is in clock_t units; 0 is stored when RTF_EXPIRES is unset */
        rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
                                jiffies + clock_t_to_jiffies(cfg->fc_expires) :
                                0;

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->rt6i_protocol = cfg->fc_protocol;

        addr_type = ipv6_addr_type(&cfg->fc_dst);

        /* Pick the input handler: multicast, local delivery or forwarding. */
        if (addr_type & IPV6_ADDR_MULTICAST)
                rt->dst.input = ip6_mc_input;
        else if (cfg->fc_flags & RTF_LOCAL)
                rt->dst.input = ip6_input;
        else
                rt->dst.input = ip6_forward;

        rt->dst.output = ip6_output;

        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
        if (rt->rt6i_dst.plen == 128)
               rt->dst.flags |= DST_HOST;

        /*
         * Non-host routes that come with metric attributes get a private,
         * zeroed metrics array here; the values are filled in at
         * install_route below.
         */
        if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
                u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
                if (!metrics) {
                        err = -ENOMEM;
                        goto out;
                }
                dst_init_metrics(&rt->dst, metrics, 0);
        }
#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->rt6i_src.plen = cfg->fc_src_len;
#endif

        rt->rt6i_metric = cfg->fc_metric;

        /* We cannot add true routes via loopback here,
           they would result in kernel looping; promote them to reject routes
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
                                              && !(cfg->fc_flags&RTF_LOCAL))) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        /* swap any references held so far for loopback ones */
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                rt->dst.output = ip6_pkt_discard_out;
                rt->dst.input = ip6_pkt_discard;
                rt->dst.error = -ENETUNREACH;
                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                goto install_route;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                const struct in6_addr *gw_addr;
                int gwa_type;

                gw_addr = &cfg->fc_gateway;
                ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
                gwa_type = ipv6_addr_type(gw_addr);

                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
                        struct rt6_info *grt;

                        /* IPv6 strictly inhibits using not link-local
                           addresses as nexthop address.
                           Otherwise, router will not able to send redirects.
                           It is very good, but in some (rare!) circumstances
                           (SIT, PtP, NBMA NOARP links) it is handy to allow
                           some exceptions. --ANK
                         */
                        err = -EINVAL;
                        if (!(gwa_type&IPV6_ADDR_UNICAST))
                                goto out;

                        /* The gateway itself must be reachable via an existing route. */
                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

                        err = -EHOSTUNREACH;
                        if (grt == NULL)
                                goto out;
                        if (dev) {
                                /* an explicitly requested device must match the gateway's */
                                if (dev != grt->rt6i_dev) {
                                        dst_release(&grt->dst);
                                        goto out;
                                }
                        } else {
                                /* no device given: adopt the gateway route's dev/idev */
                                dev = grt->rt6i_dev;
                                idev = grt->rt6i_idev;
                                dev_hold(dev);
                                in6_dev_hold(grt->rt6i_idev);
                        }
                        /* accepted only if the gateway's own route is not gatewayed */
                        if (!(grt->rt6i_flags&RTF_GATEWAY))
                                err = 0;
                        dst_release(&grt->dst);

                        if (err)
                                goto out;
                }
                err = -EINVAL;
                if (dev == NULL || (dev->flags&IFF_LOOPBACK))
                        goto out;
        }

        err = -ENODEV;
        if (dev == NULL)
                goto out;

        /* A preferred source address is only accepted if ipv6_chk_addr() passes for dev. */
        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        err = -EINVAL;
                        goto out;
                }
                ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
                rt->rt6i_prefsrc.plen = 128;
        } else
                rt->rt6i_prefsrc.plen = 0;

        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
                struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
                if (IS_ERR(n)) {
                        err = PTR_ERR(n);
                        goto out;
                }
                dst_set_neighbour(&rt->dst, n);
        }

        rt->rt6i_flags = cfg->fc_flags;

install_route:
        if (cfg->fc_mx) {
                struct nlattr *nla;
                int remaining;

                /* Copy each metric attribute; type 0 is skipped, types above RTAX_MAX are rejected. */
                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
                        int type = nla_type(nla);

                        if (type) {
                                if (type > RTAX_MAX) {
                                        err = -EINVAL;
                                        goto out;
                                }

                                dst_metric_set(&rt->dst, type, nla_get_u32(nla));
                        }
                }
        }

        /* From here on the dev/idev references belong to the route. */
        rt->dst.dev = dev;
        rt->rt6i_idev = idev;
        rt->rt6i_table = table;

        cfg->fc_nlinfo.nl_net = dev_net(dev);

        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
        /* Error unwind: drop whatever references were taken and free the route. */
        if (dev)
                dev_put(dev);
        if (idev)
                in6_dev_put(idev);
        if (rt)
                dst_free(&rt->dst);
        return err;
}
1431
1432 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1433 {
1434         int err;
1435         struct fib6_table *table;
1436         struct net *net = dev_net(rt->rt6i_dev);
1437
1438         if (rt == net->ipv6.ip6_null_entry) {
1439                 err = -ENOENT;
1440                 goto out;
1441         }
1442
1443         table = rt->rt6i_table;
1444         write_lock_bh(&table->tb6_lock);
1445         err = fib6_del(rt, info);
1446         write_unlock_bh(&table->tb6_lock);
1447
1448 out:
1449         dst_release(&rt->dst);
1450         return err;
1451 }
1452
1453 int ip6_del_rt(struct rt6_info *rt)
1454 {
1455         struct nl_info info = {
1456                 .nl_net = dev_net(rt->rt6i_dev),
1457         };
1458         return __ip6_del_rt(rt, &info);
1459 }
1460
1461 static int ip6_route_del(struct fib6_config *cfg)
1462 {
1463         struct fib6_table *table;
1464         struct fib6_node *fn;
1465         struct rt6_info *rt;
1466         int err = -ESRCH;
1467
1468         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1469         if (table == NULL)
1470                 return err;
1471
1472         read_lock_bh(&table->tb6_lock);
1473
1474         fn = fib6_locate(&table->tb6_root,
1475                          &cfg->fc_dst, cfg->fc_dst_len,
1476                          &cfg->fc_src, cfg->fc_src_len);
1477
1478         if (fn) {
1479                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1480                         if (cfg->fc_ifindex &&
1481                             (rt->rt6i_dev == NULL ||
1482                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1483                                 continue;
1484                         if (cfg->fc_flags & RTF_GATEWAY &&
1485                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1486                                 continue;
1487                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1488                                 continue;
1489                         dst_hold(&rt->dst);
1490                         read_unlock_bh(&table->tb6_lock);
1491
1492                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1493                 }
1494         }
1495         read_unlock_bh(&table->tb6_lock);
1496
1497         return err;
1498 }
1499
1500 /*
1501  *      Handle redirects
1502  */
/*
 * Flow key for redirect lookups: a flowi6 extended with the address of the
 * gateway to match.  __ip6_route_redirect() casts the flowi6 pointer it is
 * given back to this structure, so fl6 has to remain the first member.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;
        struct in6_addr gateway;        /* gateway a candidate route must use */
};
1507
1508 static struct rt6_info *__ip6_route_redirect(struct net *net,
1509                                              struct fib6_table *table,
1510                                              struct flowi6 *fl6,
1511                                              int flags)
1512 {
1513         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1514         struct rt6_info *rt;
1515         struct fib6_node *fn;
1516
1517         /*
1518          * Get the "current" route for this destination and
1519          * check if the redirect has come from approriate router.
1520          *
1521          * RFC 2461 specifies that redirects should only be
1522          * accepted if they come from the nexthop to the target.
1523          * Due to the way the routes are chosen, this notion
1524          * is a bit fuzzy and one might need to check all possible
1525          * routes.
1526          */
1527
1528         read_lock_bh(&table->tb6_lock);
1529         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1530 restart:
1531         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1532                 /*
1533                  * Current route is on-link; redirect is always invalid.
1534                  *
1535                  * Seems, previous statement is not true. It could
1536                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1537                  * But then router serving it might decide, that we should
1538                  * know truth 8)8) --ANK (980726).
1539                  */
1540                 if (rt6_check_expired(rt))
1541                         continue;
1542                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1543                         continue;
1544                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1545                         continue;
1546                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1547                         continue;
1548                 break;
1549         }
1550
1551         if (!rt)
1552                 rt = net->ipv6.ip6_null_entry;
1553         BACKTRACK(net, &fl6->saddr);
1554 out:
1555         dst_hold(&rt->dst);
1556
1557         read_unlock_bh(&table->tb6_lock);
1558
1559         return rt;
1560 };
1561
1562 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1563                                            const struct in6_addr *src,
1564                                            const struct in6_addr *gateway,
1565                                            struct net_device *dev)
1566 {
1567         int flags = RT6_LOOKUP_F_HAS_SADDR;
1568         struct net *net = dev_net(dev);
1569         struct ip6rd_flowi rdfl = {
1570                 .fl6 = {
1571                         .flowi6_oif = dev->ifindex,
1572                         .daddr = *dest,
1573                         .saddr = *src,
1574                 },
1575         };
1576
1577         ipv6_addr_copy(&rdfl.gateway, gateway);
1578
1579         if (rt6_need_strict(dest))
1580                 flags |= RT6_LOOKUP_F_IFACE;
1581
1582         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1583                                                    flags, __ip6_route_redirect);
1584 }
1585
1586 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1587                   const struct in6_addr *saddr,
1588                   struct neighbour *neigh, u8 *lladdr, int on_link)
1589 {
1590         struct rt6_info *rt, *nrt = NULL;
1591         struct netevent_redirect netevent;
1592         struct net *net = dev_net(neigh->dev);
1593
1594         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1595
1596         if (rt == net->ipv6.ip6_null_entry) {
1597                 if (net_ratelimit())
1598                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1599                                "for redirect target\n");
1600                 goto out;
1601         }
1602
1603         /*
1604          *      We have finally decided to accept it.
1605          */
1606
1607         neigh_update(neigh, lladdr, NUD_STALE,
1608                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1609                      NEIGH_UPDATE_F_OVERRIDE|
1610                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1611                                      NEIGH_UPDATE_F_ISROUTER))
1612                      );
1613
1614         /*
1615          * Redirect received -> path was valid.
1616          * Look, redirects are sent only in response to data packets,
1617          * so that this nexthop apparently is reachable. --ANK
1618          */
1619         dst_confirm(&rt->dst);
1620
1621         /* Duplicate redirect: silently ignore. */
1622         if (neigh == dst_get_neighbour_raw(&rt->dst))
1623                 goto out;
1624
1625         nrt = ip6_rt_copy(rt, dest);
1626         if (nrt == NULL)
1627                 goto out;
1628
1629         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1630         if (on_link)
1631                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1632
1633         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1634         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1635
1636         if (ip6_ins_rt(nrt))
1637                 goto out;
1638
1639         netevent.old = &rt->dst;
1640         netevent.new = &nrt->dst;
1641         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1642
1643         if (rt->rt6i_flags&RTF_CACHE) {
1644                 ip6_del_rt(rt);
1645                 return;
1646         }
1647
1648 out:
1649         dst_release(&rt->dst);
1650 }
1651
1652 /*
1653  *      Handle ICMP "packet too big" messages
1654  *      i.e. Path MTU discovery
1655  */
1656
1657 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1658                              struct net *net, u32 pmtu, int ifindex)
1659 {
1660         struct rt6_info *rt, *nrt;
1661         int allfrag = 0;
1662 again:
1663         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1664         if (rt == NULL)
1665                 return;
1666
1667         if (rt6_check_expired(rt)) {
1668                 ip6_del_rt(rt);
1669                 goto again;
1670         }
1671
1672         if (pmtu >= dst_mtu(&rt->dst))
1673                 goto out;
1674
1675         if (pmtu < IPV6_MIN_MTU) {
1676                 /*
1677                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1678                  * MTU (1280) and a fragment header should always be included
1679                  * after a node receiving Too Big message reporting PMTU is
1680                  * less than the IPv6 Minimum Link MTU.
1681                  */
1682                 pmtu = IPV6_MIN_MTU;
1683                 allfrag = 1;
1684         }
1685
1686         /* New mtu received -> path was valid.
1687            They are sent only in response to data packets,
1688            so that this nexthop apparently is reachable. --ANK
1689          */
1690         dst_confirm(&rt->dst);
1691
1692         /* Host route. If it is static, it would be better
1693            not to override it, but add new one, so that
1694            when cache entry will expire old pmtu
1695            would return automatically.
1696          */
1697         if (rt->rt6i_flags & RTF_CACHE) {
1698                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1699                 if (allfrag) {
1700                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1701                         features |= RTAX_FEATURE_ALLFRAG;
1702                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1703                 }
1704                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1705                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1706                 goto out;
1707         }
1708
1709         /* Network route.
1710            Two cases are possible:
1711            1. It is connected route. Action: COW
1712            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1713          */
1714         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1715                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1716         else
1717                 nrt = rt6_alloc_clone(rt, daddr);
1718
1719         if (nrt) {
1720                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1721                 if (allfrag) {
1722                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1723                         features |= RTAX_FEATURE_ALLFRAG;
1724                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1725                 }
1726
1727                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1728                  * happened within 5 mins, the recommended timer is 10 mins.
1729                  * Here this route expiration time is set to ip6_rt_mtu_expires
1730                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1731                  * and detecting PMTU increase will be automatically happened.
1732                  */
1733                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1734                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1735
1736                 ip6_ins_rt(nrt);
1737         }
1738 out:
1739         dst_release(&rt->dst);
1740 }
1741
1742 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1743                         struct net_device *dev, u32 pmtu)
1744 {
1745         struct net *net = dev_net(dev);
1746
1747         /*
1748          * RFC 1981 states that a node "MUST reduce the size of the packets it
1749          * is sending along the path" that caused the Packet Too Big message.
1750          * Since it's not possible in the general case to determine which
1751          * interface was used to send the original packet, we update the MTU
1752          * on the interface that will be used to send future packets. We also
1753          * update the MTU on the interface that received the Packet Too Big in
1754          * case the original packet was forced out that interface with
1755          * SO_BINDTODEVICE or similar. This is the next best thing to the
1756          * correct behaviour, which would be to update the MTU on all
1757          * interfaces.
1758          */
1759         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1760         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1761 }
1762
1763 /*
1764  *      Misc support functions
1765  */
1766
1767 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1768                                     const struct in6_addr *dest)
1769 {
1770         struct net *net = dev_net(ort->rt6i_dev);
1771         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1772                                             ort->dst.dev, 0);
1773
1774         if (rt) {
1775                 rt->dst.input = ort->dst.input;
1776                 rt->dst.output = ort->dst.output;
1777                 rt->dst.flags |= DST_HOST;
1778
1779                 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1780                 rt->rt6i_dst.plen = 128;
1781                 dst_copy_metrics(&rt->dst, &ort->dst);
1782                 rt->dst.error = ort->dst.error;
1783                 rt->rt6i_idev = ort->rt6i_idev;
1784                 if (rt->rt6i_idev)
1785                         in6_dev_hold(rt->rt6i_idev);
1786                 rt->dst.lastuse = jiffies;
1787                 rt->rt6i_expires = 0;
1788
1789                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1790                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1791                 rt->rt6i_metric = 0;
1792
1793 #ifdef CONFIG_IPV6_SUBTREES
1794                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1795 #endif
1796                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1797                 rt->rt6i_table = ort->rt6i_table;
1798         }
1799         return rt;
1800 }
1801
1802 #ifdef CONFIG_IPV6_ROUTE_INFO
1803 static struct rt6_info *rt6_get_route_info(struct net *net,
1804                                            const struct in6_addr *prefix, int prefixlen,
1805                                            const struct in6_addr *gwaddr, int ifindex)
1806 {
1807         struct fib6_node *fn;
1808         struct rt6_info *rt = NULL;
1809         struct fib6_table *table;
1810
1811         table = fib6_get_table(net, RT6_TABLE_INFO);
1812         if (table == NULL)
1813                 return NULL;
1814
1815         write_lock_bh(&table->tb6_lock);
1816         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1817         if (!fn)
1818                 goto out;
1819
1820         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1821                 if (rt->rt6i_dev->ifindex != ifindex)
1822                         continue;
1823                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1824                         continue;
1825                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1826                         continue;
1827                 dst_hold(&rt->dst);
1828                 break;
1829         }
1830 out:
1831         write_unlock_bh(&table->tb6_lock);
1832         return rt;
1833 }
1834
1835 static struct rt6_info *rt6_add_route_info(struct net *net,
1836                                            const struct in6_addr *prefix, int prefixlen,
1837                                            const struct in6_addr *gwaddr, int ifindex,
1838                                            unsigned pref)
1839 {
1840         struct fib6_config cfg = {
1841                 .fc_table       = RT6_TABLE_INFO,
1842                 .fc_metric      = IP6_RT_PRIO_USER,
1843                 .fc_ifindex     = ifindex,
1844                 .fc_dst_len     = prefixlen,
1845                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1846                                   RTF_UP | RTF_PREF(pref),
1847                 .fc_nlinfo.pid = 0,
1848                 .fc_nlinfo.nlh = NULL,
1849                 .fc_nlinfo.nl_net = net,
1850         };
1851
1852         ipv6_addr_copy(&cfg.fc_dst, prefix);
1853         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1854
1855         /* We should treat it as a default route if prefix length is 0. */
1856         if (!prefixlen)
1857                 cfg.fc_flags |= RTF_DEFAULT;
1858
1859         ip6_route_add(&cfg);
1860
1861         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1862 }
1863 #endif
1864
1865 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1866 {
1867         struct rt6_info *rt;
1868         struct fib6_table *table;
1869
1870         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1871         if (table == NULL)
1872                 return NULL;
1873
1874         write_lock_bh(&table->tb6_lock);
1875         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1876                 if (dev == rt->rt6i_dev &&
1877                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1878                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1879                         break;
1880         }
1881         if (rt)
1882                 dst_hold(&rt->dst);
1883         write_unlock_bh(&table->tb6_lock);
1884         return rt;
1885 }
1886
1887 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1888                                      struct net_device *dev,
1889                                      unsigned int pref)
1890 {
1891         struct fib6_config cfg = {
1892                 .fc_table       = RT6_TABLE_DFLT,
1893                 .fc_metric      = IP6_RT_PRIO_USER,
1894                 .fc_ifindex     = dev->ifindex,
1895                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1896                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1897                 .fc_nlinfo.pid = 0,
1898                 .fc_nlinfo.nlh = NULL,
1899                 .fc_nlinfo.nl_net = dev_net(dev),
1900         };
1901
1902         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1903
1904         ip6_route_add(&cfg);
1905
1906         return rt6_get_dflt_router(gwaddr, dev);
1907 }
1908
1909 void rt6_purge_dflt_routers(struct net *net)
1910 {
1911         struct rt6_info *rt;
1912         struct fib6_table *table;
1913
1914         /* NOTE: Keep consistent with rt6_get_dflt_router */
1915         table = fib6_get_table(net, RT6_TABLE_DFLT);
1916         if (table == NULL)
1917                 return;
1918
1919 restart:
1920         read_lock_bh(&table->tb6_lock);
1921         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1922                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1923                         dst_hold(&rt->dst);
1924                         read_unlock_bh(&table->tb6_lock);
1925                         ip6_del_rt(rt);
1926                         goto restart;
1927                 }
1928         }
1929         read_unlock_bh(&table->tb6_lock);
1930 }
1931
1932 static void rtmsg_to_fib6_config(struct net *net,
1933                                  struct in6_rtmsg *rtmsg,
1934                                  struct fib6_config *cfg)
1935 {
1936         memset(cfg, 0, sizeof(*cfg));
1937
1938         cfg->fc_table = RT6_TABLE_MAIN;
1939         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1940         cfg->fc_metric = rtmsg->rtmsg_metric;
1941         cfg->fc_expires = rtmsg->rtmsg_info;
1942         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1943         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1944         cfg->fc_flags = rtmsg->rtmsg_flags;
1945
1946         cfg->fc_nlinfo.nl_net = net;
1947
1948         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1949         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1950         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1951 }
1952
1953 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1954 {
1955         struct fib6_config cfg;
1956         struct in6_rtmsg rtmsg;
1957         int err;
1958
1959         switch(cmd) {
1960         case SIOCADDRT:         /* Add a route */
1961         case SIOCDELRT:         /* Delete a route */
1962                 if (!capable(CAP_NET_ADMIN))
1963                         return -EPERM;
1964                 err = copy_from_user(&rtmsg, arg,
1965                                      sizeof(struct in6_rtmsg));
1966                 if (err)
1967                         return -EFAULT;
1968
1969                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1970
1971                 rtnl_lock();
1972                 switch (cmd) {
1973                 case SIOCADDRT:
1974                         err = ip6_route_add(&cfg);
1975                         break;
1976                 case SIOCDELRT:
1977                         err = ip6_route_del(&cfg);
1978                         break;
1979                 default:
1980                         err = -EINVAL;
1981                 }
1982                 rtnl_unlock();
1983
1984                 return err;
1985         }
1986
1987         return -EINVAL;
1988 }
1989
1990 /*
1991  *      Drop the packet on the floor
1992  */
1993
1994 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1995 {
1996         int type;
1997         struct dst_entry *dst = skb_dst(skb);
1998         switch (ipstats_mib_noroutes) {
1999         case IPSTATS_MIB_INNOROUTES:
2000                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2001                 if (type == IPV6_ADDR_ANY) {
2002                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2003                                       IPSTATS_MIB_INADDRERRORS);
2004                         break;
2005                 }
2006                 /* FALLTHROUGH */
2007         case IPSTATS_MIB_OUTNOROUTES:
2008                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2009                               ipstats_mib_noroutes);
2010                 break;
2011         }
2012         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2013         kfree_skb(skb);
2014         return 0;
2015 }
2016
2017 static int ip6_pkt_discard(struct sk_buff *skb)
2018 {
2019         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2020 }
2021
2022 static int ip6_pkt_discard_out(struct sk_buff *skb)
2023 {
2024         skb->dev = skb_dst(skb)->dev;
2025         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2026 }
2027
2028 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2029
2030 static int ip6_pkt_prohibit(struct sk_buff *skb)
2031 {
2032         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2033 }
2034
2035 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2036 {
2037         skb->dev = skb_dst(skb)->dev;
2038         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2039 }
2040
2041 #endif
2042
2043 /*
2044  *      Allocate a dst for local (unicast / anycast) address.
2045  */
2046
2047 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2048                                     const struct in6_addr *addr,
2049                                     int anycast)
2050 {
2051         struct net *net = dev_net(idev->dev);
2052         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2053                                             net->loopback_dev, 0);
2054         struct neighbour *neigh;
2055
2056         if (rt == NULL) {
2057                 if (net_ratelimit())
2058                         pr_warning("IPv6:  Maximum number of routes reached,"
2059                                    " consider increasing route/max_size.\n");
2060                 return ERR_PTR(-ENOMEM);
2061         }
2062
2063         in6_dev_hold(idev);
2064
2065         rt->dst.flags |= DST_HOST;
2066         rt->dst.input = ip6_input;
2067         rt->dst.output = ip6_output;
2068         rt->rt6i_idev = idev;
2069         rt->dst.obsolete = -1;
2070
2071         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2072         if (anycast)
2073                 rt->rt6i_flags |= RTF_ANYCAST;
2074         else
2075                 rt->rt6i_flags |= RTF_LOCAL;
2076         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2077         if (IS_ERR(neigh)) {
2078                 dst_free(&rt->dst);
2079
2080                 return ERR_CAST(neigh);
2081         }
2082         dst_set_neighbour(&rt->dst, neigh);
2083
2084         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2085         rt->rt6i_dst.plen = 128;
2086         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2087
2088         atomic_set(&rt->dst.__refcnt, 1);
2089
2090         return rt;
2091 }
2092
2093 int ip6_route_get_saddr(struct net *net,
2094                         struct rt6_info *rt,
2095                         const struct in6_addr *daddr,
2096                         unsigned int prefs,
2097                         struct in6_addr *saddr)
2098 {
2099         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2100         int err = 0;
2101         if (rt->rt6i_prefsrc.plen)
2102                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2103         else
2104                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2105                                          daddr, prefs, saddr);
2106         return err;
2107 }
2108
2109 /* remove deleted ip from prefsrc entries */
2110 struct arg_dev_net_ip {
2111         struct net_device *dev;
2112         struct net *net;
2113         struct in6_addr *addr;
2114 };
2115
2116 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2117 {
2118         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2119         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2120         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2121
2122         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2123             rt != net->ipv6.ip6_null_entry &&
2124             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2125                 /* remove prefsrc entry */
2126                 rt->rt6i_prefsrc.plen = 0;
2127         }
2128         return 0;
2129 }
2130
2131 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2132 {
2133         struct net *net = dev_net(ifp->idev->dev);
2134         struct arg_dev_net_ip adni = {
2135                 .dev = ifp->idev->dev,
2136                 .net = net,
2137                 .addr = &ifp->addr,
2138         };
2139         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2140 }
2141
2142 struct arg_dev_net {
2143         struct net_device *dev;
2144         struct net *net;
2145 };
2146
2147 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2148 {
2149         const struct arg_dev_net *adn = arg;
2150         const struct net_device *dev = adn->dev;
2151
2152         if ((rt->rt6i_dev == dev || dev == NULL) &&
2153             rt != adn->net->ipv6.ip6_null_entry) {
2154                 RT6_TRACE("deleted by ifdown %p\n", rt);
2155                 return -1;
2156         }
2157         return 0;
2158 }
2159
2160 void rt6_ifdown(struct net *net, struct net_device *dev)
2161 {
2162         struct arg_dev_net adn = {
2163                 .dev = dev,
2164                 .net = net,
2165         };
2166
2167         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2168         icmp6_clean_all(fib6_ifdown, &adn);
2169 }
2170
2171 struct rt6_mtu_change_arg
2172 {
2173         struct net_device *dev;
2174         unsigned mtu;
2175 };
2176
2177 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2178 {
2179         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2180         struct inet6_dev *idev;
2181
2182         /* In IPv6 pmtu discovery is not optional,
2183            so that RTAX_MTU lock cannot disable it.
2184            We still use this lock to block changes
2185            caused by addrconf/ndisc.
2186         */
2187
2188         idev = __in6_dev_get(arg->dev);
2189         if (idev == NULL)
2190                 return 0;
2191
2192         /* For administrative MTU increase, there is no way to discover
2193            IPv6 PMTU increase, so PMTU increase should be updated here.
2194            Since RFC 1981 doesn't include administrative MTU increase
2195            update PMTU increase is a MUST. (i.e. jumbo frame)
2196          */
2197         /*
2198            If new MTU is less than route PMTU, this new MTU will be the
2199            lowest MTU in the path, update the route PMTU to reflect PMTU
2200            decreases; if new MTU is greater than route PMTU, and the
2201            old MTU is the lowest MTU in the path, update the route PMTU
2202            to reflect the increase. In this case if the other nodes' MTU
2203            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2204            PMTU discouvery.
2205          */
2206         if (rt->rt6i_dev == arg->dev &&
2207             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2208             (dst_mtu(&rt->dst) >= arg->mtu ||
2209              (dst_mtu(&rt->dst) < arg->mtu &&
2210               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2211                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2212         }
2213         return 0;
2214 }
2215
2216 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2217 {
2218         struct rt6_mtu_change_arg arg = {
2219                 .dev = dev,
2220                 .mtu = mtu,
2221         };
2222
2223         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2224 }
2225
2226 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2227         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2228         [RTA_OIF]               = { .type = NLA_U32 },
2229         [RTA_IIF]               = { .type = NLA_U32 },
2230         [RTA_PRIORITY]          = { .type = NLA_U32 },
2231         [RTA_METRICS]           = { .type = NLA_NESTED },
2232 };
2233
2234 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2235                               struct fib6_config *cfg)
2236 {
2237         struct rtmsg *rtm;
2238         struct nlattr *tb[RTA_MAX+1];
2239         int err;
2240
2241         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2242         if (err < 0)
2243                 goto errout;
2244
2245         err = -EINVAL;
2246         rtm = nlmsg_data(nlh);
2247         memset(cfg, 0, sizeof(*cfg));
2248
2249         cfg->fc_table = rtm->rtm_table;
2250         cfg->fc_dst_len = rtm->rtm_dst_len;
2251         cfg->fc_src_len = rtm->rtm_src_len;
2252         cfg->fc_flags = RTF_UP;
2253         cfg->fc_protocol = rtm->rtm_protocol;
2254
2255         if (rtm->rtm_type == RTN_UNREACHABLE)
2256                 cfg->fc_flags |= RTF_REJECT;
2257
2258         if (rtm->rtm_type == RTN_LOCAL)
2259                 cfg->fc_flags |= RTF_LOCAL;
2260
2261         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2262         cfg->fc_nlinfo.nlh = nlh;
2263         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2264
2265         if (tb[RTA_GATEWAY]) {
2266                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2267                 cfg->fc_flags |= RTF_GATEWAY;
2268         }
2269
2270         if (tb[RTA_DST]) {
2271                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2272
2273                 if (nla_len(tb[RTA_DST]) < plen)
2274                         goto errout;
2275
2276                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2277         }
2278
2279         if (tb[RTA_SRC]) {
2280                 int plen = (rtm->rtm_src_len + 7) >> 3;
2281
2282                 if (nla_len(tb[RTA_SRC]) < plen)
2283                         goto errout;
2284
2285                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2286         }
2287
2288         if (tb[RTA_PREFSRC])
2289                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2290
2291         if (tb[RTA_OIF])
2292                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2293
2294         if (tb[RTA_PRIORITY])
2295                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2296
2297         if (tb[RTA_METRICS]) {
2298                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2299                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2300         }
2301
2302         if (tb[RTA_TABLE])
2303                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2304
2305         err = 0;
2306 errout:
2307         return err;
2308 }
2309
2310 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2311 {
2312         struct fib6_config cfg;
2313         int err;
2314
2315         err = rtm_to_fib6_config(skb, nlh, &cfg);
2316         if (err < 0)
2317                 return err;
2318
2319         return ip6_route_del(&cfg);
2320 }
2321
2322 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2323 {
2324         struct fib6_config cfg;
2325         int err;
2326
2327         err = rtm_to_fib6_config(skb, nlh, &cfg);
2328         if (err < 0)
2329                 return err;
2330
2331         return ip6_route_add(&cfg);
2332 }
2333
2334 static inline size_t rt6_nlmsg_size(void)
2335 {
2336         return NLMSG_ALIGN(sizeof(struct rtmsg))
2337                + nla_total_size(16) /* RTA_SRC */
2338                + nla_total_size(16) /* RTA_DST */
2339                + nla_total_size(16) /* RTA_GATEWAY */
2340                + nla_total_size(16) /* RTA_PREFSRC */
2341                + nla_total_size(4) /* RTA_TABLE */
2342                + nla_total_size(4) /* RTA_IIF */
2343                + nla_total_size(4) /* RTA_OIF */
2344                + nla_total_size(4) /* RTA_PRIORITY */
2345                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2346                + nla_total_size(sizeof(struct rta_cacheinfo));
2347 }
2348
2349 static int rt6_fill_node(struct net *net,
2350                          struct sk_buff *skb, struct rt6_info *rt,
2351                          struct in6_addr *dst, struct in6_addr *src,
2352                          int iif, int type, u32 pid, u32 seq,
2353                          int prefix, int nowait, unsigned int flags)
2354 {
2355         struct rtmsg *rtm;
2356         struct nlmsghdr *nlh;
2357         long expires;
2358         u32 table;
2359         struct neighbour *n;
2360
2361         if (prefix) {   /* user wants prefix routes only */
2362                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2363                         /* success since this is not a prefix route */
2364                         return 1;
2365                 }
2366         }
2367
2368         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2369         if (nlh == NULL)
2370                 return -EMSGSIZE;
2371
2372         rtm = nlmsg_data(nlh);
2373         rtm->rtm_family = AF_INET6;
2374         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2375         rtm->rtm_src_len = rt->rt6i_src.plen;
2376         rtm->rtm_tos = 0;
2377         if (rt->rt6i_table)
2378                 table = rt->rt6i_table->tb6_id;
2379         else
2380                 table = RT6_TABLE_UNSPEC;
2381         rtm->rtm_table = table;
2382         NLA_PUT_U32(skb, RTA_TABLE, table);
2383         if (rt->rt6i_flags&RTF_REJECT)
2384                 rtm->rtm_type = RTN_UNREACHABLE;
2385         else if (rt->rt6i_flags&RTF_LOCAL)
2386                 rtm->rtm_type = RTN_LOCAL;
2387         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2388                 rtm->rtm_type = RTN_LOCAL;
2389         else
2390                 rtm->rtm_type = RTN_UNICAST;
2391         rtm->rtm_flags = 0;
2392         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2393         rtm->rtm_protocol = rt->rt6i_protocol;
2394         if (rt->rt6i_flags&RTF_DYNAMIC)
2395                 rtm->rtm_protocol = RTPROT_REDIRECT;
2396         else if (rt->rt6i_flags & RTF_ADDRCONF)
2397                 rtm->rtm_protocol = RTPROT_KERNEL;
2398         else if (rt->rt6i_flags&RTF_DEFAULT)
2399                 rtm->rtm_protocol = RTPROT_RA;
2400
2401         if (rt->rt6i_flags&RTF_CACHE)
2402                 rtm->rtm_flags |= RTM_F_CLONED;
2403
2404         if (dst) {
2405                 NLA_PUT(skb, RTA_DST, 16, dst);
2406                 rtm->rtm_dst_len = 128;
2407         } else if (rtm->rtm_dst_len)
2408                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2409 #ifdef CONFIG_IPV6_SUBTREES
2410         if (src) {
2411                 NLA_PUT(skb, RTA_SRC, 16, src);
2412                 rtm->rtm_src_len = 128;
2413         } else if (rtm->rtm_src_len)
2414                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2415 #endif
2416         if (iif) {
2417 #ifdef CONFIG_IPV6_MROUTE
2418                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2419                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2420                         if (err <= 0) {
2421                                 if (!nowait) {
2422                                         if (err == 0)
2423                                                 return 0;
2424                                         goto nla_put_failure;
2425                                 } else {
2426                                         if (err == -EMSGSIZE)
2427                                                 goto nla_put_failure;
2428                                 }
2429                         }
2430                 } else
2431 #endif
2432                         NLA_PUT_U32(skb, RTA_IIF, iif);
2433         } else if (dst) {
2434                 struct in6_addr saddr_buf;
2435                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2436                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2437         }
2438
2439         if (rt->rt6i_prefsrc.plen) {
2440                 struct in6_addr saddr_buf;
2441                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2442                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2443         }
2444
2445         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2446                 goto nla_put_failure;
2447
2448         rcu_read_lock();
2449         n = dst_get_neighbour(&rt->dst);
2450         if (n) {
2451                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2452                         rcu_read_unlock();
2453                         goto nla_put_failure;
2454                 }
2455         }
2456         rcu_read_unlock();
2457
2458         if (rt->dst.dev)
2459                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2460
2461         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2462
2463         if (!(rt->rt6i_flags & RTF_EXPIRES))
2464                 expires = 0;
2465         else if (rt->rt6i_expires - jiffies < INT_MAX)
2466                 expires = rt->rt6i_expires - jiffies;
2467         else
2468                 expires = INT_MAX;
2469
2470         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2471                                expires, rt->dst.error) < 0)
2472                 goto nla_put_failure;
2473
2474         return nlmsg_end(skb, nlh);
2475
2476 nla_put_failure:
2477         nlmsg_cancel(skb, nlh);
2478         return -EMSGSIZE;
2479 }
2480
2481 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2482 {
2483         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2484         int prefix;
2485
2486         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2487                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2488                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2489         } else
2490                 prefix = 0;
2491
2492         return rt6_fill_node(arg->net,
2493                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2494                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2495                      prefix, 0, NLM_F_MULTI);
2496 }
2497
2498 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2499 {
2500         struct net *net = sock_net(in_skb->sk);
2501         struct nlattr *tb[RTA_MAX+1];
2502         struct rt6_info *rt;
2503         struct sk_buff *skb;
2504         struct rtmsg *rtm;
2505         struct flowi6 fl6;
2506         int err, iif = 0;
2507
2508         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2509         if (err < 0)
2510                 goto errout;
2511
2512         err = -EINVAL;
2513         memset(&fl6, 0, sizeof(fl6));
2514
2515         if (tb[RTA_SRC]) {
2516                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2517                         goto errout;
2518
2519                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2520         }
2521
2522         if (tb[RTA_DST]) {
2523                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2524                         goto errout;
2525
2526                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2527         }
2528
2529         if (tb[RTA_IIF])
2530                 iif = nla_get_u32(tb[RTA_IIF]);
2531
2532         if (tb[RTA_OIF])
2533                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2534
2535         if (iif) {
2536                 struct net_device *dev;
2537                 dev = __dev_get_by_index(net, iif);
2538                 if (!dev) {
2539                         err = -ENODEV;
2540                         goto errout;
2541                 }
2542         }
2543
2544         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2545         if (skb == NULL) {
2546                 err = -ENOBUFS;
2547                 goto errout;
2548         }
2549
2550         /* Reserve room for dummy headers, this skb can pass
2551            through good chunk of routing engine.
2552          */
2553         skb_reset_mac_header(skb);
2554         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2555
2556         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2557         skb_dst_set(skb, &rt->dst);
2558
2559         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2560                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2561                             nlh->nlmsg_seq, 0, 0, 0);
2562         if (err < 0) {
2563                 kfree_skb(skb);
2564                 goto errout;
2565         }
2566
2567         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2568 errout:
2569         return err;
2570 }
2571
2572 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2573 {
2574         struct sk_buff *skb;
2575         struct net *net = info->nl_net;
2576         u32 seq;
2577         int err;
2578
2579         err = -ENOBUFS;
2580         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2581
2582         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2583         if (skb == NULL)
2584                 goto errout;
2585
2586         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2587                                 event, info->pid, seq, 0, 0, 0);
2588         if (err < 0) {
2589                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2590                 WARN_ON(err == -EMSGSIZE);
2591                 kfree_skb(skb);
2592                 goto errout;
2593         }
2594         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2595                     info->nlh, gfp_any());
2596         return;
2597 errout:
2598         if (err < 0)
2599                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2600 }
2601
2602 static int ip6_route_dev_notify(struct notifier_block *this,
2603                                 unsigned long event, void *data)
2604 {
2605         struct net_device *dev = (struct net_device *)data;
2606         struct net *net = dev_net(dev);
2607
2608         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2609                 net->ipv6.ip6_null_entry->dst.dev = dev;
2610                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2611 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2612                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2613                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2614                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2615                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2616 #endif
2617         }
2618
2619         return NOTIFY_OK;
2620 }
2621
2622 /*
2623  *      /proc
2624  */
2625
2626 #ifdef CONFIG_PROC_FS
2627
/*
 * NOTE(review): no user of this structure is visible in this part of the
 * file; it looks like a leftover buffer/offset cursor from a pre-seq_file
 * /proc implementation -- confirm before relying on it.
 */
struct rt6_proc_arg {
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2636
2637 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2638 {
2639         struct seq_file *m = p_arg;
2640         struct neighbour *n;
2641
2642         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2643
2644 #ifdef CONFIG_IPV6_SUBTREES
2645         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2646 #else
2647         seq_puts(m, "00000000000000000000000000000000 00 ");
2648 #endif
2649         rcu_read_lock();
2650         n = dst_get_neighbour(&rt->dst);
2651         if (n) {
2652                 seq_printf(m, "%pi6", n->primary_key);
2653         } else {
2654                 seq_puts(m, "00000000000000000000000000000000");
2655         }
2656         rcu_read_unlock();
2657         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2658                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2659                    rt->dst.__use, rt->rt6i_flags,
2660                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2661         return 0;
2662 }
2663
2664 static int ipv6_route_show(struct seq_file *m, void *v)
2665 {
2666         struct net *net = (struct net *)m->private;
2667         fib6_clean_all(net, rt6_info_route, 0, m);
2668         return 0;
2669 }
2670
2671 static int ipv6_route_open(struct inode *inode, struct file *file)
2672 {
2673         return single_open_net(inode, file, ipv6_route_show);
2674 }
2675
2676 static const struct file_operations ipv6_route_proc_fops = {
2677         .owner          = THIS_MODULE,
2678         .open           = ipv6_route_open,
2679         .read           = seq_read,
2680         .llseek         = seq_lseek,
2681         .release        = single_release_net,
2682 };
2683
2684 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2685 {
2686         struct net *net = (struct net *)seq->private;
2687         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2688                    net->ipv6.rt6_stats->fib_nodes,
2689                    net->ipv6.rt6_stats->fib_route_nodes,
2690                    net->ipv6.rt6_stats->fib_rt_alloc,
2691                    net->ipv6.rt6_stats->fib_rt_entries,
2692                    net->ipv6.rt6_stats->fib_rt_cache,
2693                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2694                    net->ipv6.rt6_stats->fib_discarded_routes);
2695
2696         return 0;
2697 }
2698
2699 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2700 {
2701         return single_open_net(inode, file, rt6_stats_seq_show);
2702 }
2703
2704 static const struct file_operations rt6_stats_seq_fops = {
2705         .owner   = THIS_MODULE,
2706         .open    = rt6_stats_seq_open,
2707         .read    = seq_read,
2708         .llseek  = seq_lseek,
2709         .release = single_release_net,
2710 };
2711 #endif  /* CONFIG_PROC_FS */
2712
2713 #ifdef CONFIG_SYSCTL
2714
2715 static
2716 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2717                               void __user *buffer, size_t *lenp, loff_t *ppos)
2718 {
2719         struct net *net;
2720         int delay;
2721         if (!write)
2722                 return -EINVAL;
2723
2724         net = (struct net *)ctl->extra1;
2725         delay = net->ipv6.sysctl.flush_delay;
2726         proc_dointvec(ctl, write, buffer, lenp, ppos);
2727         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2728         return 0;
2729 }
2730
2731 ctl_table ipv6_route_table_template[] = {
2732         {
2733                 .procname       =       "flush",
2734                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2735                 .maxlen         =       sizeof(int),
2736                 .mode           =       0200,
2737                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2738         },
2739         {
2740                 .procname       =       "gc_thresh",
2741                 .data           =       &ip6_dst_ops_template.gc_thresh,
2742                 .maxlen         =       sizeof(int),
2743                 .mode           =       0644,
2744                 .proc_handler   =       proc_dointvec,
2745         },
2746         {
2747                 .procname       =       "max_size",
2748                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2749                 .maxlen         =       sizeof(int),
2750                 .mode           =       0644,
2751                 .proc_handler   =       proc_dointvec,
2752         },
2753         {
2754                 .procname       =       "gc_min_interval",
2755                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2756                 .maxlen         =       sizeof(int),
2757                 .mode           =       0644,
2758                 .proc_handler   =       proc_dointvec_jiffies,
2759         },
2760         {
2761                 .procname       =       "gc_timeout",
2762                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2763                 .maxlen         =       sizeof(int),
2764                 .mode           =       0644,
2765                 .proc_handler   =       proc_dointvec_jiffies,
2766         },
2767         {
2768                 .procname       =       "gc_interval",
2769                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2770                 .maxlen         =       sizeof(int),
2771                 .mode           =       0644,
2772                 .proc_handler   =       proc_dointvec_jiffies,
2773         },
2774         {
2775                 .procname       =       "gc_elasticity",
2776                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2777                 .maxlen         =       sizeof(int),
2778                 .mode           =       0644,
2779                 .proc_handler   =       proc_dointvec,
2780         },
2781         {
2782                 .procname       =       "mtu_expires",
2783                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2784                 .maxlen         =       sizeof(int),
2785                 .mode           =       0644,
2786                 .proc_handler   =       proc_dointvec_jiffies,
2787         },
2788         {
2789                 .procname       =       "min_adv_mss",
2790                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2791                 .maxlen         =       sizeof(int),
2792                 .mode           =       0644,
2793                 .proc_handler   =       proc_dointvec,
2794         },
2795         {
2796                 .procname       =       "gc_min_interval_ms",
2797                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2798                 .maxlen         =       sizeof(int),
2799                 .mode           =       0644,
2800                 .proc_handler   =       proc_dointvec_ms_jiffies,
2801         },
2802         { }
2803 };
2804
2805 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2806 {
2807         struct ctl_table *table;
2808
2809         table = kmemdup(ipv6_route_table_template,
2810                         sizeof(ipv6_route_table_template),
2811                         GFP_KERNEL);
2812
2813         if (table) {
2814                 table[0].data = &net->ipv6.sysctl.flush_delay;
2815                 table[0].extra1 = net;
2816                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2817                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2818                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2819                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2820                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2821                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2822                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2823                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2824                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2825         }
2826
2827         return table;
2828 }
2829 #endif
2830
2831 static int __net_init ip6_route_net_init(struct net *net)
2832 {
2833         int ret = -ENOMEM;
2834
2835         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2836                sizeof(net->ipv6.ip6_dst_ops));
2837
2838         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2839                 goto out_ip6_dst_ops;
2840
2841         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2842                                            sizeof(*net->ipv6.ip6_null_entry),
2843                                            GFP_KERNEL);
2844         if (!net->ipv6.ip6_null_entry)
2845                 goto out_ip6_dst_entries;
2846         net->ipv6.ip6_null_entry->dst.path =
2847                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2848         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2849         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2850                          ip6_template_metrics, true);
2851
2852 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2853         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2854                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2855                                                GFP_KERNEL);
2856         if (!net->ipv6.ip6_prohibit_entry)
2857                 goto out_ip6_null_entry;
2858         net->ipv6.ip6_prohibit_entry->dst.path =
2859                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2860         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2861         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2862                          ip6_template_metrics, true);
2863
2864         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2865                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2866                                                GFP_KERNEL);
2867         if (!net->ipv6.ip6_blk_hole_entry)
2868                 goto out_ip6_prohibit_entry;
2869         net->ipv6.ip6_blk_hole_entry->dst.path =
2870                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2871         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2872         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2873                          ip6_template_metrics, true);
2874 #endif
2875
2876         net->ipv6.sysctl.flush_delay = 0;
2877         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2878         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2879         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2880         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2881         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2882         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2883         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2884
2885         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2886
2887         ret = 0;
2888 out:
2889         return ret;
2890
2891 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2892 out_ip6_prohibit_entry:
2893         kfree(net->ipv6.ip6_prohibit_entry);
2894 out_ip6_null_entry:
2895         kfree(net->ipv6.ip6_null_entry);
2896 #endif
2897 out_ip6_dst_entries:
2898         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2899 out_ip6_dst_ops:
2900         goto out;
2901 }
2902
2903 static void __net_exit ip6_route_net_exit(struct net *net)
2904 {
2905         kfree(net->ipv6.ip6_null_entry);
2906 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2907         kfree(net->ipv6.ip6_prohibit_entry);
2908         kfree(net->ipv6.ip6_blk_hole_entry);
2909 #endif
2910         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2911 }
2912
2913 static int __net_init ip6_route_net_init_late(struct net *net)
2914 {
2915 #ifdef CONFIG_PROC_FS
2916         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2917         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2918 #endif
2919         return 0;
2920 }
2921
2922 static void __net_exit ip6_route_net_exit_late(struct net *net)
2923 {
2924 #ifdef CONFIG_PROC_FS
2925         proc_net_remove(net, "ipv6_route");
2926         proc_net_remove(net, "rt6_stats");
2927 #endif
2928 }
2929
2930 static struct pernet_operations ip6_route_net_ops = {
2931         .init = ip6_route_net_init,
2932         .exit = ip6_route_net_exit,
2933 };
2934
2935 static struct pernet_operations ip6_route_net_late_ops = {
2936         .init = ip6_route_net_init_late,
2937         .exit = ip6_route_net_exit_late,
2938 };
2939
2940 static struct notifier_block ip6_route_dev_notifier = {
2941         .notifier_call = ip6_route_dev_notify,
2942         .priority = 0,
2943 };
2944
2945 int __init ip6_route_init(void)
2946 {
2947         int ret;
2948
2949         ret = -ENOMEM;
2950         ip6_dst_ops_template.kmem_cachep =
2951                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2952                                   SLAB_HWCACHE_ALIGN, NULL);
2953         if (!ip6_dst_ops_template.kmem_cachep)
2954                 goto out;
2955
2956         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2957         if (ret)
2958                 goto out_kmem_cache;
2959
2960         ret = register_pernet_subsys(&ip6_route_net_ops);
2961         if (ret)
2962                 goto out_dst_entries;
2963
2964         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2965
2966         /* Registering of the loopback is done before this portion of code,
2967          * the loopback reference in rt6_info will not be taken, do it
2968          * manually for init_net */
2969         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2970         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2971   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2972         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2973         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2974         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2975         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2976   #endif
2977         ret = fib6_init();
2978         if (ret)
2979                 goto out_register_subsys;
2980
2981         ret = xfrm6_init();
2982         if (ret)
2983                 goto out_fib6_init;
2984
2985         ret = fib6_rules_init();
2986         if (ret)
2987                 goto xfrm6_init;
2988
2989         ret = register_pernet_subsys(&ip6_route_net_late_ops);
2990         if (ret)
2991                 goto fib6_rules_init;
2992
2993         ret = -ENOBUFS;
2994         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2995             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2996             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2997                 goto out_register_late_subsys;
2998
2999         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3000         if (ret)
3001                 goto out_register_late_subsys;
3002
3003 out:
3004         return ret;
3005
3006 out_register_late_subsys:
3007         unregister_pernet_subsys(&ip6_route_net_late_ops);
3008 fib6_rules_init:
3009         fib6_rules_cleanup();
3010 xfrm6_init:
3011         xfrm6_fini();
3012 out_fib6_init:
3013         fib6_gc_cleanup();
3014 out_register_subsys:
3015         unregister_pernet_subsys(&ip6_route_net_ops);
3016 out_dst_entries:
3017         dst_entries_destroy(&ip6_dst_blackhole_ops);
3018 out_kmem_cache:
3019         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3020         goto out;
3021 }
3022
3023 void ip6_route_cleanup(void)
3024 {
3025         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3026         unregister_pernet_subsys(&ip6_route_net_late_ops);
3027         fib6_rules_cleanup();
3028         xfrm6_fini();
3029         fib6_gc_cleanup();
3030         unregister_pernet_subsys(&ip6_route_net_ops);
3031         dst_entries_destroy(&ip6_dst_blackhole_ops);
3032         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3033 }