Merge branch 'iommu/fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/joro...
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Debug level for this file.  The tracing helpers below only expand to
 * printk() calls when RT6_DEBUG is 3 or higher; otherwise they compile
 * away to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
75
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77                                     const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int      ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void             ip6_dst_destroy(struct dst_entry *);
83 static void             ip6_dst_ifdown(struct dst_entry *,
84                                        struct net_device *dev, int how);
85 static int               ip6_dst_gc(struct dst_ops *ops);
86
87 static int              ip6_pkt_discard(struct sk_buff *skb);
88 static int              ip6_pkt_discard_out(struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94                                            const struct in6_addr *prefix, int prefixlen,
95                                            const struct in6_addr *gwaddr, int ifindex,
96                                            unsigned pref);
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98                                            const struct in6_addr *prefix, int prefixlen,
99                                            const struct in6_addr *gwaddr, int ifindex);
100 #endif
101
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 {
104         struct rt6_info *rt = (struct rt6_info *) dst;
105         struct inet_peer *peer;
106         u32 *p = NULL;
107
108         if (!(rt->dst.flags & DST_HOST))
109                 return NULL;
110
111         if (!rt->rt6i_peer)
112                 rt6_bind_peer(rt, 1);
113
114         peer = rt->rt6i_peer;
115         if (peer) {
116                 u32 *old_p = __DST_METRICS_PTR(old);
117                 unsigned long prev, new;
118
119                 p = peer->metrics;
120                 if (inet_metrics_new(peer))
121                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122
123                 new = (unsigned long) p;
124                 prev = cmpxchg(&dst->_metrics, old, new);
125
126                 if (prev != old) {
127                         p = __DST_METRICS_PTR(prev);
128                         if (prev & DST_METRICS_READ_ONLY)
129                                 p = NULL;
130                 }
131         }
132         return p;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
138 }
139
140 static struct dst_ops ip6_dst_ops_template = {
141         .family                 =       AF_INET6,
142         .protocol               =       cpu_to_be16(ETH_P_IPV6),
143         .gc                     =       ip6_dst_gc,
144         .gc_thresh              =       1024,
145         .check                  =       ip6_dst_check,
146         .default_advmss         =       ip6_default_advmss,
147         .mtu                    =       ip6_mtu,
148         .cow_metrics            =       ipv6_cow_metrics,
149         .destroy                =       ip6_dst_destroy,
150         .ifdown                 =       ip6_dst_ifdown,
151         .negative_advice        =       ip6_negative_advice,
152         .link_failure           =       ip6_link_failure,
153         .update_pmtu            =       ip6_rt_update_pmtu,
154         .local_out              =       __ip6_local_out,
155         .neigh_lookup           =       ip6_neigh_lookup,
156 };
157
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159 {
160         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
161
162         return mtu ? : dst->dev->mtu;
163 }
164
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 {
167 }
168
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
170                                          unsigned long old)
171 {
172         return NULL;
173 }
174
175 static struct dst_ops ip6_dst_blackhole_ops = {
176         .family                 =       AF_INET6,
177         .protocol               =       cpu_to_be16(ETH_P_IPV6),
178         .destroy                =       ip6_dst_destroy,
179         .check                  =       ip6_dst_check,
180         .mtu                    =       ip6_blackhole_mtu,
181         .default_advmss         =       ip6_default_advmss,
182         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
183         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
184         .neigh_lookup           =       ip6_neigh_lookup,
185 };
186
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188         [RTAX_HOPLIMIT - 1] = 255,
189 };
190
191 static struct rt6_info ip6_null_entry_template = {
192         .dst = {
193                 .__refcnt       = ATOMIC_INIT(1),
194                 .__use          = 1,
195                 .obsolete       = -1,
196                 .error          = -ENETUNREACH,
197                 .input          = ip6_pkt_discard,
198                 .output         = ip6_pkt_discard_out,
199         },
200         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
201         .rt6i_protocol  = RTPROT_KERNEL,
202         .rt6i_metric    = ~(u32) 0,
203         .rt6i_ref       = ATOMIC_INIT(1),
204 };
205
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route: a reject entry whose dst reports
 * -EACCES and hands packets to the ip6_pkt_prohibit*() handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.dst = {
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
		.error		= -EACCES,
		.obsolete	= -1,
		.__use		= 1,
		.__refcnt	= ATOMIC_INIT(1),
	},
};

/*
 * Template for the "blackhole" route: a reject entry whose dst reports
 * -EINVAL and passes packets to dst_discard() in both directions.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.dst = {
		.input		= dst_discard,
		.output		= dst_discard,
		.error		= -EINVAL,
		.obsolete	= -1,
		.__use		= 1,
		.__refcnt	= ATOMIC_INIT(1),
	},
};

#endif
242
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245                                              struct net_device *dev,
246                                              int flags)
247 {
248         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249
250         if (rt != NULL)
251                 memset(&rt->rt6i_table, 0,
252                         sizeof(*rt) - sizeof(struct dst_entry));
253
254         return rt;
255 }
256
257 static void ip6_dst_destroy(struct dst_entry *dst)
258 {
259         struct rt6_info *rt = (struct rt6_info *)dst;
260         struct inet6_dev *idev = rt->rt6i_idev;
261         struct inet_peer *peer = rt->rt6i_peer;
262
263         if (!(rt->dst.flags & DST_HOST))
264                 dst_destroy_metrics_generic(dst);
265
266         if (idev != NULL) {
267                 rt->rt6i_idev = NULL;
268                 in6_dev_put(idev);
269         }
270         if (peer) {
271                 rt->rt6i_peer = NULL;
272                 inet_putpeer(peer);
273         }
274 }
275
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
277
278 static u32 rt6_peer_genid(void)
279 {
280         return atomic_read(&__rt6_peer_genid);
281 }
282
283 void rt6_bind_peer(struct rt6_info *rt, int create)
284 {
285         struct inet_peer *peer;
286
287         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
289                 inet_putpeer(peer);
290         else
291                 rt->rt6i_peer_genid = rt6_peer_genid();
292 }
293
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
295                            int how)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299         struct net_device *loopback_dev =
300                 dev_net(dev)->loopback_dev;
301
302         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303                 struct inet6_dev *loopback_idev =
304                         in6_dev_get(loopback_dev);
305                 if (loopback_idev != NULL) {
306                         rt->rt6i_idev = loopback_idev;
307                         in6_dev_put(idev);
308                 }
309         }
310 }
311
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
313 {
314         return (rt->rt6i_flags & RTF_EXPIRES) &&
315                 time_after(jiffies, rt->rt6i_expires);
316 }
317
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
319 {
320         return ipv6_addr_type(daddr) &
321                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 }
323
324 /*
325  *      Route lookup. Any table->tb6_lock is implied.
326  */
327
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329                                                     struct rt6_info *rt,
330                                                     const struct in6_addr *saddr,
331                                                     int oif,
332                                                     int flags)
333 {
334         struct rt6_info *local = NULL;
335         struct rt6_info *sprt;
336
337         if (!oif && ipv6_addr_any(saddr))
338                 goto out;
339
340         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341                 struct net_device *dev = sprt->rt6i_dev;
342
343                 if (oif) {
344                         if (dev->ifindex == oif)
345                                 return sprt;
346                         if (dev->flags & IFF_LOOPBACK) {
347                                 if (sprt->rt6i_idev == NULL ||
348                                     sprt->rt6i_idev->dev->ifindex != oif) {
349                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
350                                                 continue;
351                                         if (local && (!oif ||
352                                                       local->rt6i_idev->dev->ifindex == oif))
353                                                 continue;
354                                 }
355                                 local = sprt;
356                         }
357                 } else {
358                         if (ipv6_chk_addr(net, saddr, dev,
359                                           flags & RT6_LOOKUP_F_IFACE))
360                                 return sprt;
361                 }
362         }
363
364         if (oif) {
365                 if (local)
366                         return local;
367
368                 if (flags & RT6_LOOKUP_F_IFACE)
369                         return net->ipv6.ip6_null_entry;
370         }
371 out:
372         return rt;
373 }
374
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the neighbour attached to @rt is not in
 * a NUD_VALID state and the last update is older than the interface's
 * rtr_probe_interval, send a Neighbour Solicitation to the neighbour's
 * solicited-node multicast address.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute; the rate limit is implemented by stamping neigh->updated with
 * the current jiffies before the probe is sent.
 *
 * @rt may be NULL, in which case nothing happens.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;

	/* Cheap unlocked pre-check; repeated under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	/*
	 * neigh->updated is modified below, so the write side of the lock
	 * is required; holding only the read side would let concurrent
	 * probers race on the timestamp.
	 */
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		write_unlock_bh(&neigh->lock);

		/* Send the solicitation outside of the neighbour lock. */
		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
414
415 /*
416  * Default Router Selection (RFC 2461 6.3.6)
417  */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 {
420         struct net_device *dev = rt->rt6i_dev;
421         if (!oif || dev->ifindex == oif)
422                 return 2;
423         if ((dev->flags & IFF_LOOPBACK) &&
424             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425                 return 1;
426         return 0;
427 }
428
429 static inline int rt6_check_neigh(struct rt6_info *rt)
430 {
431         struct neighbour *neigh;
432         int m;
433
434         rcu_read_lock();
435         neigh = dst_get_neighbour(&rt->dst);
436         if (rt->rt6i_flags & RTF_NONEXTHOP ||
437             !(rt->rt6i_flags & RTF_GATEWAY))
438                 m = 1;
439         else if (neigh) {
440                 read_lock_bh(&neigh->lock);
441                 if (neigh->nud_state & NUD_VALID)
442                         m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444                 else if (neigh->nud_state & NUD_FAILED)
445                         m = 0;
446 #endif
447                 else
448                         m = 1;
449                 read_unlock_bh(&neigh->lock);
450         } else
451                 m = 0;
452         rcu_read_unlock();
453         return m;
454 }
455
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457                            int strict)
458 {
459         int m, n;
460
461         m = rt6_check_dev(rt, oif);
462         if (!m && (strict & RT6_LOOKUP_F_IFACE))
463                 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467         n = rt6_check_neigh(rt);
468         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469                 return -1;
470         return m;
471 }
472
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474                                    int *mpri, struct rt6_info *match)
475 {
476         int m;
477
478         if (rt6_check_expired(rt))
479                 goto out;
480
481         m = rt6_score_route(rt, oif, strict);
482         if (m < 0)
483                 goto out;
484
485         if (m > *mpri) {
486                 if (strict & RT6_LOOKUP_F_REACHABLE)
487                         rt6_probe(match);
488                 *mpri = m;
489                 match = rt;
490         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491                 rt6_probe(rt);
492         }
493
494 out:
495         return match;
496 }
497
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499                                      struct rt6_info *rr_head,
500                                      u32 metric, int oif, int strict)
501 {
502         struct rt6_info *rt, *match;
503         int mpri = -1;
504
505         match = NULL;
506         for (rt = rr_head; rt && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510              rt = rt->dst.rt6_next)
511                 match = find_match(rt, oif, strict, &mpri, match);
512
513         return match;
514 }
515
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
517 {
518         struct rt6_info *match, *rt0;
519         struct net *net;
520
521         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522                   __func__, fn->leaf, oif);
523
524         rt0 = fn->rr_ptr;
525         if (!rt0)
526                 fn->rr_ptr = rt0 = fn->leaf;
527
528         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
529
530         if (!match &&
531             (strict & RT6_LOOKUP_F_REACHABLE)) {
532                 struct rt6_info *next = rt0->dst.rt6_next;
533
534                 /* no entries matched; do round-robin */
535                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536                         next = fn->leaf;
537
538                 if (next != rt0)
539                         fn->rr_ptr = next;
540         }
541
542         RT6_TRACE("%s() => %p\n",
543                   __func__, match);
544
545         net = dev_net(rt0->rt6i_dev);
546         return match ? match : net->ipv6.ip6_null_entry;
547 }
548
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received in a Router Advertisement.
 *
 * @dev:    device the advertisement arrived on
 * @opt:    start of the option (struct route_info)
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * The option is validated, then the matching route is looked up and,
 * depending on the advertised lifetime, deleted, created or refreshed
 * (preference and expiry).
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	/* Compare as signed so that a negative @len is rejected as well. */
	if (len < (int)sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length.
	 *
	 * The Length field counts units of 8 octets including the 8-octet
	 * option header, so it must be 1, 2 or 3.  A prefix longer than 64
	 * bits needs all 16 prefix octets (Length 3) and any non-empty
	 * prefix needs at least 8 prefix octets (Length 2 or 3).  Accepting
	 * a smaller Length would make the prefix copy below read past the
	 * end of the option.
	 */
	if (rinfo->length < 1 || rinfo->length > 3)
		return -EINVAL;
	if (rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 3)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 2)
		return -EINVAL;

	/* The option must not claim more data than the caller supplied. */
	if (len < (rinfo->length << 3))
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* Full 128-bit prefix present: use it in place. */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/*
		 * Only prefix_len bits are copied; the checks above
		 * guarantee they lie within the option.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime)) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->dst);
	}
	return 0;
}
#endif
622
/*
 * BACKTRACK - climb the fib6 tree after a lookup ended on the null entry.
 *
 * This macro is not self-contained; it relies on the expanding function
 * providing:
 *   - a variable 'rt' (struct rt6_info *) holding the lookup result,
 *   - a variable 'fn' (struct fib6_node *) holding the current node,
 *   - a label 'out', jumped to when the top-level root is reached,
 *   - a label 'restart', jumped to once a node flagged RTN_RTINFO is
 *     found, so that the lookup is retried from that node.
 *
 * When the parent has a subtree that is not the current node, the subtree
 * is searched with @saddr; otherwise the walk moves to the parent.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
640
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642                                              struct fib6_table *table,
643                                              struct flowi6 *fl6, int flags)
644 {
645         struct fib6_node *fn;
646         struct rt6_info *rt;
647
648         read_lock_bh(&table->tb6_lock);
649         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
650 restart:
651         rt = fn->leaf;
652         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
653         BACKTRACK(net, &fl6->saddr);
654 out:
655         dst_use(&rt->dst, jiffies);
656         read_unlock_bh(&table->tb6_lock);
657         return rt;
658
659 }
660
661 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
662                             const struct in6_addr *saddr, int oif, int strict)
663 {
664         struct flowi6 fl6 = {
665                 .flowi6_oif = oif,
666                 .daddr = *daddr,
667         };
668         struct dst_entry *dst;
669         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
670
671         if (saddr) {
672                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
673                 flags |= RT6_LOOKUP_F_HAS_SADDR;
674         }
675
676         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
677         if (dst->error == 0)
678                 return (struct rt6_info *) dst;
679
680         dst_release(dst);
681
682         return NULL;
683 }
684
685 EXPORT_SYMBOL(rt6_lookup);
686
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason,
   the route is freed. In either case, unless the caller holds its own
   reference, the route may already have been destroyed on return.
 */
692
693 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
694 {
695         int err;
696         struct fib6_table *table;
697
698         table = rt->rt6i_table;
699         write_lock_bh(&table->tb6_lock);
700         err = fib6_add(&table->tb6_root, rt, info);
701         write_unlock_bh(&table->tb6_lock);
702
703         return err;
704 }
705
706 int ip6_ins_rt(struct rt6_info *rt)
707 {
708         struct nl_info info = {
709                 .nl_net = dev_net(rt->rt6i_dev),
710         };
711         return __ip6_ins_rt(rt, &info);
712 }
713
714 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
715                                       const struct in6_addr *daddr,
716                                       const struct in6_addr *saddr)
717 {
718         struct rt6_info *rt;
719
720         /*
721          *      Clone the route.
722          */
723
724         rt = ip6_rt_copy(ort, daddr);
725
726         if (rt) {
727                 struct neighbour *neigh;
728                 int attempts = !in_softirq();
729
730                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
731                         if (ort->rt6i_dst.plen != 128 &&
732                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
733                                 rt->rt6i_flags |= RTF_ANYCAST;
734                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
735                 }
736
737                 rt->rt6i_flags |= RTF_CACHE;
738
739 #ifdef CONFIG_IPV6_SUBTREES
740                 if (rt->rt6i_src.plen && saddr) {
741                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
742                         rt->rt6i_src.plen = 128;
743                 }
744 #endif
745
746         retry:
747                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
748                 if (IS_ERR(neigh)) {
749                         struct net *net = dev_net(rt->rt6i_dev);
750                         int saved_rt_min_interval =
751                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
752                         int saved_rt_elasticity =
753                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
754
755                         if (attempts-- > 0) {
756                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
757                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
758
759                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
760
761                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
762                                         saved_rt_elasticity;
763                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
764                                         saved_rt_min_interval;
765                                 goto retry;
766                         }
767
768                         if (net_ratelimit())
769                                 printk(KERN_WARNING
770                                        "ipv6: Neighbour table overflow.\n");
771                         dst_free(&rt->dst);
772                         return NULL;
773                 }
774                 dst_set_neighbour(&rt->dst, neigh);
775
776         }
777
778         return rt;
779 }
780
781 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
782                                         const struct in6_addr *daddr)
783 {
784         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
785
786         if (rt) {
787                 rt->rt6i_flags |= RTF_CACHE;
788                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
789         }
790         return rt;
791 }
792
793 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
794                                       struct flowi6 *fl6, int flags)
795 {
796         struct fib6_node *fn;
797         struct rt6_info *rt, *nrt;
798         int strict = 0;
799         int attempts = 3;
800         int err;
801         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
802
803         strict |= flags & RT6_LOOKUP_F_IFACE;
804
805 relookup:
806         read_lock_bh(&table->tb6_lock);
807
808 restart_2:
809         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
810
811 restart:
812         rt = rt6_select(fn, oif, strict | reachable);
813
814         BACKTRACK(net, &fl6->saddr);
815         if (rt == net->ipv6.ip6_null_entry ||
816             rt->rt6i_flags & RTF_CACHE)
817                 goto out;
818
819         dst_hold(&rt->dst);
820         read_unlock_bh(&table->tb6_lock);
821
822         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
823                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
824         else if (!(rt->dst.flags & DST_HOST))
825                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
826         else
827                 goto out2;
828
829         dst_release(&rt->dst);
830         rt = nrt ? : net->ipv6.ip6_null_entry;
831
832         dst_hold(&rt->dst);
833         if (nrt) {
834                 err = ip6_ins_rt(nrt);
835                 if (!err)
836                         goto out2;
837         }
838
839         if (--attempts <= 0)
840                 goto out2;
841
842         /*
843          * Race condition! In the gap, when table->tb6_lock was
844          * released someone could insert this route.  Relookup.
845          */
846         dst_release(&rt->dst);
847         goto relookup;
848
849 out:
850         if (reachable) {
851                 reachable = 0;
852                 goto restart_2;
853         }
854         dst_hold(&rt->dst);
855         read_unlock_bh(&table->tb6_lock);
856 out2:
857         rt->dst.lastuse = jiffies;
858         rt->dst.__use++;
859
860         return rt;
861 }
862
863 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
864                                             struct flowi6 *fl6, int flags)
865 {
866         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
867 }
868
869 void ip6_route_input(struct sk_buff *skb)
870 {
871         const struct ipv6hdr *iph = ipv6_hdr(skb);
872         struct net *net = dev_net(skb->dev);
873         int flags = RT6_LOOKUP_F_HAS_SADDR;
874         struct flowi6 fl6 = {
875                 .flowi6_iif = skb->dev->ifindex,
876                 .daddr = iph->daddr,
877                 .saddr = iph->saddr,
878                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
879                 .flowi6_mark = skb->mark,
880                 .flowi6_proto = iph->nexthdr,
881         };
882
883         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
884                 flags |= RT6_LOOKUP_F_IFACE;
885
886         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
887 }
888
889 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
890                                              struct flowi6 *fl6, int flags)
891 {
892         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
893 }
894
895 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
896                                     struct flowi6 *fl6)
897 {
898         int flags = 0;
899
900         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
901                 flags |= RT6_LOOKUP_F_IFACE;
902
903         if (!ipv6_addr_any(&fl6->saddr))
904                 flags |= RT6_LOOKUP_F_HAS_SADDR;
905         else if (sk)
906                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
907
908         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
909 }
910
911 EXPORT_SYMBOL(ip6_route_output);
912
913 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
914 {
915         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
916         struct dst_entry *new = NULL;
917
918         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
919         if (rt) {
920                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
921
922                 new = &rt->dst;
923
924                 new->__use = 1;
925                 new->input = dst_discard;
926                 new->output = dst_discard;
927
928                 if (dst_metrics_read_only(&ort->dst))
929                         new->_metrics = ort->dst._metrics;
930                 else
931                         dst_copy_metrics(new, &ort->dst);
932                 rt->rt6i_idev = ort->rt6i_idev;
933                 if (rt->rt6i_idev)
934                         in6_dev_hold(rt->rt6i_idev);
935                 rt->rt6i_expires = 0;
936
937                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
938                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
939                 rt->rt6i_metric = 0;
940
941                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
942 #ifdef CONFIG_IPV6_SUBTREES
943                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
944 #endif
945
946                 dst_free(new);
947         }
948
949         dst_release(dst_orig);
950         return new ? new : ERR_PTR(-ENOMEM);
951 }
952
953 /*
954  *      Destination cache support functions
955  */
956
957 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
958 {
959         struct rt6_info *rt;
960
961         rt = (struct rt6_info *) dst;
962
963         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
964                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
965                         if (!rt->rt6i_peer)
966                                 rt6_bind_peer(rt, 0);
967                         rt->rt6i_peer_genid = rt6_peer_genid();
968                 }
969                 return dst;
970         }
971         return NULL;
972 }
973
974 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
975 {
976         struct rt6_info *rt = (struct rt6_info *) dst;
977
978         if (rt) {
979                 if (rt->rt6i_flags & RTF_CACHE) {
980                         if (rt6_check_expired(rt)) {
981                                 ip6_del_rt(rt);
982                                 dst = NULL;
983                         }
984                 } else {
985                         dst_release(dst);
986                         dst = NULL;
987                 }
988         }
989         return dst;
990 }
991
992 static void ip6_link_failure(struct sk_buff *skb)
993 {
994         struct rt6_info *rt;
995
996         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
997
998         rt = (struct rt6_info *) skb_dst(skb);
999         if (rt) {
1000                 if (rt->rt6i_flags&RTF_CACHE) {
1001                         dst_set_expires(&rt->dst, 0);
1002                         rt->rt6i_flags |= RTF_EXPIRES;
1003                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1004                         rt->rt6i_node->fn_sernum = -1;
1005         }
1006 }
1007
1008 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1009 {
1010         struct rt6_info *rt6 = (struct rt6_info*)dst;
1011
1012         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1013                 rt6->rt6i_flags |= RTF_MODIFIED;
1014                 if (mtu < IPV6_MIN_MTU) {
1015                         u32 features = dst_metric(dst, RTAX_FEATURES);
1016                         mtu = IPV6_MIN_MTU;
1017                         features |= RTAX_FEATURE_ALLFRAG;
1018                         dst_metric_set(dst, RTAX_FEATURES, features);
1019                 }
1020                 dst_metric_set(dst, RTAX_MTU, mtu);
1021         }
1022 }
1023
1024 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1025 {
1026         struct net_device *dev = dst->dev;
1027         unsigned int mtu = dst_mtu(dst);
1028         struct net *net = dev_net(dev);
1029
1030         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1031
1032         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1033                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1034
1035         /*
1036          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1037          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1038          * IPV6_MAXPLEN is also valid and means: "any MSS,
1039          * rely only on pmtu discovery"
1040          */
1041         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1042                 mtu = IPV6_MAXPLEN;
1043         return mtu;
1044 }
1045
1046 static unsigned int ip6_mtu(const struct dst_entry *dst)
1047 {
1048         struct inet6_dev *idev;
1049         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1050
1051         if (mtu)
1052                 return mtu;
1053
1054         mtu = IPV6_MIN_MTU;
1055
1056         rcu_read_lock();
1057         idev = __in6_dev_get(dst->dev);
1058         if (idev)
1059                 mtu = idev->cnf.mtu6;
1060         rcu_read_unlock();
1061
1062         return mtu;
1063 }
1064
1065 static struct dst_entry *icmp6_dst_gc_list;
1066 static DEFINE_SPINLOCK(icmp6_dst_lock);
1067
1068 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1069                                   struct neighbour *neigh,
1070                                   const struct in6_addr *addr)
1071 {
1072         struct rt6_info *rt;
1073         struct inet6_dev *idev = in6_dev_get(dev);
1074         struct net *net = dev_net(dev);
1075
1076         if (unlikely(idev == NULL))
1077                 return NULL;
1078
1079         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1080         if (unlikely(rt == NULL)) {
1081                 in6_dev_put(idev);
1082                 goto out;
1083         }
1084
1085         if (neigh)
1086                 neigh_hold(neigh);
1087         else {
1088                 neigh = ndisc_get_neigh(dev, addr);
1089                 if (IS_ERR(neigh))
1090                         neigh = NULL;
1091         }
1092
1093         rt->dst.flags |= DST_HOST;
1094         rt->dst.output  = ip6_output;
1095         dst_set_neighbour(&rt->dst, neigh);
1096         atomic_set(&rt->dst.__refcnt, 1);
1097         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1098         rt->rt6i_dst.plen = 128;
1099         rt->rt6i_idev     = idev;
1100         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1101
1102         spin_lock_bh(&icmp6_dst_lock);
1103         rt->dst.next = icmp6_dst_gc_list;
1104         icmp6_dst_gc_list = &rt->dst;
1105         spin_unlock_bh(&icmp6_dst_lock);
1106
1107         fib6_force_start_gc(net);
1108
1109 out:
1110         return &rt->dst;
1111 }
1112
1113 int icmp6_dst_gc(void)
1114 {
1115         struct dst_entry *dst, **pprev;
1116         int more = 0;
1117
1118         spin_lock_bh(&icmp6_dst_lock);
1119         pprev = &icmp6_dst_gc_list;
1120
1121         while ((dst = *pprev) != NULL) {
1122                 if (!atomic_read(&dst->__refcnt)) {
1123                         *pprev = dst->next;
1124                         dst_free(dst);
1125                 } else {
1126                         pprev = &dst->next;
1127                         ++more;
1128                 }
1129         }
1130
1131         spin_unlock_bh(&icmp6_dst_lock);
1132
1133         return more;
1134 }
1135
1136 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1137                             void *arg)
1138 {
1139         struct dst_entry *dst, **pprev;
1140
1141         spin_lock_bh(&icmp6_dst_lock);
1142         pprev = &icmp6_dst_gc_list;
1143         while ((dst = *pprev) != NULL) {
1144                 struct rt6_info *rt = (struct rt6_info *) dst;
1145                 if (func(rt, arg)) {
1146                         *pprev = dst->next;
1147                         dst_free(dst);
1148                 } else {
1149                         pprev = &dst->next;
1150                 }
1151         }
1152         spin_unlock_bh(&icmp6_dst_lock);
1153 }
1154
1155 static int ip6_dst_gc(struct dst_ops *ops)
1156 {
1157         unsigned long now = jiffies;
1158         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1159         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1160         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1161         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1162         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1163         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1164         int entries;
1165
1166         entries = dst_entries_get_fast(ops);
1167         if (time_after(rt_last_gc + rt_min_interval, now) &&
1168             entries <= rt_max_size)
1169                 goto out;
1170
1171         net->ipv6.ip6_rt_gc_expire++;
1172         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1173         net->ipv6.ip6_rt_last_gc = now;
1174         entries = dst_entries_get_slow(ops);
1175         if (entries < ops->gc_thresh)
1176                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1177 out:
1178         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1179         return entries > rt_max_size;
1180 }
1181
1182 /* Clean host part of a prefix. Not necessary in radix tree,
1183    but results in cleaner routing tables.
1184
1185    Remove it only when all the things will work!
1186  */
1187
1188 int ip6_dst_hoplimit(struct dst_entry *dst)
1189 {
1190         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1191         if (hoplimit == 0) {
1192                 struct net_device *dev = dst->dev;
1193                 struct inet6_dev *idev;
1194
1195                 rcu_read_lock();
1196                 idev = __in6_dev_get(dev);
1197                 if (idev)
1198                         hoplimit = idev->cnf.hop_limit;
1199                 else
1200                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1201                 rcu_read_unlock();
1202         }
1203         return hoplimit;
1204 }
1205 EXPORT_SYMBOL(ip6_dst_hoplimit);
1206
1207 /*
1208  *
1209  */
1210
1211 int ip6_route_add(struct fib6_config *cfg)
1212 {
1213         int err;
1214         struct net *net = cfg->fc_nlinfo.nl_net;
1215         struct rt6_info *rt = NULL;
1216         struct net_device *dev = NULL;
1217         struct inet6_dev *idev = NULL;
1218         struct fib6_table *table;
1219         int addr_type;
1220
1221         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1222                 return -EINVAL;
1223 #ifndef CONFIG_IPV6_SUBTREES
1224         if (cfg->fc_src_len)
1225                 return -EINVAL;
1226 #endif
1227         if (cfg->fc_ifindex) {
1228                 err = -ENODEV;
1229                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1230                 if (!dev)
1231                         goto out;
1232                 idev = in6_dev_get(dev);
1233                 if (!idev)
1234                         goto out;
1235         }
1236
1237         if (cfg->fc_metric == 0)
1238                 cfg->fc_metric = IP6_RT_PRIO_USER;
1239
1240         table = fib6_new_table(net, cfg->fc_table);
1241         if (table == NULL) {
1242                 err = -ENOBUFS;
1243                 goto out;
1244         }
1245
1246         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1247
1248         if (rt == NULL) {
1249                 err = -ENOMEM;
1250                 goto out;
1251         }
1252
1253         rt->dst.obsolete = -1;
1254         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1255                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1256                                 0;
1257
1258         if (cfg->fc_protocol == RTPROT_UNSPEC)
1259                 cfg->fc_protocol = RTPROT_BOOT;
1260         rt->rt6i_protocol = cfg->fc_protocol;
1261
1262         addr_type = ipv6_addr_type(&cfg->fc_dst);
1263
1264         if (addr_type & IPV6_ADDR_MULTICAST)
1265                 rt->dst.input = ip6_mc_input;
1266         else if (cfg->fc_flags & RTF_LOCAL)
1267                 rt->dst.input = ip6_input;
1268         else
1269                 rt->dst.input = ip6_forward;
1270
1271         rt->dst.output = ip6_output;
1272
1273         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1274         rt->rt6i_dst.plen = cfg->fc_dst_len;
1275         if (rt->rt6i_dst.plen == 128)
1276                rt->dst.flags |= DST_HOST;
1277
1278         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1279                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1280                 if (!metrics) {
1281                         err = -ENOMEM;
1282                         goto out;
1283                 }
1284                 dst_init_metrics(&rt->dst, metrics, 0);
1285         }
1286 #ifdef CONFIG_IPV6_SUBTREES
1287         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1288         rt->rt6i_src.plen = cfg->fc_src_len;
1289 #endif
1290
1291         rt->rt6i_metric = cfg->fc_metric;
1292
1293         /* We cannot add true routes via loopback here,
1294            they would result in kernel looping; promote them to reject routes
1295          */
1296         if ((cfg->fc_flags & RTF_REJECT) ||
1297             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1298                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1299                 /* hold loopback dev/idev if we haven't done so. */
1300                 if (dev != net->loopback_dev) {
1301                         if (dev) {
1302                                 dev_put(dev);
1303                                 in6_dev_put(idev);
1304                         }
1305                         dev = net->loopback_dev;
1306                         dev_hold(dev);
1307                         idev = in6_dev_get(dev);
1308                         if (!idev) {
1309                                 err = -ENODEV;
1310                                 goto out;
1311                         }
1312                 }
1313                 rt->dst.output = ip6_pkt_discard_out;
1314                 rt->dst.input = ip6_pkt_discard;
1315                 rt->dst.error = -ENETUNREACH;
1316                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1317                 goto install_route;
1318         }
1319
1320         if (cfg->fc_flags & RTF_GATEWAY) {
1321                 const struct in6_addr *gw_addr;
1322                 int gwa_type;
1323
1324                 gw_addr = &cfg->fc_gateway;
1325                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1326                 gwa_type = ipv6_addr_type(gw_addr);
1327
1328                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1329                         struct rt6_info *grt;
1330
1331                         /* IPv6 strictly inhibits using not link-local
1332                            addresses as nexthop address.
1333                            Otherwise, router will not able to send redirects.
1334                            It is very good, but in some (rare!) circumstances
1335                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1336                            some exceptions. --ANK
1337                          */
1338                         err = -EINVAL;
1339                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1340                                 goto out;
1341
1342                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1343
1344                         err = -EHOSTUNREACH;
1345                         if (grt == NULL)
1346                                 goto out;
1347                         if (dev) {
1348                                 if (dev != grt->rt6i_dev) {
1349                                         dst_release(&grt->dst);
1350                                         goto out;
1351                                 }
1352                         } else {
1353                                 dev = grt->rt6i_dev;
1354                                 idev = grt->rt6i_idev;
1355                                 dev_hold(dev);
1356                                 in6_dev_hold(grt->rt6i_idev);
1357                         }
1358                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1359                                 err = 0;
1360                         dst_release(&grt->dst);
1361
1362                         if (err)
1363                                 goto out;
1364                 }
1365                 err = -EINVAL;
1366                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1367                         goto out;
1368         }
1369
1370         err = -ENODEV;
1371         if (dev == NULL)
1372                 goto out;
1373
1374         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1375                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1376                         err = -EINVAL;
1377                         goto out;
1378                 }
1379                 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1380                 rt->rt6i_prefsrc.plen = 128;
1381         } else
1382                 rt->rt6i_prefsrc.plen = 0;
1383
1384         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1385                 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1386                 if (IS_ERR(n)) {
1387                         err = PTR_ERR(n);
1388                         goto out;
1389                 }
1390                 dst_set_neighbour(&rt->dst, n);
1391         }
1392
1393         rt->rt6i_flags = cfg->fc_flags;
1394
1395 install_route:
1396         if (cfg->fc_mx) {
1397                 struct nlattr *nla;
1398                 int remaining;
1399
1400                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1401                         int type = nla_type(nla);
1402
1403                         if (type) {
1404                                 if (type > RTAX_MAX) {
1405                                         err = -EINVAL;
1406                                         goto out;
1407                                 }
1408
1409                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1410                         }
1411                 }
1412         }
1413
1414         rt->dst.dev = dev;
1415         rt->rt6i_idev = idev;
1416         rt->rt6i_table = table;
1417
1418         cfg->fc_nlinfo.nl_net = dev_net(dev);
1419
1420         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1421
1422 out:
1423         if (dev)
1424                 dev_put(dev);
1425         if (idev)
1426                 in6_dev_put(idev);
1427         if (rt)
1428                 dst_free(&rt->dst);
1429         return err;
1430 }
1431
1432 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1433 {
1434         int err;
1435         struct fib6_table *table;
1436         struct net *net = dev_net(rt->rt6i_dev);
1437
1438         if (rt == net->ipv6.ip6_null_entry)
1439                 return -ENOENT;
1440
1441         table = rt->rt6i_table;
1442         write_lock_bh(&table->tb6_lock);
1443
1444         err = fib6_del(rt, info);
1445         dst_release(&rt->dst);
1446
1447         write_unlock_bh(&table->tb6_lock);
1448
1449         return err;
1450 }
1451
1452 int ip6_del_rt(struct rt6_info *rt)
1453 {
1454         struct nl_info info = {
1455                 .nl_net = dev_net(rt->rt6i_dev),
1456         };
1457         return __ip6_del_rt(rt, &info);
1458 }
1459
1460 static int ip6_route_del(struct fib6_config *cfg)
1461 {
1462         struct fib6_table *table;
1463         struct fib6_node *fn;
1464         struct rt6_info *rt;
1465         int err = -ESRCH;
1466
1467         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1468         if (table == NULL)
1469                 return err;
1470
1471         read_lock_bh(&table->tb6_lock);
1472
1473         fn = fib6_locate(&table->tb6_root,
1474                          &cfg->fc_dst, cfg->fc_dst_len,
1475                          &cfg->fc_src, cfg->fc_src_len);
1476
1477         if (fn) {
1478                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1479                         if (cfg->fc_ifindex &&
1480                             (rt->rt6i_dev == NULL ||
1481                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1482                                 continue;
1483                         if (cfg->fc_flags & RTF_GATEWAY &&
1484                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1485                                 continue;
1486                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1487                                 continue;
1488                         dst_hold(&rt->dst);
1489                         read_unlock_bh(&table->tb6_lock);
1490
1491                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1492                 }
1493         }
1494         read_unlock_bh(&table->tb6_lock);
1495
1496         return err;
1497 }
1498
1499 /*
1500  *      Handle redirects
1501  */
1502 struct ip6rd_flowi {
1503         struct flowi6 fl6;
1504         struct in6_addr gateway;
1505 };
1506
1507 static struct rt6_info *__ip6_route_redirect(struct net *net,
1508                                              struct fib6_table *table,
1509                                              struct flowi6 *fl6,
1510                                              int flags)
1511 {
1512         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1513         struct rt6_info *rt;
1514         struct fib6_node *fn;
1515
1516         /*
1517          * Get the "current" route for this destination and
1518          * check if the redirect has come from approriate router.
1519          *
1520          * RFC 2461 specifies that redirects should only be
1521          * accepted if they come from the nexthop to the target.
1522          * Due to the way the routes are chosen, this notion
1523          * is a bit fuzzy and one might need to check all possible
1524          * routes.
1525          */
1526
1527         read_lock_bh(&table->tb6_lock);
1528         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1529 restart:
1530         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1531                 /*
1532                  * Current route is on-link; redirect is always invalid.
1533                  *
1534                  * Seems, previous statement is not true. It could
1535                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1536                  * But then router serving it might decide, that we should
1537                  * know truth 8)8) --ANK (980726).
1538                  */
1539                 if (rt6_check_expired(rt))
1540                         continue;
1541                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1542                         continue;
1543                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1544                         continue;
1545                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1546                         continue;
1547                 break;
1548         }
1549
1550         if (!rt)
1551                 rt = net->ipv6.ip6_null_entry;
1552         BACKTRACK(net, &fl6->saddr);
1553 out:
1554         dst_hold(&rt->dst);
1555
1556         read_unlock_bh(&table->tb6_lock);
1557
1558         return rt;
1559 };
1560
1561 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1562                                            const struct in6_addr *src,
1563                                            const struct in6_addr *gateway,
1564                                            struct net_device *dev)
1565 {
1566         int flags = RT6_LOOKUP_F_HAS_SADDR;
1567         struct net *net = dev_net(dev);
1568         struct ip6rd_flowi rdfl = {
1569                 .fl6 = {
1570                         .flowi6_oif = dev->ifindex,
1571                         .daddr = *dest,
1572                         .saddr = *src,
1573                 },
1574         };
1575
1576         ipv6_addr_copy(&rdfl.gateway, gateway);
1577
1578         if (rt6_need_strict(dest))
1579                 flags |= RT6_LOOKUP_F_IFACE;
1580
1581         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1582                                                    flags, __ip6_route_redirect);
1583 }
1584
1585 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1586                   const struct in6_addr *saddr,
1587                   struct neighbour *neigh, u8 *lladdr, int on_link)
1588 {
1589         struct rt6_info *rt, *nrt = NULL;
1590         struct netevent_redirect netevent;
1591         struct net *net = dev_net(neigh->dev);
1592
1593         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1594
1595         if (rt == net->ipv6.ip6_null_entry) {
1596                 if (net_ratelimit())
1597                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1598                                "for redirect target\n");
1599                 goto out;
1600         }
1601
1602         /*
1603          *      We have finally decided to accept it.
1604          */
1605
1606         neigh_update(neigh, lladdr, NUD_STALE,
1607                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1608                      NEIGH_UPDATE_F_OVERRIDE|
1609                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1610                                      NEIGH_UPDATE_F_ISROUTER))
1611                      );
1612
1613         /*
1614          * Redirect received -> path was valid.
1615          * Look, redirects are sent only in response to data packets,
1616          * so that this nexthop apparently is reachable. --ANK
1617          */
1618         dst_confirm(&rt->dst);
1619
1620         /* Duplicate redirect: silently ignore. */
1621         if (neigh == dst_get_neighbour_raw(&rt->dst))
1622                 goto out;
1623
1624         nrt = ip6_rt_copy(rt, dest);
1625         if (nrt == NULL)
1626                 goto out;
1627
1628         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1629         if (on_link)
1630                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1631
1632         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1633         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1634
1635         if (ip6_ins_rt(nrt))
1636                 goto out;
1637
1638         netevent.old = &rt->dst;
1639         netevent.new = &nrt->dst;
1640         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1641
1642         if (rt->rt6i_flags&RTF_CACHE) {
1643                 ip6_del_rt(rt);
1644                 return;
1645         }
1646
1647 out:
1648         dst_release(&rt->dst);
1649 }
1650
1651 /*
1652  *      Handle ICMP "packet too big" messages
1653  *      i.e. Path MTU discovery
1654  */
1655
1656 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1657                              struct net *net, u32 pmtu, int ifindex)
1658 {
1659         struct rt6_info *rt, *nrt;
1660         int allfrag = 0;
1661 again:
1662         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1663         if (rt == NULL)
1664                 return;
1665
1666         if (rt6_check_expired(rt)) {
1667                 ip6_del_rt(rt);
1668                 goto again;
1669         }
1670
1671         if (pmtu >= dst_mtu(&rt->dst))
1672                 goto out;
1673
1674         if (pmtu < IPV6_MIN_MTU) {
1675                 /*
1676                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1677                  * MTU (1280) and a fragment header should always be included
1678                  * after a node receiving Too Big message reporting PMTU is
1679                  * less than the IPv6 Minimum Link MTU.
1680                  */
1681                 pmtu = IPV6_MIN_MTU;
1682                 allfrag = 1;
1683         }
1684
1685         /* New mtu received -> path was valid.
1686            They are sent only in response to data packets,
1687            so that this nexthop apparently is reachable. --ANK
1688          */
1689         dst_confirm(&rt->dst);
1690
1691         /* Host route. If it is static, it would be better
1692            not to override it, but add new one, so that
1693            when cache entry will expire old pmtu
1694            would return automatically.
1695          */
1696         if (rt->rt6i_flags & RTF_CACHE) {
1697                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1698                 if (allfrag) {
1699                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1700                         features |= RTAX_FEATURE_ALLFRAG;
1701                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1702                 }
1703                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1704                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1705                 goto out;
1706         }
1707
1708         /* Network route.
1709            Two cases are possible:
1710            1. It is connected route. Action: COW
1711            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1712          */
1713         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1714                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1715         else
1716                 nrt = rt6_alloc_clone(rt, daddr);
1717
1718         if (nrt) {
1719                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1720                 if (allfrag) {
1721                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1722                         features |= RTAX_FEATURE_ALLFRAG;
1723                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1724                 }
1725
1726                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1727                  * happened within 5 mins, the recommended timer is 10 mins.
1728                  * Here this route expiration time is set to ip6_rt_mtu_expires
1729                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1730                  * and detecting PMTU increase will be automatically happened.
1731                  */
1732                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1733                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1734
1735                 ip6_ins_rt(nrt);
1736         }
1737 out:
1738         dst_release(&rt->dst);
1739 }
1740
1741 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1742                         struct net_device *dev, u32 pmtu)
1743 {
1744         struct net *net = dev_net(dev);
1745
1746         /*
1747          * RFC 1981 states that a node "MUST reduce the size of the packets it
1748          * is sending along the path" that caused the Packet Too Big message.
1749          * Since it's not possible in the general case to determine which
1750          * interface was used to send the original packet, we update the MTU
1751          * on the interface that will be used to send future packets. We also
1752          * update the MTU on the interface that received the Packet Too Big in
1753          * case the original packet was forced out that interface with
1754          * SO_BINDTODEVICE or similar. This is the next best thing to the
1755          * correct behaviour, which would be to update the MTU on all
1756          * interfaces.
1757          */
1758         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1759         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1760 }
1761
1762 /*
1763  *      Misc support functions
1764  */
1765
1766 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1767                                     const struct in6_addr *dest)
1768 {
1769         struct net *net = dev_net(ort->rt6i_dev);
1770         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1771                                             ort->dst.dev, 0);
1772
1773         if (rt) {
1774                 rt->dst.input = ort->dst.input;
1775                 rt->dst.output = ort->dst.output;
1776                 rt->dst.flags |= DST_HOST;
1777
1778                 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1779                 rt->rt6i_dst.plen = 128;
1780                 dst_copy_metrics(&rt->dst, &ort->dst);
1781                 rt->dst.error = ort->dst.error;
1782                 rt->rt6i_idev = ort->rt6i_idev;
1783                 if (rt->rt6i_idev)
1784                         in6_dev_hold(rt->rt6i_idev);
1785                 rt->dst.lastuse = jiffies;
1786                 rt->rt6i_expires = 0;
1787
1788                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1789                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1790                 rt->rt6i_metric = 0;
1791
1792 #ifdef CONFIG_IPV6_SUBTREES
1793                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1794 #endif
1795                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1796                 rt->rt6i_table = ort->rt6i_table;
1797         }
1798         return rt;
1799 }
1800
1801 #ifdef CONFIG_IPV6_ROUTE_INFO
1802 static struct rt6_info *rt6_get_route_info(struct net *net,
1803                                            const struct in6_addr *prefix, int prefixlen,
1804                                            const struct in6_addr *gwaddr, int ifindex)
1805 {
1806         struct fib6_node *fn;
1807         struct rt6_info *rt = NULL;
1808         struct fib6_table *table;
1809
1810         table = fib6_get_table(net, RT6_TABLE_INFO);
1811         if (table == NULL)
1812                 return NULL;
1813
1814         write_lock_bh(&table->tb6_lock);
1815         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1816         if (!fn)
1817                 goto out;
1818
1819         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1820                 if (rt->rt6i_dev->ifindex != ifindex)
1821                         continue;
1822                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1823                         continue;
1824                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1825                         continue;
1826                 dst_hold(&rt->dst);
1827                 break;
1828         }
1829 out:
1830         write_unlock_bh(&table->tb6_lock);
1831         return rt;
1832 }
1833
1834 static struct rt6_info *rt6_add_route_info(struct net *net,
1835                                            const struct in6_addr *prefix, int prefixlen,
1836                                            const struct in6_addr *gwaddr, int ifindex,
1837                                            unsigned pref)
1838 {
1839         struct fib6_config cfg = {
1840                 .fc_table       = RT6_TABLE_INFO,
1841                 .fc_metric      = IP6_RT_PRIO_USER,
1842                 .fc_ifindex     = ifindex,
1843                 .fc_dst_len     = prefixlen,
1844                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1845                                   RTF_UP | RTF_PREF(pref),
1846                 .fc_nlinfo.pid = 0,
1847                 .fc_nlinfo.nlh = NULL,
1848                 .fc_nlinfo.nl_net = net,
1849         };
1850
1851         ipv6_addr_copy(&cfg.fc_dst, prefix);
1852         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1853
1854         /* We should treat it as a default route if prefix length is 0. */
1855         if (!prefixlen)
1856                 cfg.fc_flags |= RTF_DEFAULT;
1857
1858         ip6_route_add(&cfg);
1859
1860         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1861 }
1862 #endif
1863
1864 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1865 {
1866         struct rt6_info *rt;
1867         struct fib6_table *table;
1868
1869         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1870         if (table == NULL)
1871                 return NULL;
1872
1873         write_lock_bh(&table->tb6_lock);
1874         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1875                 if (dev == rt->rt6i_dev &&
1876                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1877                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1878                         break;
1879         }
1880         if (rt)
1881                 dst_hold(&rt->dst);
1882         write_unlock_bh(&table->tb6_lock);
1883         return rt;
1884 }
1885
1886 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1887                                      struct net_device *dev,
1888                                      unsigned int pref)
1889 {
1890         struct fib6_config cfg = {
1891                 .fc_table       = RT6_TABLE_DFLT,
1892                 .fc_metric      = IP6_RT_PRIO_USER,
1893                 .fc_ifindex     = dev->ifindex,
1894                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1895                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1896                 .fc_nlinfo.pid = 0,
1897                 .fc_nlinfo.nlh = NULL,
1898                 .fc_nlinfo.nl_net = dev_net(dev),
1899         };
1900
1901         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1902
1903         ip6_route_add(&cfg);
1904
1905         return rt6_get_dflt_router(gwaddr, dev);
1906 }
1907
1908 void rt6_purge_dflt_routers(struct net *net)
1909 {
1910         struct rt6_info *rt;
1911         struct fib6_table *table;
1912
1913         /* NOTE: Keep consistent with rt6_get_dflt_router */
1914         table = fib6_get_table(net, RT6_TABLE_DFLT);
1915         if (table == NULL)
1916                 return;
1917
1918 restart:
1919         read_lock_bh(&table->tb6_lock);
1920         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1921                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1922                         dst_hold(&rt->dst);
1923                         read_unlock_bh(&table->tb6_lock);
1924                         ip6_del_rt(rt);
1925                         goto restart;
1926                 }
1927         }
1928         read_unlock_bh(&table->tb6_lock);
1929 }
1930
1931 static void rtmsg_to_fib6_config(struct net *net,
1932                                  struct in6_rtmsg *rtmsg,
1933                                  struct fib6_config *cfg)
1934 {
1935         memset(cfg, 0, sizeof(*cfg));
1936
1937         cfg->fc_table = RT6_TABLE_MAIN;
1938         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1939         cfg->fc_metric = rtmsg->rtmsg_metric;
1940         cfg->fc_expires = rtmsg->rtmsg_info;
1941         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1942         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1943         cfg->fc_flags = rtmsg->rtmsg_flags;
1944
1945         cfg->fc_nlinfo.nl_net = net;
1946
1947         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1948         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1949         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1950 }
1951
1952 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1953 {
1954         struct fib6_config cfg;
1955         struct in6_rtmsg rtmsg;
1956         int err;
1957
1958         switch(cmd) {
1959         case SIOCADDRT:         /* Add a route */
1960         case SIOCDELRT:         /* Delete a route */
1961                 if (!capable(CAP_NET_ADMIN))
1962                         return -EPERM;
1963                 err = copy_from_user(&rtmsg, arg,
1964                                      sizeof(struct in6_rtmsg));
1965                 if (err)
1966                         return -EFAULT;
1967
1968                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1969
1970                 rtnl_lock();
1971                 switch (cmd) {
1972                 case SIOCADDRT:
1973                         err = ip6_route_add(&cfg);
1974                         break;
1975                 case SIOCDELRT:
1976                         err = ip6_route_del(&cfg);
1977                         break;
1978                 default:
1979                         err = -EINVAL;
1980                 }
1981                 rtnl_unlock();
1982
1983                 return err;
1984         }
1985
1986         return -EINVAL;
1987 }
1988
1989 /*
1990  *      Drop the packet on the floor
1991  */
1992
1993 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1994 {
1995         int type;
1996         struct dst_entry *dst = skb_dst(skb);
1997         switch (ipstats_mib_noroutes) {
1998         case IPSTATS_MIB_INNOROUTES:
1999                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2000                 if (type == IPV6_ADDR_ANY) {
2001                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2002                                       IPSTATS_MIB_INADDRERRORS);
2003                         break;
2004                 }
2005                 /* FALLTHROUGH */
2006         case IPSTATS_MIB_OUTNOROUTES:
2007                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2008                               ipstats_mib_noroutes);
2009                 break;
2010         }
2011         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2012         kfree_skb(skb);
2013         return 0;
2014 }
2015
2016 static int ip6_pkt_discard(struct sk_buff *skb)
2017 {
2018         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2019 }
2020
2021 static int ip6_pkt_discard_out(struct sk_buff *skb)
2022 {
2023         skb->dev = skb_dst(skb)->dev;
2024         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2025 }
2026
2027 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2028
2029 static int ip6_pkt_prohibit(struct sk_buff *skb)
2030 {
2031         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2032 }
2033
2034 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2035 {
2036         skb->dev = skb_dst(skb)->dev;
2037         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2038 }
2039
2040 #endif
2041
2042 /*
2043  *      Allocate a dst for local (unicast / anycast) address.
2044  */
2045
2046 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2047                                     const struct in6_addr *addr,
2048                                     int anycast)
2049 {
2050         struct net *net = dev_net(idev->dev);
2051         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2052                                             net->loopback_dev, 0);
2053         struct neighbour *neigh;
2054
2055         if (rt == NULL) {
2056                 if (net_ratelimit())
2057                         pr_warning("IPv6:  Maximum number of routes reached,"
2058                                    " consider increasing route/max_size.\n");
2059                 return ERR_PTR(-ENOMEM);
2060         }
2061
2062         in6_dev_hold(idev);
2063
2064         rt->dst.flags |= DST_HOST;
2065         rt->dst.input = ip6_input;
2066         rt->dst.output = ip6_output;
2067         rt->rt6i_idev = idev;
2068         rt->dst.obsolete = -1;
2069
2070         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2071         if (anycast)
2072                 rt->rt6i_flags |= RTF_ANYCAST;
2073         else
2074                 rt->rt6i_flags |= RTF_LOCAL;
2075         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2076         if (IS_ERR(neigh)) {
2077                 dst_free(&rt->dst);
2078
2079                 return ERR_CAST(neigh);
2080         }
2081         dst_set_neighbour(&rt->dst, neigh);
2082
2083         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2084         rt->rt6i_dst.plen = 128;
2085         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2086
2087         atomic_set(&rt->dst.__refcnt, 1);
2088
2089         return rt;
2090 }
2091
2092 int ip6_route_get_saddr(struct net *net,
2093                         struct rt6_info *rt,
2094                         const struct in6_addr *daddr,
2095                         unsigned int prefs,
2096                         struct in6_addr *saddr)
2097 {
2098         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2099         int err = 0;
2100         if (rt->rt6i_prefsrc.plen)
2101                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2102         else
2103                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2104                                          daddr, prefs, saddr);
2105         return err;
2106 }
2107
/* remove deleted ip from prefsrc entries */
/*
 * Argument bundle handed to fib6_remove_prefsrc() through
 * fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device filter; NULL matches any device */
	struct net *net;		/* namespace, used to skip its null entry */
	struct in6_addr *addr;		/* address compared against rt6i_prefsrc */
};
2114
2115 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2116 {
2117         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2118         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2119         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2120
2121         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2122             rt != net->ipv6.ip6_null_entry &&
2123             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2124                 /* remove prefsrc entry */
2125                 rt->rt6i_prefsrc.plen = 0;
2126         }
2127         return 0;
2128 }
2129
2130 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2131 {
2132         struct net *net = dev_net(ifp->idev->dev);
2133         struct arg_dev_net_ip adni = {
2134                 .dev = ifp->idev->dev,
2135                 .net = net,
2136                 .addr = &ifp->addr,
2137         };
2138         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2139 }
2140
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device filter; NULL matches any device */
	struct net *net;		/* namespace, used to skip its null entry */
};
2145
2146 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2147 {
2148         const struct arg_dev_net *adn = arg;
2149         const struct net_device *dev = adn->dev;
2150
2151         if ((rt->rt6i_dev == dev || dev == NULL) &&
2152             rt != adn->net->ipv6.ip6_null_entry) {
2153                 RT6_TRACE("deleted by ifdown %p\n", rt);
2154                 return -1;
2155         }
2156         return 0;
2157 }
2158
2159 void rt6_ifdown(struct net *net, struct net_device *dev)
2160 {
2161         struct arg_dev_net adn = {
2162                 .dev = dev,
2163                 .net = net,
2164         };
2165
2166         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2167         icmp6_clean_all(fib6_ifdown, &adn);
2168 }
2169
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
2175
2176 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2177 {
2178         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2179         struct inet6_dev *idev;
2180
2181         /* In IPv6 pmtu discovery is not optional,
2182            so that RTAX_MTU lock cannot disable it.
2183            We still use this lock to block changes
2184            caused by addrconf/ndisc.
2185         */
2186
2187         idev = __in6_dev_get(arg->dev);
2188         if (idev == NULL)
2189                 return 0;
2190
2191         /* For administrative MTU increase, there is no way to discover
2192            IPv6 PMTU increase, so PMTU increase should be updated here.
2193            Since RFC 1981 doesn't include administrative MTU increase
2194            update PMTU increase is a MUST. (i.e. jumbo frame)
2195          */
2196         /*
2197            If new MTU is less than route PMTU, this new MTU will be the
2198            lowest MTU in the path, update the route PMTU to reflect PMTU
2199            decreases; if new MTU is greater than route PMTU, and the
2200            old MTU is the lowest MTU in the path, update the route PMTU
2201            to reflect the increase. In this case if the other nodes' MTU
2202            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2203            PMTU discouvery.
2204          */
2205         if (rt->rt6i_dev == arg->dev &&
2206             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2207             (dst_mtu(&rt->dst) >= arg->mtu ||
2208              (dst_mtu(&rt->dst) < arg->mtu &&
2209               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2210                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2211         }
2212         return 0;
2213 }
2214
2215 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2216 {
2217         struct rt6_mtu_change_arg arg = {
2218                 .dev = dev,
2219                 .mtu = mtu,
2220         };
2221
2222         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2223 }
2224
2225 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2226         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2227         [RTA_OIF]               = { .type = NLA_U32 },
2228         [RTA_IIF]               = { .type = NLA_U32 },
2229         [RTA_PRIORITY]          = { .type = NLA_U32 },
2230         [RTA_METRICS]           = { .type = NLA_NESTED },
2231 };
2232
2233 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2234                               struct fib6_config *cfg)
2235 {
2236         struct rtmsg *rtm;
2237         struct nlattr *tb[RTA_MAX+1];
2238         int err;
2239
2240         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2241         if (err < 0)
2242                 goto errout;
2243
2244         err = -EINVAL;
2245         rtm = nlmsg_data(nlh);
2246         memset(cfg, 0, sizeof(*cfg));
2247
2248         cfg->fc_table = rtm->rtm_table;
2249         cfg->fc_dst_len = rtm->rtm_dst_len;
2250         cfg->fc_src_len = rtm->rtm_src_len;
2251         cfg->fc_flags = RTF_UP;
2252         cfg->fc_protocol = rtm->rtm_protocol;
2253
2254         if (rtm->rtm_type == RTN_UNREACHABLE)
2255                 cfg->fc_flags |= RTF_REJECT;
2256
2257         if (rtm->rtm_type == RTN_LOCAL)
2258                 cfg->fc_flags |= RTF_LOCAL;
2259
2260         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2261         cfg->fc_nlinfo.nlh = nlh;
2262         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2263
2264         if (tb[RTA_GATEWAY]) {
2265                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2266                 cfg->fc_flags |= RTF_GATEWAY;
2267         }
2268
2269         if (tb[RTA_DST]) {
2270                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2271
2272                 if (nla_len(tb[RTA_DST]) < plen)
2273                         goto errout;
2274
2275                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2276         }
2277
2278         if (tb[RTA_SRC]) {
2279                 int plen = (rtm->rtm_src_len + 7) >> 3;
2280
2281                 if (nla_len(tb[RTA_SRC]) < plen)
2282                         goto errout;
2283
2284                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2285         }
2286
2287         if (tb[RTA_PREFSRC])
2288                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2289
2290         if (tb[RTA_OIF])
2291                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2292
2293         if (tb[RTA_PRIORITY])
2294                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2295
2296         if (tb[RTA_METRICS]) {
2297                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2298                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2299         }
2300
2301         if (tb[RTA_TABLE])
2302                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2303
2304         err = 0;
2305 errout:
2306         return err;
2307 }
2308
2309 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2310 {
2311         struct fib6_config cfg;
2312         int err;
2313
2314         err = rtm_to_fib6_config(skb, nlh, &cfg);
2315         if (err < 0)
2316                 return err;
2317
2318         return ip6_route_del(&cfg);
2319 }
2320
2321 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2322 {
2323         struct fib6_config cfg;
2324         int err;
2325
2326         err = rtm_to_fib6_config(skb, nlh, &cfg);
2327         if (err < 0)
2328                 return err;
2329
2330         return ip6_route_add(&cfg);
2331 }
2332
2333 static inline size_t rt6_nlmsg_size(void)
2334 {
2335         return NLMSG_ALIGN(sizeof(struct rtmsg))
2336                + nla_total_size(16) /* RTA_SRC */
2337                + nla_total_size(16) /* RTA_DST */
2338                + nla_total_size(16) /* RTA_GATEWAY */
2339                + nla_total_size(16) /* RTA_PREFSRC */
2340                + nla_total_size(4) /* RTA_TABLE */
2341                + nla_total_size(4) /* RTA_IIF */
2342                + nla_total_size(4) /* RTA_OIF */
2343                + nla_total_size(4) /* RTA_PRIORITY */
2344                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2345                + nla_total_size(sizeof(struct rta_cacheinfo));
2346 }
2347
2348 static int rt6_fill_node(struct net *net,
2349                          struct sk_buff *skb, struct rt6_info *rt,
2350                          struct in6_addr *dst, struct in6_addr *src,
2351                          int iif, int type, u32 pid, u32 seq,
2352                          int prefix, int nowait, unsigned int flags)
2353 {
2354         struct rtmsg *rtm;
2355         struct nlmsghdr *nlh;
2356         long expires;
2357         u32 table;
2358         struct neighbour *n;
2359
2360         if (prefix) {   /* user wants prefix routes only */
2361                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2362                         /* success since this is not a prefix route */
2363                         return 1;
2364                 }
2365         }
2366
2367         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2368         if (nlh == NULL)
2369                 return -EMSGSIZE;
2370
2371         rtm = nlmsg_data(nlh);
2372         rtm->rtm_family = AF_INET6;
2373         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2374         rtm->rtm_src_len = rt->rt6i_src.plen;
2375         rtm->rtm_tos = 0;
2376         if (rt->rt6i_table)
2377                 table = rt->rt6i_table->tb6_id;
2378         else
2379                 table = RT6_TABLE_UNSPEC;
2380         rtm->rtm_table = table;
2381         NLA_PUT_U32(skb, RTA_TABLE, table);
2382         if (rt->rt6i_flags&RTF_REJECT)
2383                 rtm->rtm_type = RTN_UNREACHABLE;
2384         else if (rt->rt6i_flags&RTF_LOCAL)
2385                 rtm->rtm_type = RTN_LOCAL;
2386         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2387                 rtm->rtm_type = RTN_LOCAL;
2388         else
2389                 rtm->rtm_type = RTN_UNICAST;
2390         rtm->rtm_flags = 0;
2391         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2392         rtm->rtm_protocol = rt->rt6i_protocol;
2393         if (rt->rt6i_flags&RTF_DYNAMIC)
2394                 rtm->rtm_protocol = RTPROT_REDIRECT;
2395         else if (rt->rt6i_flags & RTF_ADDRCONF)
2396                 rtm->rtm_protocol = RTPROT_KERNEL;
2397         else if (rt->rt6i_flags&RTF_DEFAULT)
2398                 rtm->rtm_protocol = RTPROT_RA;
2399
2400         if (rt->rt6i_flags&RTF_CACHE)
2401                 rtm->rtm_flags |= RTM_F_CLONED;
2402
2403         if (dst) {
2404                 NLA_PUT(skb, RTA_DST, 16, dst);
2405                 rtm->rtm_dst_len = 128;
2406         } else if (rtm->rtm_dst_len)
2407                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2408 #ifdef CONFIG_IPV6_SUBTREES
2409         if (src) {
2410                 NLA_PUT(skb, RTA_SRC, 16, src);
2411                 rtm->rtm_src_len = 128;
2412         } else if (rtm->rtm_src_len)
2413                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2414 #endif
2415         if (iif) {
2416 #ifdef CONFIG_IPV6_MROUTE
2417                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2418                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2419                         if (err <= 0) {
2420                                 if (!nowait) {
2421                                         if (err == 0)
2422                                                 return 0;
2423                                         goto nla_put_failure;
2424                                 } else {
2425                                         if (err == -EMSGSIZE)
2426                                                 goto nla_put_failure;
2427                                 }
2428                         }
2429                 } else
2430 #endif
2431                         NLA_PUT_U32(skb, RTA_IIF, iif);
2432         } else if (dst) {
2433                 struct in6_addr saddr_buf;
2434                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2435                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2436         }
2437
2438         if (rt->rt6i_prefsrc.plen) {
2439                 struct in6_addr saddr_buf;
2440                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2441                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2442         }
2443
2444         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2445                 goto nla_put_failure;
2446
2447         rcu_read_lock();
2448         n = dst_get_neighbour(&rt->dst);
2449         if (n)
2450                 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2451         rcu_read_unlock();
2452
2453         if (rt->dst.dev)
2454                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2455
2456         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2457
2458         if (!(rt->rt6i_flags & RTF_EXPIRES))
2459                 expires = 0;
2460         else if (rt->rt6i_expires - jiffies < INT_MAX)
2461                 expires = rt->rt6i_expires - jiffies;
2462         else
2463                 expires = INT_MAX;
2464
2465         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2466                                expires, rt->dst.error) < 0)
2467                 goto nla_put_failure;
2468
2469         return nlmsg_end(skb, nlh);
2470
2471 nla_put_failure:
2472         nlmsg_cancel(skb, nlh);
2473         return -EMSGSIZE;
2474 }
2475
2476 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2477 {
2478         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2479         int prefix;
2480
2481         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2482                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2483                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2484         } else
2485                 prefix = 0;
2486
2487         return rt6_fill_node(arg->net,
2488                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2489                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2490                      prefix, 0, NLM_F_MULTI);
2491 }
2492
2493 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2494 {
2495         struct net *net = sock_net(in_skb->sk);
2496         struct nlattr *tb[RTA_MAX+1];
2497         struct rt6_info *rt;
2498         struct sk_buff *skb;
2499         struct rtmsg *rtm;
2500         struct flowi6 fl6;
2501         int err, iif = 0;
2502
2503         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2504         if (err < 0)
2505                 goto errout;
2506
2507         err = -EINVAL;
2508         memset(&fl6, 0, sizeof(fl6));
2509
2510         if (tb[RTA_SRC]) {
2511                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2512                         goto errout;
2513
2514                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2515         }
2516
2517         if (tb[RTA_DST]) {
2518                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2519                         goto errout;
2520
2521                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2522         }
2523
2524         if (tb[RTA_IIF])
2525                 iif = nla_get_u32(tb[RTA_IIF]);
2526
2527         if (tb[RTA_OIF])
2528                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2529
2530         if (iif) {
2531                 struct net_device *dev;
2532                 dev = __dev_get_by_index(net, iif);
2533                 if (!dev) {
2534                         err = -ENODEV;
2535                         goto errout;
2536                 }
2537         }
2538
2539         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2540         if (skb == NULL) {
2541                 err = -ENOBUFS;
2542                 goto errout;
2543         }
2544
2545         /* Reserve room for dummy headers, this skb can pass
2546            through good chunk of routing engine.
2547          */
2548         skb_reset_mac_header(skb);
2549         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2550
2551         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2552         skb_dst_set(skb, &rt->dst);
2553
2554         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2555                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2556                             nlh->nlmsg_seq, 0, 0, 0);
2557         if (err < 0) {
2558                 kfree_skb(skb);
2559                 goto errout;
2560         }
2561
2562         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2563 errout:
2564         return err;
2565 }
2566
2567 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2568 {
2569         struct sk_buff *skb;
2570         struct net *net = info->nl_net;
2571         u32 seq;
2572         int err;
2573
2574         err = -ENOBUFS;
2575         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2576
2577         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2578         if (skb == NULL)
2579                 goto errout;
2580
2581         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2582                                 event, info->pid, seq, 0, 0, 0);
2583         if (err < 0) {
2584                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2585                 WARN_ON(err == -EMSGSIZE);
2586                 kfree_skb(skb);
2587                 goto errout;
2588         }
2589         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2590                     info->nlh, gfp_any());
2591         return;
2592 errout:
2593         if (err < 0)
2594                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2595 }
2596
2597 static int ip6_route_dev_notify(struct notifier_block *this,
2598                                 unsigned long event, void *data)
2599 {
2600         struct net_device *dev = (struct net_device *)data;
2601         struct net *net = dev_net(dev);
2602
2603         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2604                 net->ipv6.ip6_null_entry->dst.dev = dev;
2605                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2606 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2607                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2608                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2609                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2610                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2611 #endif
2612         }
2613
2614         return NOTIFY_OK;
2615 }
2616
2617 /*
2618  *      /proc
2619  */
2620
2621 #ifdef CONFIG_PROC_FS
2622
/*
 * NOTE(review): no user of this structure appears in the /proc code that
 * follows it; presumably a leftover from an older buffer-based /proc
 * interface -- confirm before removing.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2631
2632 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2633 {
2634         struct seq_file *m = p_arg;
2635         struct neighbour *n;
2636
2637         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2638
2639 #ifdef CONFIG_IPV6_SUBTREES
2640         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2641 #else
2642         seq_puts(m, "00000000000000000000000000000000 00 ");
2643 #endif
2644         rcu_read_lock();
2645         n = dst_get_neighbour(&rt->dst);
2646         if (n) {
2647                 seq_printf(m, "%pi6", n->primary_key);
2648         } else {
2649                 seq_puts(m, "00000000000000000000000000000000");
2650         }
2651         rcu_read_unlock();
2652         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2653                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2654                    rt->dst.__use, rt->rt6i_flags,
2655                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2656         return 0;
2657 }
2658
2659 static int ipv6_route_show(struct seq_file *m, void *v)
2660 {
2661         struct net *net = (struct net *)m->private;
2662         fib6_clean_all(net, rt6_info_route, 0, m);
2663         return 0;
2664 }
2665
2666 static int ipv6_route_open(struct inode *inode, struct file *file)
2667 {
2668         return single_open_net(inode, file, ipv6_route_show);
2669 }
2670
2671 static const struct file_operations ipv6_route_proc_fops = {
2672         .owner          = THIS_MODULE,
2673         .open           = ipv6_route_open,
2674         .read           = seq_read,
2675         .llseek         = seq_lseek,
2676         .release        = single_release_net,
2677 };
2678
2679 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2680 {
2681         struct net *net = (struct net *)seq->private;
2682         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2683                    net->ipv6.rt6_stats->fib_nodes,
2684                    net->ipv6.rt6_stats->fib_route_nodes,
2685                    net->ipv6.rt6_stats->fib_rt_alloc,
2686                    net->ipv6.rt6_stats->fib_rt_entries,
2687                    net->ipv6.rt6_stats->fib_rt_cache,
2688                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2689                    net->ipv6.rt6_stats->fib_discarded_routes);
2690
2691         return 0;
2692 }
2693
2694 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2695 {
2696         return single_open_net(inode, file, rt6_stats_seq_show);
2697 }
2698
2699 static const struct file_operations rt6_stats_seq_fops = {
2700         .owner   = THIS_MODULE,
2701         .open    = rt6_stats_seq_open,
2702         .read    = seq_read,
2703         .llseek  = seq_lseek,
2704         .release = single_release_net,
2705 };
2706 #endif  /* CONFIG_PROC_FS */
2707
2708 #ifdef CONFIG_SYSCTL
2709
2710 static
2711 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2712                               void __user *buffer, size_t *lenp, loff_t *ppos)
2713 {
2714         struct net *net;
2715         int delay;
2716         if (!write)
2717                 return -EINVAL;
2718
2719         net = (struct net *)ctl->extra1;
2720         delay = net->ipv6.sysctl.flush_delay;
2721         proc_dointvec(ctl, write, buffer, lenp, ppos);
2722         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2723         return 0;
2724 }
2725
2726 ctl_table ipv6_route_table_template[] = {
2727         {
2728                 .procname       =       "flush",
2729                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2730                 .maxlen         =       sizeof(int),
2731                 .mode           =       0200,
2732                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2733         },
2734         {
2735                 .procname       =       "gc_thresh",
2736                 .data           =       &ip6_dst_ops_template.gc_thresh,
2737                 .maxlen         =       sizeof(int),
2738                 .mode           =       0644,
2739                 .proc_handler   =       proc_dointvec,
2740         },
2741         {
2742                 .procname       =       "max_size",
2743                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2744                 .maxlen         =       sizeof(int),
2745                 .mode           =       0644,
2746                 .proc_handler   =       proc_dointvec,
2747         },
2748         {
2749                 .procname       =       "gc_min_interval",
2750                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2751                 .maxlen         =       sizeof(int),
2752                 .mode           =       0644,
2753                 .proc_handler   =       proc_dointvec_jiffies,
2754         },
2755         {
2756                 .procname       =       "gc_timeout",
2757                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2758                 .maxlen         =       sizeof(int),
2759                 .mode           =       0644,
2760                 .proc_handler   =       proc_dointvec_jiffies,
2761         },
2762         {
2763                 .procname       =       "gc_interval",
2764                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2765                 .maxlen         =       sizeof(int),
2766                 .mode           =       0644,
2767                 .proc_handler   =       proc_dointvec_jiffies,
2768         },
2769         {
2770                 .procname       =       "gc_elasticity",
2771                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2772                 .maxlen         =       sizeof(int),
2773                 .mode           =       0644,
2774                 .proc_handler   =       proc_dointvec,
2775         },
2776         {
2777                 .procname       =       "mtu_expires",
2778                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2779                 .maxlen         =       sizeof(int),
2780                 .mode           =       0644,
2781                 .proc_handler   =       proc_dointvec_jiffies,
2782         },
2783         {
2784                 .procname       =       "min_adv_mss",
2785                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2786                 .maxlen         =       sizeof(int),
2787                 .mode           =       0644,
2788                 .proc_handler   =       proc_dointvec,
2789         },
2790         {
2791                 .procname       =       "gc_min_interval_ms",
2792                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2793                 .maxlen         =       sizeof(int),
2794                 .mode           =       0644,
2795                 .proc_handler   =       proc_dointvec_ms_jiffies,
2796         },
2797         { }
2798 };
2799
2800 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2801 {
2802         struct ctl_table *table;
2803
2804         table = kmemdup(ipv6_route_table_template,
2805                         sizeof(ipv6_route_table_template),
2806                         GFP_KERNEL);
2807
2808         if (table) {
2809                 table[0].data = &net->ipv6.sysctl.flush_delay;
2810                 table[0].extra1 = net;
2811                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2812                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2813                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2814                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2815                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2816                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2817                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2818                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2819                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2820         }
2821
2822         return table;
2823 }
2824 #endif
2825
2826 static int __net_init ip6_route_net_init(struct net *net)
2827 {
2828         int ret = -ENOMEM;
2829
2830         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2831                sizeof(net->ipv6.ip6_dst_ops));
2832
2833         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2834                 goto out_ip6_dst_ops;
2835
2836         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2837                                            sizeof(*net->ipv6.ip6_null_entry),
2838                                            GFP_KERNEL);
2839         if (!net->ipv6.ip6_null_entry)
2840                 goto out_ip6_dst_entries;
2841         net->ipv6.ip6_null_entry->dst.path =
2842                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2843         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2844         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2845                          ip6_template_metrics, true);
2846
2847 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2848         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2849                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2850                                                GFP_KERNEL);
2851         if (!net->ipv6.ip6_prohibit_entry)
2852                 goto out_ip6_null_entry;
2853         net->ipv6.ip6_prohibit_entry->dst.path =
2854                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2855         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2856         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2857                          ip6_template_metrics, true);
2858
2859         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2860                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2861                                                GFP_KERNEL);
2862         if (!net->ipv6.ip6_blk_hole_entry)
2863                 goto out_ip6_prohibit_entry;
2864         net->ipv6.ip6_blk_hole_entry->dst.path =
2865                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2866         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2867         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2868                          ip6_template_metrics, true);
2869 #endif
2870
2871         net->ipv6.sysctl.flush_delay = 0;
2872         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2873         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2874         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2875         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2876         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2877         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2878         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2879
2880 #ifdef CONFIG_PROC_FS
2881         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2882         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2883 #endif
2884         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2885
2886         ret = 0;
2887 out:
2888         return ret;
2889
2890 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2891 out_ip6_prohibit_entry:
2892         kfree(net->ipv6.ip6_prohibit_entry);
2893 out_ip6_null_entry:
2894         kfree(net->ipv6.ip6_null_entry);
2895 #endif
2896 out_ip6_dst_entries:
2897         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2898 out_ip6_dst_ops:
2899         goto out;
2900 }
2901
2902 static void __net_exit ip6_route_net_exit(struct net *net)
2903 {
2904 #ifdef CONFIG_PROC_FS
2905         proc_net_remove(net, "ipv6_route");
2906         proc_net_remove(net, "rt6_stats");
2907 #endif
2908         kfree(net->ipv6.ip6_null_entry);
2909 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2910         kfree(net->ipv6.ip6_prohibit_entry);
2911         kfree(net->ipv6.ip6_blk_hole_entry);
2912 #endif
2913         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2914 }
2915
2916 static struct pernet_operations ip6_route_net_ops = {
2917         .init = ip6_route_net_init,
2918         .exit = ip6_route_net_exit,
2919 };
2920
2921 static struct notifier_block ip6_route_dev_notifier = {
2922         .notifier_call = ip6_route_dev_notify,
2923         .priority = 0,
2924 };
2925
2926 int __init ip6_route_init(void)
2927 {
2928         int ret;
2929
2930         ret = -ENOMEM;
2931         ip6_dst_ops_template.kmem_cachep =
2932                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2933                                   SLAB_HWCACHE_ALIGN, NULL);
2934         if (!ip6_dst_ops_template.kmem_cachep)
2935                 goto out;
2936
2937         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2938         if (ret)
2939                 goto out_kmem_cache;
2940
2941         ret = register_pernet_subsys(&ip6_route_net_ops);
2942         if (ret)
2943                 goto out_dst_entries;
2944
2945         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2946
2947         /* Registering of the loopback is done before this portion of code,
2948          * the loopback reference in rt6_info will not be taken, do it
2949          * manually for init_net */
2950         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2951         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2952   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2953         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2954         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2955         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2956         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2957   #endif
2958         ret = fib6_init();
2959         if (ret)
2960                 goto out_register_subsys;
2961
2962         ret = xfrm6_init();
2963         if (ret)
2964                 goto out_fib6_init;
2965
2966         ret = fib6_rules_init();
2967         if (ret)
2968                 goto xfrm6_init;
2969
2970         ret = -ENOBUFS;
2971         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2972             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2973             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2974                 goto fib6_rules_init;
2975
2976         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2977         if (ret)
2978                 goto fib6_rules_init;
2979
2980 out:
2981         return ret;
2982
2983 fib6_rules_init:
2984         fib6_rules_cleanup();
2985 xfrm6_init:
2986         xfrm6_fini();
2987 out_fib6_init:
2988         fib6_gc_cleanup();
2989 out_register_subsys:
2990         unregister_pernet_subsys(&ip6_route_net_ops);
2991 out_dst_entries:
2992         dst_entries_destroy(&ip6_dst_blackhole_ops);
2993 out_kmem_cache:
2994         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2995         goto out;
2996 }
2997
2998 void ip6_route_cleanup(void)
2999 {
3000         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3001         fib6_rules_cleanup();
3002         xfrm6_fini();
3003         fib6_gc_cleanup();
3004         unregister_pernet_subsys(&ip6_route_net_ops);
3005         dst_entries_destroy(&ip6_dst_blackhole_ops);
3006         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3007 }